From 00dd6e5a1feb952af88a474abffd07f5015ceee6 Mon Sep 17 00:00:00 2001 From: zhaizhiqiang <584508161@qq.com> Date: Fri, 26 Jun 2026 03:32:33 +0000 Subject: [PATCH 1/3] support mini-swe-agent and claud-code blockbox agent training recipes --- .../claude_code/Dockerfile.claude-code-tool | 21 + .../claude_code/claude_code_runner.py | 232 +++++++++ .../claude_code/config/claude_code.yaml | 1 + .../Dockerfile.mini-swe-agent-tool | 45 ++ .../blackbox_recipes/mini_swe_agent/README.md | 269 +++++++++++ .../mini_swe_agent/__init__.py | 0 .../mini_swe_agent/config/agent_config.yaml | 36 ++ .../config/agent_config_openyuanrong.yaml | 37 ++ .../mini_swe_agent/config/parallel_infer.yaml | 31 ++ .../config/swe_agent_blackbox.yaml | 123 +++++ .../swe_agent_blackbox_megatron_async.yaml | 162 +++++++ .../swe_agent_blackbox_megatron_sync.yaml | 129 +++++ .../mini_swe_agent/dataset.py | 34 ++ .../mini_swe_agent/framework.py | 105 ++++ .../mini_swe_agent/mini_swe_agent_runner.py | 227 +++++++++ .../mini_swe_agent/parallel_infer.py | 447 ++++++++++++++++++ .../blackbox_recipes/mini_swe_agent/reward.py | 74 +++ .../mini_swe_agent/run_agent.py | 106 +++++ .../mini_swe_agent/subprocess_runner.py | 61 +++ examples/blackbox_recipes/sandbox/sandbox.py | 10 + .../blackbox_recipes/scripts/build_tool.sh | 75 +++ .../blackbox_recipes/scripts/run_infer.sh | 66 +++ .../blackbox_recipes/scripts/run_train.sh | 122 +++++ .../scripts/run_train_megatron_async.sh | 199 ++++++++ .../scripts/run_train_megatron_sync.sh | 138 ++++++ 25 files changed, 2750 insertions(+) create mode 100644 examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool create mode 100644 examples/blackbox_recipes/claude_code/claude_code_runner.py create mode 100644 examples/blackbox_recipes/claude_code/config/claude_code.yaml create mode 100644 examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool create mode 100644 examples/blackbox_recipes/mini_swe_agent/README.md create mode 100644 examples/blackbox_recipes/mini_swe_agent/__init__.py create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml create mode 100644 examples/blackbox_recipes/mini_swe_agent/dataset.py create mode 100644 examples/blackbox_recipes/mini_swe_agent/framework.py create mode 100644 examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py create mode 100644 examples/blackbox_recipes/mini_swe_agent/parallel_infer.py create mode 100644 examples/blackbox_recipes/mini_swe_agent/reward.py create mode 100644 examples/blackbox_recipes/mini_swe_agent/run_agent.py create mode 100644 examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py create mode 100644 examples/blackbox_recipes/sandbox/sandbox.py create mode 100755 examples/blackbox_recipes/scripts/build_tool.sh create mode 100755 examples/blackbox_recipes/scripts/run_infer.sh create mode 100755 examples/blackbox_recipes/scripts/run_train.sh create mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_async.sh create mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_sync.sh diff --git a/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool b/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool new file mode 100644 index 00000000..3d12af4c --- /dev/null +++ b/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool @@ -0,0 +1,21 @@ +# Claude Code sidecar tool image. +# +# Mounted at /opt/claude-code inside the SWE-bench sandbox. + +FROM node:20-bookworm-slim AS builder + +ARG TOOL_VERSION="latest" +ARG NPM_REGISTRY="" + +ENV DISABLE_AUTOUPDATER=1 \ + IS_SANDBOX=1 \ + npm_config_audit=false \ + npm_config_fund=false \ + npm_config_update_notifier=false + +RUN if [ -n "${NPM_REGISTRY}" ]; then npm config set registry "${NPM_REGISTRY}"; fi \ + && npm install -g --prefix /opt/claude-code "@anthropic-ai/claude-code@${TOOL_VERSION}" \ + && /opt/claude-code/bin/claude --version + +FROM scratch +COPY --from=builder /opt/claude-code / diff --git a/examples/blackbox_recipes/claude_code/claude_code_runner.py b/examples/blackbox_recipes/claude_code/claude_code_runner.py new file mode 100644 index 00000000..bee41aaf --- /dev/null +++ b/examples/blackbox_recipes/claude_code/claude_code_runner.py @@ -0,0 +1,232 @@ +"""Claude Code runner for the blackbox SWE-agent recipe.""" + +from __future__ import annotations + +import json +import logging +import os +import shlex +import time + +from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime + +logger = logging.getLogger(__name__) + +DEFAULT_TOOL_IMAGE = "claude-code-tool:latest" +TOOL_TARGET = "/opt/claude-code" + + +def extract_task(raw_prompt) -> str: + if isinstance(raw_prompt, str): + return raw_prompt + return next( + (m["content"] for m in raw_prompt if isinstance(m, dict) and m.get("role") == "user"), + str(raw_prompt), + ) + + +def _extract_issue_text(task: str) -> str: + start = task.find("") + end = task.find("") + if start >= 0 and end > start: + return task[start + len(""):end].strip() + marker = "\nFollow these steps to resolve the issue:" + if marker in task: + return task.split(marker, 1)[0].strip() + return task.strip() + + +def _decode_metadata_list(value) -> list[str]: + if not value: + return [] + if isinstance(value, list): + return [str(item) for item in value] + if isinstance(value, str): + try: + parsed = json.loads(value) + except json.JSONDecodeError: + return [value] + if isinstance(parsed, list): + return [str(item) for item in parsed] + return [str(value)] + + +def build_claude_task(raw_prompt, tools_kwargs: dict | None = None) -> str: + tools_kwargs = tools_kwargs or {} + task = extract_task(raw_prompt) + metadata = ((tools_kwargs.get("reward") or {}).get("metadata") or {}) + issue = metadata.get("problem_statement") or _extract_issue_text(task) + tests = _decode_metadata_list(metadata.get("FAIL_TO_PASS")) + if not tests: + tests = _decode_metadata_list(metadata.get("PASS_TO_PASS"))[:3] + tests_block = "\n".join(f"- {test}" for test in tests) if tests else "- Run the closest relevant tests you identify." + + return ( + "You are fixing a SWE-bench task in /testbed.\n\n" + "Issue:\n" + f"{issue}\n\n" + "Rules:\n" + "- Edit source files only. Do not modify tests.\n" + "- The development environment is already installed; do not install packages unless a test command proves it is necessary.\n" + "- There is no submit tool in this environment. Do not try to submit.\n" + "- Do not create extra edge-case test files after the relevant tests pass.\n" + "- Do not run `pytest --collect-only`, `git log`, or any other command that does not directly validate the fix.\n" + "- Do not analyze unrelated `is_separable` behavior.\n" + "- Do not run additional ad-hoc verification after the listed relevant pytest command passes.\n" + "- Do not commit.\n" + "- After the minimal fix is applied and a relevant pytest command passes, print a one-line summary and exit immediately.\n\n" + "Relevant tests to run after the fix:\n" + f"{tests_block}\n" + ) + + +def build_claude_command( + *, + task: str, + base_url: str, + max_turns: int, + model: str = "default", + permission_mode: str = "bypassPermissions", + conda_env: str | None = "testbed", + disable_web_tools: bool = True, + disable_slash_commands: bool = True, +) -> str: + env = { + "ANTHROPIC_BASE_URL": base_url, + "ANTHROPIC_API_KEY": "not-needed", + "ANTHROPIC_MODEL": model, + "ANTHROPIC_DEFAULT_HAIKU_MODEL": model, + "ANTHROPIC_DEFAULT_SONNET_MODEL": model, + "ANTHROPIC_DEFAULT_OPUS_MODEL": model, + "ANTHROPIC_SMALL_FAST_MODEL": model, + "CLAUDE_CODE_DISABLE_BACKGROUND_TASKS": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + "CLAUDE_CODE_FORK_SUBAGENT": "0", + "CLAUDE_CODE_SUBAGENT_MODEL": model, + "DISABLE_AUTOUPDATER": "1", + "IS_SANDBOX": "1", + } + env_assignments = [f"{key}={shlex.quote(value)}" for key, value in env.items()] + if conda_env: + conda_prefix = f"/opt/miniconda3/envs/{conda_env}" + env_assignments.extend( + [ + f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)}", + f"CONDA_PREFIX={shlex.quote(conda_prefix)}", + f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH", + ] + ) + env_prefix = " ".join(env_assignments) + argv = [ + "/opt/claude-code/bin/claude", + "-p", + task, + "--model", + model, + "--max-turns", + str(max_turns), + "--permission-mode", + permission_mode, + ] + if disable_slash_commands: + argv.append("--disable-slash-commands") + if disable_web_tools: + argv.extend(["--disallowedTools", "Agent", "Task", "WebFetch", "WebSearch"]) + return ( + "unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy; " + "cd /testbed; " + f"{env_prefix} " + + shlex.join(argv) + ) + + +async def _create_claude_sandbox( + *, + image: str, + sidecar_image: str, + gateway_url: str, +): + from examples.swe_agent_blackbox.sandbox import YRSandbox, extract_upstream + + upstream = extract_upstream(gateway_url) if gateway_url else "" + return await YRSandbox.create( + image=image, + sidecar_image=sidecar_image, + sidecar_target=TOOL_TARGET, + upstream=upstream, + ) + + +async def claude_code_runner( + *, + raw_prompt, + session: SessionHandle, + sample_index: int, + session_runtime: SessionRuntime, + tools_kwargs: dict | None = None, + tool_image: str = DEFAULT_TOOL_IMAGE, + run_timeout: int = 7200, + **kwargs, +) -> None: + from examples.swe_agent_blackbox.dataset import extract_image + from examples.swe_agent_blackbox.mini_swe_agent_runner import SandboxEnvForReward + from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env + + tools_kwargs = tools_kwargs or {} + task = build_claude_task(raw_prompt, tools_kwargs) + env_config = tools_kwargs.get("env", {}) + image = extract_image(env_config) + if not image: + raise ValueError(f"No Docker image found in tools_kwargs.env for sample {sample_index}") + + gateway_url = session.base_url + if not gateway_url: + raise ValueError(f"gateway_url is empty for sample {sample_index}") + + sandbox = await _create_claude_sandbox( + image=image, + sidecar_image=tool_image, + gateway_url=gateway_url, + ) + + try: + post_setup_cmd = env_config.get("post_setup_cmd", "") + if post_setup_cmd: + setup_result = await sandbox.run(post_setup_cmd, timeout=120) + if setup_result.exit_code != 0: + logger.warning("post_setup_cmd failed rc=%s: %.300s", setup_result.exit_code, setup_result.stdout + setup_result.stderr) + + from examples.swe_agent_blackbox.sandbox import rewrite_gateway_url + + claude_base_url = rewrite_gateway_url(gateway_url, strip_v1=True) + max_turns = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100")) + agent_cmd = build_claude_command( + task=task, + base_url=claude_base_url, + max_turns=max_turns, + ) + + started_at = time.perf_counter() + result = await sandbox.run(agent_cmd, timeout=int(run_timeout)) + elapsed = time.perf_counter() - started_at + logger.info("[sample %d] claude-code finished rc=%s elapsed=%.1fs", sample_index, result.exit_code, elapsed) + if result.exit_code != 0: + logger.warning( + "[sample %d] claude-code failed stdout_tail=%r stderr_tail=%r", + sample_index, + (result.stdout or "")[-4000:], + (result.stderr or "")[-4000:], + ) + + metadata, eval_timeout = build_reward_context(tools_kwargs) + score, eval_result = await evaluate_in_env(SandboxEnvForReward(sandbox), metadata, eval_timeout) + logger.info("[sample %d] reward done score=%s resolved=%s", sample_index, score, eval_result.get("resolved")) + + reward_info = { + "reward_score": score, + "claude_code_exit_code": result.exit_code, + **eval_result, + } + await session_runtime.complete_session(session.session_id, reward_info=reward_info) + finally: + await sandbox.cleanup() diff --git a/examples/blackbox_recipes/claude_code/config/claude_code.yaml b/examples/blackbox_recipes/claude_code/config/claude_code.yaml new file mode 100644 index 00000000..503fa1da --- /dev/null +++ b/examples/blackbox_recipes/claude_code/config/claude_code.yaml @@ -0,0 +1 @@ +#TODO \ No newline at end of file diff --git a/examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool b/examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool new file mode 100644 index 00000000..a2fba565 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool @@ -0,0 +1,45 @@ +# Mini-swe-agent sidecar tool image. +# +# Contains a self-contained Python venv at /opt/mini-swe-agent with +# mini-swe-agent + litellm installed. When mounted into a sandbox at +# /opt/mini-swe-agent, the agent can be invoked via: +# +# /opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py ... +# +# Uses python-build-standalone for maximum portability across different +# glibc versions (built against older glibc, forward-compatible). +# +# Build: +# docker build -f Dockerfile.mini-swe-agent-tool -t mini-swe-agent-tool:latest . +# + +FROM debian:bullseye-slim AS builder + +ARG PBS_RELEASE="20260602" +ARG PBS_PYTHON="3.12.13" +ARG PIP_INDEX_URL="" + +# Download and extract python-build-standalone (stripped, 32MB) +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates wget \ + && rm -rf /var/lib/apt/lists/* \ + && wget -q \ + "https://github.com/astral-sh/python-build-standalone/releases/download/${PBS_RELEASE}/cpython-${PBS_PYTHON}%2B${PBS_RELEASE}-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" \ + -O /tmp/python.tar.gz \ + && mkdir -p /opt/mini-swe-agent \ + && tar -xzf /tmp/python.tar.gz -C /opt/mini-swe-agent --strip-components=1 \ + && rm /tmp/python.tar.gz + +# Install mini-swe-agent + litellm +RUN /opt/mini-swe-agent/bin/pip install --no-cache-dir \ + ${PIP_INDEX_URL:+-i ${PIP_INDEX_URL}} \ + "mini-swe-agent==2.2.8" \ + "litellm==1.81.7" + +# Copy the in-sandbox runner script +COPY run_agent.py /opt/mini-swe-agent/bin/run_agent.py + +# Final scratch image: files are at the image root level so that when +# akernel_sdk.Mount(target="/opt/mini-swe-agent") overlays this image, +# the files appear at /opt/mini-swe-agent/bin/python etc. +FROM scratch +COPY --from=builder /opt/mini-swe-agent / diff --git a/examples/blackbox_recipes/mini_swe_agent/README.md b/examples/blackbox_recipes/mini_swe_agent/README.md new file mode 100644 index 00000000..b32a637a --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/README.md @@ -0,0 +1,269 @@ +# Mini-SWE-Agent In-Sandbox Execution + +## Overview + +`mini_swe` and `claude_code` both run inside the SWE-bench sandbox through a +sidecar tool image. The external runner creates the sandbox, mounts the selected +tool image, starts the agent process, and evaluates the reward in the same +sandbox. + +For `mini_swe`, the agent executes commands through `LocalEnvironment` (local +bash) inside the sandbox and calls the LLM through the gateway URL passed in via +stdin. For `claude_code`, the runner starts the Claude Code CLI from the sidecar +image and points it at the same Anthropic-compatible gateway. + +The `mini_swe` tool image uses +[python-build-standalone](https://github.com/astral-sh/python-build-standalone) +to build an isolated Python environment. The Claude Code tool image uses a Node +builder to install the Claude Code npm package. Both images use a minimal +`FROM scratch` final stage, so the sandbox base image does not need to provide +Python, Node, or npm for the sidecar tool runtime. + +**Supported runners:** + +| runner | Description | +|--------|-------------| +| `uniagent` | Original SWE-agent runner | +| `mini_swe` | mini-swe-agent sidecar runner | +| `claude_code` | Claude Code sidecar runner; reward is returned through `complete_session(reward_info)` without writing a separate reward JSON file | + +**Supported sandbox types:** + +| Type | Description | +|------|-------------| +| OpenYuanRong (`"openyuanrong"`) | Uses `akernel_sdk.Mount` and `sandbox.commands.run()` | + +At runtime, the selected runner depends directly on its tool image. The tool +image does not need to be extracted into a host directory ahead of time. + +## Architecture + +```text +[Rollouter Host: mini_swe_agent_runner / claude_code_runner] + | + |-- _create_sandbox(image, sidecar_image) + | `-- openyuanrong: Sandbox(mounts=[Mount(target="/opt/", ...)]) + | + |-- sandbox.run("") + | `-- [Inside Sandbox] + | /opt/mini-swe-agent/bin/python3.12 or /opt/claude-code/bin/claude + | stdin <- task config JSON (task, gateway_url, agent) + | commands run inside the SWE-bench sandbox + | stdout -> runner-specific execution result + | + |-- parse agent result + |-- SandboxEnvForReward(sandbox) -> evaluate_in_env() + `-- session_runtime.complete_session(reward_info) +``` + +## Prerequisites + +1. **OpenYuanRong** - set `OPENYUANRONG_SERVER_ADDRESS` and `OPENYUANRONG_TOKEN`. +2. **Runner tool image** - build the selected tool image and push it to a remote + registry if the sandbox service cannot access local Docker images. + +## 1. Build Tool Image + +`mini_swe` and `claude_code` are both injected into the SWE-bench sandbox as +sidecar tool images, but they differ in image contents, mount paths, and +accelerator/mirror options. Use `build_tool.sh` for both runners, and select the +target runner with `--tool` or `TOOL_KIND`. + +| runner | Default tool image | Dockerfile | Sandbox mount path | Image contents | Mirror option | +|--------|--------------------|------------|--------------------|----------------|---------------| +| `mini_swe` | `mini-swe-agent-tool:latest` | `Dockerfile.mini-swe-agent-tool` | `/opt/mini-swe-agent` | Standalone Python 3.12, `mini-swe-agent`, `litellm`, and `run_agent.py` | `--pip-index` / `PIP_INDEX_URL` | +| `claude_code` | `claude-code-tool:latest` | `Dockerfile.claude-code-tool` | `/opt/claude-code` | Claude Code npm package installed by a Node 20 builder | `--npm-registry` / `NPM_REGISTRY` | + +### mini_swe Tool Image + +`mini_swe` is the default build target: + +```bash +# Use the default PyPI source. +bash examples/swe_agent_blackbox/build_tool.sh + +# Use a custom PyPI mirror. +bash examples/swe_agent_blackbox/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ + +# Build and push to a remote registry. +bash examples/swe_agent_blackbox/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong +``` + +The `mini_swe` image uses `python-build-standalone` to build an isolated Python +runtime. The final `FROM scratch` image contains only the files needed under +`/opt/mini-swe-agent`, and it does not depend on the Python version installed in +the sandbox base image. + +After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`: + +```bash +SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \ +RUNNER=mini_swe \ +bash examples/swe_agent_blackbox/scripts/run_infer.sh +``` + +### Claude Code Tool Image + +Claude Code must be selected explicitly with `--tool claude_code`: + +```bash +# Use the default npm registry. +bash examples/swe_agent_blackbox/build_tool.sh --tool claude_code + +# Use a custom npm registry. +bash examples/swe_agent_blackbox/build_tool.sh \ + --tool claude_code \ + --npm-registry https://registry.npmmirror.com + +# Select the Claude Code npm package version. +bash examples/swe_agent_blackbox/build_tool.sh \ + --tool claude_code \ + --tool-version latest + +# Build and push the Claude Code sidecar image. +bash examples/swe_agent_blackbox/build_tool.sh \ + --tool claude_code \ + --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong +``` + +The Claude Code image uses `node:20-bookworm-slim` as the builder stage and +installs `@anthropic-ai/claude-code` into `/opt/claude-code`. The final image is +also a `FROM scratch` sidecar image. At runtime, the runner mounts it into the +sandbox at `/opt/claude-code` and invokes `/opt/claude-code/bin/claude`. + +After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`: + +```bash +SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \ +RUNNER=claude_code \ +bash examples/swe_agent_blackbox/scripts/run_infer.sh +``` + +### Combined Build Options + +`--tool`, image tags, mirrors, and registries can be combined: + +```bash +bash examples/swe_agent_blackbox/build_tool.sh \ + --tool mini_swe \ + --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ \ + --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong +``` + +The build script: + +1. Selects the Dockerfile and default image name from `--tool`: + - `mini_swe` -> `mini-swe-agent-tool:latest` + - `claude_code` -> `claude-code-tool:latest` +2. Tags and pushes the image when `--registry` is provided. + +Both tool images are sidecar runtime dependencies, not SWE-bench task base +images. The `mini_swe` Python runtime is fully isolated from the sandbox +container's Python. The `claude_code` Node/npm dependencies live only under +`/opt/claude-code`, so the sandbox base image does not need Node installed. + +### Build Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `TOOL_IMAGE` | `mini-swe-agent-tool` / `claude-code-tool` | Image name; the default changes with `TOOL_KIND` | +| `TOOL_TAG` | `latest` | Image tag | +| `TOOL_VERSION` | `latest` | Tool package version; for `claude_code`, this selects the `@anthropic-ai/claude-code` npm package version | +| `PIP_INDEX_URL` | unset, use PyPI | pip index URL; equivalent to `--pip-index` | +| `TOOL_KIND` | `mini_swe` | Tool kind: `mini_swe` or `claude_code` | +| `NPM_REGISTRY` | unset, use npm default | npm registry URL; equivalent to `--npm-registry` | + +## 2. Inference With OpenYuanRong Sandbox + +### Using run_infer.sh + +```bash +cd "$(git rev-parse --show-toplevel)" + +RUNNER=mini_swe \ +SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \ +MODEL_PATH=$HOME/models/Qwen3.5-9B \ +DATA_PATH=$HOME/data/swe_agent/r2e_gym.parquet \ +MAX_SAMPLES=1 \ +TP=1 \ +bash examples/swe_agent_blackbox/scripts/run_infer.sh +``` + +### Calling Python Directly + +```bash +python examples/swe_agent_blackbox/parallel_infer.py \ + --model-path ~/models/Qwen3.5-9B \ + --data-path ~/data/swe_agent/r2e_gym.parquet \ + --max-samples 1 \ + --runner mini_swe \ + --max-turns 100 \ + --tensor-parallel-size 1 +``` + +## 3. Inference + +### Environment Variables + +```bash +export OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" +export OPENYUANRONG_TOKEN="" +export DEPLOYMENT=openyuanrong +``` + +### Run mini_swe + +```bash +RUNNER=mini_swe \ +OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \ +OPENYUANRONG_TOKEN="" \ +DEPLOYMENT=openyuanrong \ +SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \ +bash examples/swe_agent_blackbox/scripts/run_infer.sh +``` + +### Run Claude Code + +```bash +RUNNER=claude_code \ +OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \ +OPENYUANRONG_TOKEN="" \ +DEPLOYMENT=openyuanrong \ +SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \ +SWE_AGENT_MAX_TURNS=50 \ +SWE_AGENT_RUN_TIMEOUT=7200 \ +bash examples/swe_agent_blackbox/scripts/run_infer.sh +``` + +## 4. Training (Fully Async) + +```bash +OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \ +OPENYUANRONG_TOKEN="" \ +MODEL_PATH=~/models/Qwen3.5-9B \ +bash examples/swe_agent_blackbox/scripts/run_train_megatron_async.sh +``` + +The training YAML keeps `mini_swe` as the default runner: + +```yaml +agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner +``` + +To run training with Claude Code, keep the YAML unchanged and override the runner +FQN from the launch command: + +```bash +python3 -m verl.experimental.fully_async_policy.fully_async_main \ + --config-path examples/swe_agent_blackbox/config \ + --config-name swe_agent_blackbox_megatron_async \ + actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=examples.swe_agent_blackbox.claude_code_runner.claude_code_runner +``` + +## 5. Configuration + +| Variable | Default | Description | +|----------|---------|-------------| +| `SWE_AGENT_MAX_TURNS` | `100` | Max agent steps | +| `SWE_AGENT_TOOL_IMAGE` | `swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest` | Sidecar tool image | +| `DEBUG_MODE` | (unset) | Set to 1 to enable debug logging | diff --git a/examples/blackbox_recipes/mini_swe_agent/__init__.py b/examples/blackbox_recipes/mini_swe_agent/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml new file mode 100644 index 00000000..b7352b72 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml @@ -0,0 +1,36 @@ +- name: swe_agent + + _target_: uni_agent.agent_loop.UniAgentLoop + concurrency: 64 + log_dir: /tmp/swebench_qwen3_coder + mask_abnormal_exit_traj: false + + interaction: + action_timeout: 300 + max_turns: 100 + + env: + deployment: + type: local + command: /usr/bin/python3 -m swerex.server --auth-token {token} + timeout: 600 + startup_timeout: 600 + container_runtime: docker + env_variables: + PIP_PROGRESS_BAR: "off" + PIP_CACHE_DIR: "~/.cache/pip" + PAGER: "cat" + MANPAGER: "cat" + LESS: "-R" + TQDM_DISABLE: "1" + GIT_PAGER: "cat" + + tool_parser: qwen3_coder + + tools: + - name: str_replace_editor + - name: execute_bash + - name: submit + + reward: + eval_timeout: 600 diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml new file mode 100644 index 00000000..b298c676 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml @@ -0,0 +1,37 @@ +- name: swe_agent + + _target_: uni_agent.agent_loop.UniAgentLoop + concurrency: 64 + log_dir: /tmp/swebench_qwen3_coder + mask_abnormal_exit_traj: false + + interaction: + action_timeout: 300 + max_turns: 100 + + env: + deployment: + type: openyuanrong + command: /opt/swe-rex/bin/python /opt/swe-rex/bin/swerex-remote --host 0.0.0.0 --port {port} --auth-token {token} + timeout: 600 + startup_timeout: 600 + swerex_runtime_image: swr.cn-east-3.myhuaweicloud.com/openyuanrong/swerex-runtime:1.4.0 + swerex_runtime_target: /opt/swe-rex + env_variables: + PIP_PROGRESS_BAR: "off" + PIP_CACHE_DIR: "~/.cache/pip" + PAGER: "cat" + MANPAGER: "cat" + LESS: "-R" + TQDM_DISABLE: "1" + GIT_PAGER: "cat" + + tool_parser: qwen3_coder + + tools: + - name: str_replace_editor + - name: execute_bash + - name: submit + + reward: + eval_timeout: 600 diff --git a/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml b/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml new file mode 100644 index 00000000..0829fdcd --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml @@ -0,0 +1,31 @@ +# Parallel inference config for the blackbox SWE-agent recipe. +# Composes verl's base configs with inference-specific overrides. + +defaults: + - model_engine: dp + - actor@actor_rollout_ref.actor: ${model_engine}_actor + - rollout@actor_rollout_ref.rollout: rollout + - model@actor_rollout_ref.model: hf_model + - reward: reward + - _self_ + +hydra: + searchpath: + - pkg://verl.trainer.config + +actor_rollout_ref: + hybrid_engine: true + nccl_timeout: 600 + model: {} + rollout: + agent: {} + +trainer: + nnodes: 1 + n_gpus_per_node: 8 + logger: + - console + device: cuda + total_epochs: 1 + total_training_steps: null + balance_batch: false diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml new file mode 100644 index 00000000..62b73da1 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml @@ -0,0 +1,123 @@ +# PPO trainer config for the blackbox SWE-agent recipe (v2). +# Uses the generic AgentFrameworkRolloutAdapter + SWEAgentFramework subclass. + +hydra: + searchpath: + - pkg://verl.trainer.config + +defaults: + - ppo_trainer + - _self_ + +actor_rollout_ref: + hybrid_engine: true + nccl_timeout: 600 + + model: + path: ??? + enable_gradient_checkpointing: true + + rollout: + name: vllm + mode: async + prompt_length: 4096 + response_length: 131072 + max_model_len: 135168 + temperature: 1.0 + top_p: 1.0 + n: 8 + tensor_model_parallel_size: 4 + gpu_memory_utilization: 0.7 + calculate_log_probs: true + enable_sleep_mode: true + free_cache_engine: true + + multi_turn: + enable: true + max_assistant_turns: 1 + max_parallel_calls: 1 + format: qwen3_coder + + agent: + num_workers: 8 + agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter + + custom: + agent_framework: + framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework + agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner + gateway_count: 1 + completion_timeout_seconds: 600 + max_concurrent_sessions: 32 + agent_runner_kwargs: + agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml + + actor: + use_dynamic_bsz: true + ppo_mini_batch_size: 16 + use_kl_loss: false + kl_loss_coef: 0.0 + clip_ratio_low: 0.2 + clip_ratio_high: 0.28 + loss_agg_mode: token-mean + optim: + lr: 1e-6 + weight_decay: 0.1 + clip_grad: 1.0 + fsdp_config: + param_offload: true + optimizer_offload: true + grad_offload: true + +data: + train_files: ??? + val_files: ??? + max_prompt_length: 4096 + max_response_length: 131072 + train_batch_size: 128 + val_batch_size: 128 + return_raw_chat: true + trust_remote_code: true + custom_cls: + path: pkg://examples.swe_agent_blackbox.dataset + name: SWEBenchDataset + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + use_kl_in_reward: false + kl_ctrl: + type: fixed + kl_coef: 0.0 + +reward: + custom_reward_function: + path: pkg://examples/swe_agent_blackbox.reward + name: compute_score + +trainer: + use_legacy_worker_impl: disable + nnodes: 1 + n_gpus_per_node: 8 + total_epochs: 10 + project_name: swe_agent_blackbox + experiment_name: swe_agent + logger: + - console + device: cuda + balance_batch: false + val_before_train: true + val_only: false + save_freq: 10 + test_freq: 10 + default_local_dir: checkpoints/swe_agent_blackbox + resume_mode: disable + +ray_kwargs: + ray_init: + runtime_env: + env_vars: + TRANSFER_QUEUE_ENABLE: "" + NCCL_P2P_DISABLE: "1" + NCCL_SHM_DISABLE: "1" diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml new file mode 100644 index 00000000..d25fcce5 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml @@ -0,0 +1,162 @@ +# Megatron + TQ fully-async training config for the blackbox SWE-agent recipe. +# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend. +# +# Entry point: python3 -m verl.experimental.fully_async_policy.fully_async_main +# Requires: transfer_queue.enable=true (selects TQ path in FullyAsyncTaskRunner) + +hydra: + searchpath: + - pkg://verl.trainer.config + +defaults: + - ppo_megatron_trainer + - _self_ + +actor_rollout_ref: + hybrid_engine: false + nccl_timeout: 9600 + + model: + path: ??? + + rollout: + name: vllm + mode: async + prompt_length: 4096 + response_length: 131072 + max_model_len: 135168 + temperature: 1.0 + top_p: 1.0 + n: 8 + tensor_model_parallel_size: 2 + gpu_memory_utilization: 0.7 + calculate_log_probs: true + enable_sleep_mode: true + free_cache_engine: true + enable_chunked_prefill: true + max_num_batched_tokens: 135168 + checkpoint_engine: + backend: nccl + + multi_turn: + enable: true + max_assistant_turns: 1 + max_parallel_calls: 1 + format: qwen3_coder + + agent: + num_workers: 8 + agent_loop_manager_class: uni_agent.trainer.framework.entry.FullyAsyncAgentFrameworkRolloutAdapter + + custom: + agent_framework: + framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework + agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner + gateway_count: 1 + completion_timeout_seconds: 600 + max_concurrent_sessions: 32 + agent_runner_kwargs: + agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml + + actor: + use_dynamic_bsz: true + use_rollout_log_probs: true + ppo_mini_batch_size: 16 + ppo_micro_batch_size_per_gpu: 1 + use_kl_loss: false + kl_loss_coef: 0.0 + clip_ratio_low: 0.2 + clip_ratio_high: 0.28 + clip_ratio_c: 10.0 + loss_agg_mode: token-mean + entropy_coeff: 0 + optim: + lr: 1e-6 + weight_decay: 0.1 + lr_decay_style: constant + megatron: + param_offload: true + grad_offload: true + optimizer_offload: true + tensor_model_parallel_size: 8 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + use_mbridge: true + use_remove_padding: false + + ref: + megatron: + param_offload: false + tensor_model_parallel_size: 8 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + +data: + train_files: ??? + val_files: ??? + prompt_key: prompt + truncation: left + max_prompt_length: 4096 + max_response_length: 131072 + train_batch_size: 0 + gen_batch_size: 1 + return_raw_chat: true + trust_remote_code: true + custom_cls: + path: pkg://examples.swe_agent_blackbox.dataset + name: SWEBenchDataset + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + use_kl_in_reward: false + kl_ctrl: + type: fixed + kl_coef: 0.0 + rollout_correction: + bypass_mode: true + +reward: + custom_reward_function: + path: pkg://examples.swe_agent_blackbox.reward + name: compute_score + +trainer: + nnodes: 1 + n_gpus_per_node: 8 + total_epochs: 10 + project_name: swe_agent_blackbox + experiment_name: swe_agent + logger: + - console + device: cuda + val_before_train: true + val_only: false + save_freq: 10 + test_freq: 10 + default_local_dir: checkpoints/swe_agent_blackbox + resume_mode: auto + +rollout: + nnodes: 1 + n_gpus_per_node: 8 + total_rollout_steps: 100000 + +async_training: + use_trainer_do_validate: false + staleness_threshold: 1.0 + trigger_parameter_sync_step: 4 + require_batches: 1 + partial_rollout: true + +transfer_queue: + enable: true + +ray_kwargs: + ray_init: + runtime_env: + env_vars: + TRANSFER_QUEUE_ENABLE: "" + NCCL_P2P_DISABLE: "1" + NCCL_SHM_DISABLE: "1" diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml new file mode 100644 index 00000000..65b09b1a --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml @@ -0,0 +1,129 @@ +# Megatron sync training config for the blackbox SWE-agent recipe. +# Uses main_ppo_sync + Megatron backend, same blackbox infrastructure as FSDP. +# +# Entry point: python3 -m verl.trainer.main_ppo_sync + +hydra: + searchpath: + - pkg://verl.trainer.config + +defaults: + - ppo_megatron_trainer + - _self_ + +actor_rollout_ref: + hybrid_engine: true + nccl_timeout: 600 + + model: + path: ??? + enable_gradient_checkpointing: true + + rollout: + name: vllm + mode: async + prompt_length: 4096 + response_length: 131072 + max_model_len: 135168 + temperature: 1.0 + top_p: 1.0 + n: 8 + tensor_model_parallel_size: 4 + gpu_memory_utilization: 0.7 + calculate_log_probs: true + enable_sleep_mode: true + free_cache_engine: true + + multi_turn: + enable: true + max_assistant_turns: 1 + max_parallel_calls: 1 + format: qwen3_coder + + agent: + num_workers: 8 + agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter + + custom: + agent_framework: + framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework + agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner + gateway_count: 1 + completion_timeout_seconds: 600 + max_concurrent_sessions: 32 + agent_runner_kwargs: + agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml + + actor: + use_dynamic_bsz: true + ppo_mini_batch_size: 16 + use_kl_loss: false + kl_loss_coef: 0.0 + clip_ratio_low: 0.2 + clip_ratio_high: 0.28 + loss_agg_mode: token-mean + optim: + lr: 1e-6 + weight_decay: 0.1 + clip_grad: 1.0 + megatron: + param_offload: true + grad_offload: true + optimizer_offload: true + tensor_model_parallel_size: 8 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + use_mbridge: true + +data: + train_files: ??? + val_files: ??? + max_prompt_length: 4096 + max_response_length: 131072 + train_batch_size: 128 + val_batch_size: 128 + return_raw_chat: true + trust_remote_code: true + custom_cls: + path: pkg://examples.swe_agent_blackbox.dataset + name: SWEBenchDataset + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + use_kl_in_reward: false + kl_ctrl: + type: fixed + kl_coef: 0.0 + +reward: + custom_reward_function: + path: pkg://examples.swe_agent_blackbox.reward + name: compute_score + +trainer: + use_legacy_worker_impl: disable + nnodes: 1 + n_gpus_per_node: 8 + total_epochs: 10 + project_name: swe_agent_blackbox + experiment_name: swe_agent + logger: + - console + device: cuda + balance_batch: false + val_before_train: true + val_only: false + save_freq: 10 + test_freq: 10 + default_local_dir: checkpoints/swe_agent_blackbox + resume_mode: disable + +ray_kwargs: + ray_init: + runtime_env: + env_vars: + TRANSFER_QUEUE_ENABLE: "" + NCCL_P2P_DISABLE: "1" + NCCL_SHM_DISABLE: "1" diff --git a/examples/blackbox_recipes/mini_swe_agent/dataset.py b/examples/blackbox_recipes/mini_swe_agent/dataset.py new file mode 100644 index 00000000..89d65129 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/dataset.py @@ -0,0 +1,34 @@ +"""SWEBench-specific dataset that injects verl-standard reward fields.""" + +from verl.utils.dataset.rl_dataset import RLHFDataset + + +def extract_image(env_config: dict) -> str: + """Extract Docker image from env config, supporting both flat and nested formats. + + Flat: env_config["image"] + Nested: env_config["deployment"]["image"] + """ + image = env_config.get("image") + if image: + return image + deployment = env_config.get("deployment") + if isinstance(deployment, dict): + image = deployment.get("image") + if image: + return image + return "" + + +class SWEBenchDataset(RLHFDataset): + + def __getitem__(self, item): + row_dict = super().__getitem__(item) + extra_info = row_dict.get("extra_info", {}) + tools_kwargs = extra_info.get("tools_kwargs", {}) + reward_config = tools_kwargs.get("reward", {}) + + row_dict.setdefault("data_source", reward_config.get("name", "unknown")) + row_dict.setdefault("reward_model", {"ground_truth": {}}) + + return row_dict diff --git a/examples/blackbox_recipes/mini_swe_agent/framework.py b/examples/blackbox_recipes/mini_swe_agent/framework.py new file mode 100644 index 00000000..7c5c027c --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/framework.py @@ -0,0 +1,105 @@ +"""SWE-agent specific framework subclass. + +Injects reward_info (from agent_runner's complete_session call) +into sample_fields["extra_info"] so the reward worker's +compute_score can access it via extra_info. + +Overrides _run_session to execute agent_runner in a separate Ray worker +process, preventing blocking operations from stalling the event loop. +""" + +from __future__ import annotations + +import asyncio +import functools +import logging +from dataclasses import replace +from uuid import uuid4 + +import ray + +from uni_agent.trainer.framework.framework import OpenAICompatibleAgentFramework + +from examples.swe_agent_blackbox.subprocess_runner import remote_agent_run + +logger = logging.getLogger(__name__) + + +class SWEAgentFramework(OpenAICompatibleAgentFramework): + + async def _score_trajectories(self, session_trajectories, sample_fields): + if session_trajectories and session_trajectories[-1].reward_info: + reward_info = session_trajectories[-1].reward_info + extra_info = dict(sample_fields.get("extra_info") or {}) + sample_fields = {**sample_fields, "extra_info": {**extra_info, **reward_info}} + return await super()._score_trajectories(session_trajectories, sample_fields) + + def _resolve_runner(self) -> tuple[str, dict]: + """Extract FQN and pre-bound kwargs from self.agent_runner. + + self.agent_runner may be a functools.partial (from_config wraps it), + so we unpack the original function and its keywords. + """ + fn = self.agent_runner + kwargs = {} + if isinstance(fn, functools.partial): + kwargs = dict(fn.keywords) + fn = fn.func + fqn = f"{fn.__module__}.{fn.__qualname__}" + return fqn, kwargs + + async def _run_session( + self, + *, + prompts, + raw_prompt, + sample_index: int, + session_id: str | None = None, + runner_kwargs: dict | None = None, + ): + """Run agent_runner in a Ray worker process instead of in-process.""" + session_id = session_id or f"session-{sample_index}-0-{uuid4().hex}" + sample_fields = self._extract_sample_fields(prompts=prompts, sample_index=sample_index) + session = await self.session_runtime.create_session(session_id) + agent_runner_fqn, resolved_kwargs = self._resolve_runner() + + try: + if runner_kwargs: + resolved_kwargs = {**resolved_kwargs, **runner_kwargs} + + ref = remote_agent_run.remote( + agent_runner_fqn=agent_runner_fqn, + raw_prompt=raw_prompt, + session_id=session_id, + base_url=session.base_url, + sample_index=sample_index, + runner_kwargs=resolved_kwargs, + ) + loop = asyncio.get_running_loop() + reward_info = await loop.run_in_executor(None, ray.get, ref) + + await self.session_runtime.complete_session( + session_id, reward_info=reward_info, + ) + session_trajectories = await self.session_runtime.finalize_session(session_id) + + except Exception as e: + logger.error("_run_session failed: session=%s, sample=%d, runner=%s: %s", + session_id, sample_index, agent_runner_fqn, e, exc_info=True) + await self.session_runtime.abort_session(session_id) + raise + + if not self.reward_loop_worker_handles or not session_trajectories: + return session_trajectories, sample_fields + + annotations = await self._score_trajectories(session_trajectories, sample_fields) + scored_trajectories = [] + for traj, (score, extra) in zip(session_trajectories, annotations, strict=True): + scored_trajectories.append( + replace( + traj, + reward_score=score, + extra_fields={**traj.extra_fields, "reward_extra_info": extra}, + ) + ) + return scored_trajectories, sample_fields diff --git a/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py new file mode 100644 index 00000000..33882bc8 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py @@ -0,0 +1,227 @@ +"""Mini-swe-agent runner for the blackbox SWE-agent recipe. + +Agent runs inside a OpenYuanRong remote sandbox via sidecar tool image mount. +The runner creates the sandbox, pipes task config via stdin, parses +the result from stdout, and evaluates reward in the same sandbox. +""" + +from __future__ import annotations + +import base64 +import json +import logging +import os +import shlex +import time +from pathlib import Path + +from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime + +from examples.swe_agent_blackbox.dataset import extract_image +from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env +from examples.swe_agent_blackbox.sandbox import CommandResult, YRSandbox, extract_upstream, rewrite_gateway_url + +logger = logging.getLogger(__name__) +if os.environ.get("DEBUG_MODE"): + logger.setLevel(logging.DEBUG) + +DEFAULT_TOOL_IMAGE = "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest" + + +class SandboxEnvForReward: + """Adapts :class:`YRSandbox` to the async env interface used by + reward specs (``communicate``, ``write_file``, ``read_file``). + """ + + def __init__(self, sandbox): + self._sandbox = sandbox + + async def communicate(self, input: str, timeout=600, check="ignore", error_msg="Command failed") -> str: + result = await self._sandbox.run(input, timeout=int(timeout)) + if check == "raise" and result.exit_code != 0: + raise RuntimeError(f"{error_msg}: {result.stdout[:200]}") + return result.stdout + + async def write_file(self, path: str | Path, content: str) -> None: + encoded = base64.b64encode(content.encode()).decode() + await self.communicate(f"echo {encoded} | base64 -d > {path}", check="raise", error_msg=f"write {path}") + + async def read_file(self, path: str | Path, **_) -> str: + return await self.communicate(f"cat {path}") + + +def _extract_task(raw_prompt) -> str: + """Extract task text from raw_prompt (str or message list).""" + if isinstance(raw_prompt, str): + return raw_prompt + return next( + (m["content"] for m in raw_prompt if isinstance(m, dict) and m.get("role") == "user"), + str(raw_prompt), + ) + + +def _build_task_config( + *, + task: str, + gateway_url: str, +) -> dict: + """Build the task config passed to run_agent.py via stdin.""" + agent_gateway_url = rewrite_gateway_url(gateway_url) + step_limit = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100")) + return { + "task": task, + "gateway_url": agent_gateway_url, + "agent": { + "step_limit": step_limit, + }, + } + + +def build_agent_command( + *, + config_b64: str, + conda_env: str = "testbed", +) -> str: + """Build the command that runs run_agent.py inside the sandbox.""" + conda_prefix = f"/opt/miniconda3/envs/{conda_env}" + env_prefix = ( + f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)} " + f"CONDA_PREFIX={shlex.quote(conda_prefix)} " + f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH" + ) + return ( + "unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy; " + f"{env_prefix} " + f"echo {config_b64} | base64 -d | " + "/opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py" + ) + + +async def mini_swe_agent_runner( + *, + raw_prompt, + session: SessionHandle, + sample_index: int, + session_runtime: SessionRuntime, + tools_kwargs: dict | None = None, + tool_image: str = DEFAULT_TOOL_IMAGE, + run_timeout: int = 7200, + conda_env: str = "testbed", + **kwargs, +) -> None: + """Run mini-swe-agent inside a sandbox with sidecar tool mount. + + Flow: + 1. Create OpenYuanRong remote sandbox with mini-swe-agent sidecar + 2. Pipe task config to run_agent.py via stdin + 3. Parse agent result from stdout + 4. Evaluate reward in the same sandbox + 5. Complete session with reward_info + """ + tools_kwargs = tools_kwargs or {} + logger.info("mini_swe_agent_runner called, sample_index=%d", sample_index) + + # Extract task text and sandbox config (image from parquet) + task = _extract_task(raw_prompt) + logger.info("task extracted, %d chars", len(task)) + + env_config = tools_kwargs.get("env", {}) + image = extract_image(env_config) + if not image: + raise ValueError(f"No sandbox image found in tools_kwargs.env for sample {sample_index}") + + # Gateway URL — extract upstream for OpenYuanRong tunnel + gateway_url = session.base_url + if not gateway_url: + raise ValueError(f"gateway_url is empty for sample {sample_index}") + + upstream = extract_upstream(gateway_url) + sandbox = await YRSandbox.create( + image=image, sidecar_image=tool_image, upstream=upstream, + ) + sandbox_id = sandbox.sandbox_id + logger.info("Sandbox created (image=%s, sandbox_id=%s)", image, sandbox_id) + + # Build task config (gateway URL rewritten to sandbox-internal tunnel) + task_config = _build_task_config( + task=task, + gateway_url=gateway_url, + ) + + try: + # Run post_setup_cmd if provided (e.g. git checkout correct commit) + post_setup_cmd = env_config.get("post_setup_cmd", "") + if post_setup_cmd: + logger.info("Running post_setup_cmd (%d chars)...", len(post_setup_cmd)) + r = await sandbox.run(post_setup_cmd, timeout=600) + if r.exit_code != 0: + logger.warning("post_setup_cmd failed (rc=%d): %s", r.exit_code, r.stdout[:200]) + else: + logger.info("post_setup_cmd done") + + # Run agent inside sandbox — pipe config via base64-encoded stdin. + config_b64 = base64.b64encode(json.dumps(task_config).encode()).decode() + agent_cmd = build_agent_command(config_b64=config_b64, conda_env=conda_env) + logger.debug("[sample %d] starting agent inside sandbox", sample_index) + t0 = time.perf_counter() + agent_result = await sandbox.run(agent_cmd, timeout=int(run_timeout)) + elapsed = time.perf_counter() - t0 + logger.debug( + "[sample %d] agent process finished: rc=%d (%.1fs)", + sample_index, agent_result.exit_code, elapsed, + ) + + # Parse agent result from stdout + agent_info = _parse_agent_result(agent_result.stdout, sample_index) + logger.info( + "[sample %d] agent: exit_status=%s, submission=%d chars", + sample_index, agent_info.get("exit_status"), + len(agent_info.get("submission", "")), + ) + + # Evaluate reward in the same sandbox + metadata, eval_timeout = build_reward_context(tools_kwargs) + t0 = time.perf_counter() + reward_env = SandboxEnvForReward(sandbox) + score, eval_result = await evaluate_in_env(reward_env, metadata, eval_timeout) + logger.debug( + "[sample %d] reward done: score=%s, resolved=%s (%.1fs)", + sample_index, score, eval_result.get("resolved"), time.perf_counter() - t0, + ) + + reward_info = {"reward_score": score, **eval_result} + await session_runtime.complete_session(session.session_id, reward_info=reward_info) + + except Exception as e: + logger.warning("Mini-swe-agent runner failed for sample %d (sandbox_id=%s): %s", sample_index, sandbox_id, e) + raise + finally: + try: + await sandbox.cleanup() + except Exception: + pass + + +def _parse_agent_result(stdout: str, sample_index: int) -> dict: + """Parse agent result JSON from run_agent.py stdout. + + litellm may print error messages to stdout, polluting the output. + The last line starting with '{' is the result JSON. + """ + stdout = stdout.strip() + if not stdout: + return {"exit_status": "error", "submission": ""} + # Try the last line that looks like JSON first + lines = [l.strip() for l in stdout.split("\n") if l.strip()] + for line in reversed(lines): + if line.startswith("{"): + try: + return json.loads(line) + except json.JSONDecodeError: + continue + # Fallback: try entire stdout + try: + return json.loads(stdout) + except json.JSONDecodeError: + logger.warning("[sample %d] Failed to parse agent result (full stdout): %s", sample_index, stdout[:1000]) + return {"exit_status": "error", "submission": ""} diff --git a/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py new file mode 100644 index 00000000..c74765e0 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py @@ -0,0 +1,447 @@ +"""Parallel inference runner for the blackbox SWE-agent recipe (v2). + +Creates an LLM server, GatewayServingRuntime, and SWEAgentFramework, +then runs agent sessions in parallel and reports resolve rate. + +Usage (CLI): + python examples/swe_agent_blackbox/parallel_infer.py \ + --model-path ~/models/Qwen3-Coder-30B-A3B-Instruct \ + --data-path ~/data/swe_agent/swe_bench_verified.parquet \ + --max-samples 10 +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import logging +import os +from functools import partial +from typing import Any +from uuid import uuid4 + +import numpy as np +import ray + +from verl import DataProto +from verl.protocol import pad_dataproto_to_divisor +from verl.utils import hf_tokenizer +from verl.utils.transferqueue_utils import tq as _tq_mock +from verl.workers.rollout.llm_server import LLMServerManager + +from uni_agent.trainer.gateway.runtime import GatewayServingRuntime + +from examples.swe_agent_blackbox.framework import SWEAgentFramework +from examples.swe_agent_blackbox.agent_runner import swe_agent_runner +from examples.swe_agent_blackbox.claude_code_runner import claude_code_runner + +try: + from examples.swe_agent_blackbox.mini_swe_agent_runner import mini_swe_agent_runner +except ImportError: + mini_swe_agent_runner = None + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + level=os.getenv("VERL_LOGGING_LEVEL", "INFO"), + force=True, +) +logger = logging.getLogger(__name__) + + +# ===================================================================== +# Dataset loading (inlined from dataset.py — only used here) +# ===================================================================== + + +def _remap_image_to_local(image_name: str) -> str: + parts = image_name.split("/") + if len(parts) > 1 and "." in parts[0]: + basename = parts[-1] + else: + basename = image_name + basename = basename.replace("_1776_", "__") + if ":" in basename: + basename = basename.rsplit(":", 1)[0] + return f"{basename}:latest" + + +def _remap_sample_images(sample: dict[str, Any]) -> dict[str, Any]: + extra_info = sample.get("extra_info") + if not extra_info: + return sample + tools_kwargs = extra_info.get("tools_kwargs", {}) + env = tools_kwargs.get("env", {}) + image = env.get("image") + if not image: + return sample + local_image = _remap_image_to_local(image) + if local_image != image: + logger.debug("Remapping image: %s -> %s", image, local_image) + env["image"] = local_image + return sample + + +def _inject_reward_fields(sample: dict[str, Any]) -> None: + """Inject verl-standard data_source and reward_model from extra_info.tools_kwargs.reward.""" + extra_info = sample.get("extra_info", {}) + tools_kwargs = extra_info.get("tools_kwargs", {}) + reward_config = tools_kwargs.get("reward", {}) + sample.setdefault("data_source", reward_config.get("name", "unknown")) + sample.setdefault("reward_model", {"ground_truth": {}}) + + +def load_swe_dataset(data_path: str | list[str], max_samples: int = -1) -> list[dict[str, Any]]: + import pyarrow.parquet as pq + + if isinstance(data_path, list): + paths = [os.path.expanduser(p) for p in data_path] + else: + paths = os.path.expanduser(data_path) + + logger.info("Loading dataset from: %s", data_path) + if isinstance(paths, list): + import pyarrow as pa + tables = [pq.read_table(p) for p in paths] + table = pa.concat_tables(tables) + else: + table = pq.read_table(paths) + samples = table.to_pylist() + + for i, sample in enumerate(samples): + samples[i] = _remap_sample_images(sample) + _inject_reward_fields(samples[i]) + + if max_samples > 0: + samples = samples[:max_samples] + logger.info("Using first %d samples (max_samples=%d)", len(samples), max_samples) + + logger.info("Loaded %d samples from %s", len(samples), data_path) + return samples + + +class _MockReplayBuffer: + """Minimal replay buffer for inference mode (no actual training).""" + + def add(self, partition_id, items): + pass + + +def run_inference( + *, + model_path: str, + data_path: str, + prompt_length: int = 4096, + response_length: int = 65536, + temperature: float = 0.8, + top_p: float = 0.9, + n: int = 1, + max_samples: int = -1, + engine: str = "vllm", + nnodes: int = 1, + n_gpus_per_node: int = 8, + tensor_parallel_size: int = 4, + gateway_count: int = 1, + max_concurrent_sessions: int = 2, + completion_timeout: float = 600.0, + tool_parser: str | None = None, + agent_config_path: str | None = None, + runner: str = "uniagent", + tool_image: str | None = None, + run_timeout: int = 7200, +) -> dict[str, Any]: + """Run parallel SWE-agent inference using the blackbox framework.""" + if runner == "mini_swe": + if mini_swe_agent_runner is None: + raise ImportError("mini-swe-agent is required for --runner mini_swe. Install with: pip install mini-swe-agent") + _agent_runner = partial( + mini_swe_agent_runner, + tool_image=tool_image or "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest", + run_timeout=run_timeout, + ) + elif runner == "claude_code": + _agent_runner = partial( + claude_code_runner, + tool_image=tool_image or "claude-code-tool:latest", + run_timeout=run_timeout, + ) + else: + _agent_runner = swe_agent_runner + + if not ray.is_initialized(): + ray.init() + + # 1. Init Hydra config + config = _init_hydra_config( + model_path=model_path, + engine=engine, + prompt_length=prompt_length, + response_length=response_length, + temperature=temperature, + top_p=top_p, + n=n, + nnodes=nnodes, + n_gpus_per_node=n_gpus_per_node, + tensor_parallel_size=tensor_parallel_size, + ) + + # 2. Load dataset + samples = load_swe_dataset(data_path, max_samples=max_samples) + logger.info( + "Loaded %d samples, %d rollout(s) each, runner=%s, gateway_count=%d, max_concurrent_sessions=%d", + len(samples), + n, + runner, + gateway_count, + max_concurrent_sessions, + ) + + if not samples: + raise ValueError("No samples to process") + + # 3. Create LLM server + logger.info("Initializing LLM server manager...") + llm_server_manager = LLMServerManager.create(config=config) + + # 4. Create GatewayServingRuntime + logger.info("Using tool_parser=%r", tool_parser) + + llm_client = llm_server_manager.get_client() + gateway_actor_kwargs = { + "tokenizer": hf_tokenizer(os.path.expanduser(model_path)), + "base_sampling_params": {"temperature": temperature, "top_p": top_p, "max_tokens": response_length}, + } + if tool_parser: + gateway_actor_kwargs["tool_parser_name"] = tool_parser + + gateway_runtime = GatewayServingRuntime( + llm_client=llm_client, + gateway_count=gateway_count, + gateway_actor_kwargs=gateway_actor_kwargs, + ) + + # 5. Create RewardLoopWorker for compute_score + from verl.experimental.reward_loop.reward_loop import RewardLoopWorker + reward_worker = ray.remote(RewardLoopWorker).remote(config, None) + + # 6. Create framework + framework = SWEAgentFramework( + session_runtime=gateway_runtime, + agent_runner=_agent_runner, + replay_buffer=_MockReplayBuffer(), + rollout_config={"n": n, "val_kwargs": {"n": n}}, + completion_timeout=completion_timeout, + wait_for_completion_after_agent_run=True, + max_concurrent_sessions=max_concurrent_sessions, + reward_loop_worker_handles=[reward_worker], + ) + + # 6. Build batch data and run + _tools_kwargs_list = [] + for sample in samples: + tk = (sample.get("extra_info") or {}).get("tools_kwargs", {}) + if runner == "uniagent" and agent_config_path: + tk["agent_config_path"] = agent_config_path + tk["model_path"] = os.path.expanduser(model_path) + _tools_kwargs_list.append(tk) + + from tensordict import TensorDict + from verl.utils import tensordict_utils as _tu + + raw_prompts = [sample["prompt"] for sample in samples] + uids = [str(uuid4()) for _ in samples] + td = TensorDict({"uid": uids, "global_steps": [0] * len(samples)}, batch_size=[len(samples)]) + _tu.assign_non_tensor_stack(td, "raw_prompt", raw_prompts) + _tu.assign_non_tensor_stack(td, "tools_kwargs", _tools_kwargs_list) + _tu.assign_non_tensor_stack(td, "data_source", [sample["data_source"] for sample in samples]) + _tu.assign_non_tensor_stack(td, "reward_model", [sample["reward_model"] for sample in samples]) + + batch = DataProto(batch=td, meta_info={}).repeat(n) + + size_divisor = gateway_count + batch_padded, pad_size = pad_dataproto_to_divisor(batch, size_divisor) + logger.info("Starting %d agent session(s)...", len(batch_padded)) + + _tq_store: dict[str, Any] = {} + + async def _dummy_kv_put(key, partition_id=None, tag=None, **kwargs): + _tq_store[key] = tag + + async def _dummy_kv_batch_put(keys=None, fields=None, tags=None, partition_id=None, **kwargs): + for i, key in enumerate(keys): + _tq_store[key] = {"fields": fields, "tag": tags[i] if tags else None} + + _tq_mock.async_kv_put = _dummy_kv_put + _tq_mock.async_kv_batch_put = _dummy_kv_batch_put + + async def _generate(): + return await framework.generate_sequences(batch_padded.batch) + + try: + stats = asyncio.run(_generate()) + except RuntimeError as e: + logger.warning("generate_sequences failed: %s", e) + stats = {} + + # 7. Collect scores + uid_to_sample_idx = {uid: i for i, uid in enumerate(uids)} + per_sample_scores = [0.0] * len(samples) + sample_trajectory_counts = [0] * len(samples) + for key, value in _tq_store.items(): + if not isinstance(value, dict) or "fields" not in value: + continue + fields = value["fields"] + rm_scores = fields.get("rm_scores", None) + if rm_scores is None: + continue + # Key format: {uid}_{session_index}_{index} + uid = key.rsplit("_", 2)[0] + sample_idx = uid_to_sample_idx.get(uid) + if sample_idx is None: + continue + score = float(rm_scores.float()[-1, -1].item()) + per_sample_scores[sample_idx] += score + sample_trajectory_counts[sample_idx] += 1 + + for i in range(len(samples)): + if sample_trajectory_counts[i] > 0: + per_sample_scores[i] /= sample_trajectory_counts[i] + + resolved_count = sum(1 for s in per_sample_scores if s > 0) + overall_mean = float(np.mean(per_sample_scores)) if per_sample_scores else 0.0 + logger.info( + "Resolved %d / %d samples (%.2f%%), mean score: %.4f", + resolved_count, len(samples), 100.0 * resolved_count / max(len(samples), 1), overall_mean, + ) + + # 8. Cleanup + asyncio.run(gateway_runtime.shutdown()) + + return { + "stats": stats, + "mean_score": overall_mean, + "per_sample_scores": per_sample_scores, + } + + +# ===================================================================== +# Helpers +# ===================================================================== + + +def _init_hydra_config( + *, + model_path: str, + engine: str, + prompt_length: int, + response_length: int, + temperature: float, + top_p: float, + n: int, + nnodes: int, + n_gpus_per_node: int, + tensor_parallel_size: int, +) -> Any: + """Initialize Hydra config with rollout/model settings.""" + from hydra import compose, initialize_config_dir + from omegaconf import OmegaConf + + config_dir = os.path.abspath("examples/swe_agent_blackbox/config") + with initialize_config_dir(config_dir=config_dir, version_base=None): + config = compose(config_name="parallel_infer") + + config.actor_rollout_ref.model.path = os.path.expanduser(model_path) + config.actor_rollout_ref.rollout.name = engine + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.prompt_length = prompt_length + config.actor_rollout_ref.rollout.response_length = response_length + config.actor_rollout_ref.rollout.max_model_len = prompt_length + response_length + 1024 + config.actor_rollout_ref.rollout.n = n + config.actor_rollout_ref.rollout.tensor_model_parallel_size = tensor_parallel_size + config.actor_rollout_ref.rollout.gpu_memory_utilization = float(os.getenv("ROLLOUT_GPU_MEM_UTIL", "0.5")) + config.actor_rollout_ref.rollout.temperature = temperature + config.actor_rollout_ref.rollout.top_p = top_p + config.actor_rollout_ref.rollout.val_kwargs.temperature = temperature + config.actor_rollout_ref.rollout.val_kwargs.top_p = top_p + config.actor_rollout_ref.rollout.calculate_log_probs = True + config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns = 100 + config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 1 + config.actor_rollout_ref.rollout.nnodes = nnodes + config.actor_rollout_ref.rollout.n_gpus_per_node = n_gpus_per_node + config.trainer.nnodes = nnodes + config.trainer.n_gpus_per_node = n_gpus_per_node + + config.reward.custom_reward_function.path = "pkg://examples.swe_agent_blackbox.reward" + config.reward.custom_reward_function.name = "compute_score" + config.reward.num_workers = 1 + + OmegaConf.set_struct(config.actor_rollout_ref.rollout, False) + config.actor_rollout_ref.rollout.enable_sleep_mode = False + config.actor_rollout_ref.rollout.enforce_eager = os.getenv("ROLLOUT_ENFORCE_EAGER", "0") == "1" + OmegaConf.set_struct(config.actor_rollout_ref.rollout, True) + return config + + +# ===================================================================== +# CLI entry point +# ===================================================================== + + +def main(): + parser = argparse.ArgumentParser(description="SWE-Agent Blackbox Parallel Inference") + parser.add_argument("--data-path", type=str, default="~/data/swe_agent/swe_bench_verified.parquet") + parser.add_argument("--model-path", "--model", type=str, default="~/models/Qwen3-Coder-30B-A3B-Instruct") + parser.add_argument("--max-turns", type=int, default=100) + parser.add_argument("--prompt-length", type=int, default=4096) + parser.add_argument("--response-length", type=int, default=65536) + parser.add_argument("--temperature", type=float, default=0.8) + parser.add_argument("--top-p", type=float, default=0.9) + parser.add_argument("--n", type=int, default=1) + parser.add_argument("--max-samples", type=int, default=-1) + parser.add_argument("--engine", type=str, default="vllm", choices=["vllm", "sglang"]) + parser.add_argument("--nnodes", type=int, default=1) + parser.add_argument("--n-gpus-per-node", type=int, default=8) + parser.add_argument("--tensor-parallel-size", "--tp", type=int, default=4) + parser.add_argument("--gateway-count", type=int, default=1) + parser.add_argument("--max-concurrent-sessions", type=int, default=2) + parser.add_argument("--tool-parser", type=str, default="qwen3_coder") + parser.add_argument("--tool-image", type=str, default=None) + parser.add_argument("--run-timeout", type=int, default=7200) + parser.add_argument( + "--runner", type=str, default="uniagent", choices=["uniagent", "mini_swe", "claude_code"], + help="Agent runner: 'uniagent', 'mini_swe', or 'claude_code'.", + ) + parser.add_argument( + "--agent-config-path", type=str, + default="examples/swe_agent_blackbox/config/agent_config.yaml", + help="Path to agent config YAML.", + ) + args = parser.parse_args() + + os.environ["SWE_AGENT_MAX_TURNS"] = str(args.max_turns) + + run_inference( + model_path=args.model_path, + data_path=args.data_path, + prompt_length=args.prompt_length, + response_length=args.response_length, + temperature=args.temperature, + top_p=args.top_p, + n=args.n, + max_samples=args.max_samples, + engine=args.engine, + nnodes=args.nnodes, + n_gpus_per_node=args.n_gpus_per_node, + tensor_parallel_size=args.tensor_parallel_size, + gateway_count=args.gateway_count, + max_concurrent_sessions=args.max_concurrent_sessions, + tool_parser=args.tool_parser, + agent_config_path=args.agent_config_path, + runner=args.runner, + tool_image=args.tool_image, + run_timeout=args.run_timeout, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/blackbox_recipes/mini_swe_agent/reward.py b/examples/blackbox_recipes/mini_swe_agent/reward.py new file mode 100644 index 00000000..61da218b --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/reward.py @@ -0,0 +1,74 @@ +"""Reward utilities for the blackbox SWE-agent recipe. + +Contains: +- build_reward_context: extract reward metadata + eval_timeout from tools_kwargs +- compute_score: thin reward function that reads reward_score from extra_info +- evaluate_in_env: run reward evaluation in Docker env (shared by both runners) +""" + +from __future__ import annotations + +import logging +import os +from typing import Any + +logger = logging.getLogger(__name__) + + +def build_reward_context(tools_kwargs: dict) -> tuple[dict[str, Any], int]: + """Extract reward metadata and eval_timeout from per-sample tools_kwargs.""" + reward_config = tools_kwargs.get("reward", {}) + metadata = { + "data_source": reward_config.get("name", "unknown"), + "reward_model": reward_config.get("metadata", {}), + } + eval_timeout = int(os.environ.get("SWE_AGENT_EVAL_TIMEOUT", "600")) + return metadata, eval_timeout + + +def compute_score(data_source: str, solution_str: str, ground_truth: str, extra_info=None) -> dict: + """Read reward_score from extra_info, injected by SWEAgentFramework.""" + score = 0.0 + if extra_info and "reward_score" in extra_info: + score = float(extra_info["reward_score"]) + return {"score": score} + + +def _get_reward_spec(data_source: str): + """Load reward spec class by data_source name.""" + from uni_agent.reward.registry import REWARD_SPEC_REGISTRY, _load_reward_spec_module + + if data_source not in REWARD_SPEC_REGISTRY: + _load_reward_spec_module(data_source) + cls = REWARD_SPEC_REGISTRY.get(data_source) + if cls is None: + raise ValueError(f"Unknown data_source: {data_source}. Available: {list(REWARD_SPEC_REGISTRY.keys())}") + return cls + + +async def evaluate_in_env( + env, + metadata: dict[str, Any], + eval_timeout: int = 600, +) -> tuple[float, dict]: + """Run reward evaluation in the Docker env. + + Returns (score, eval_result) where score is 1.0/0.0 and + eval_result contains details (eval_completed, resolved, etc.). + """ + data_source = metadata.get("data_source", "unknown") + reward_model = metadata.get("reward_model", {}) + + spec_cls = _get_reward_spec(data_source) + spec_metadata = reward_model.get("ground_truth", reward_model) + + spec = spec_cls( + run_id="swe_bb_eval", + metadata=spec_metadata, + env=env, + eval_timeout=eval_timeout, + ) + + resolved, result = await spec.compute_reward() + score = 1.0 if resolved else 0.0 + return score, result diff --git a/examples/blackbox_recipes/mini_swe_agent/run_agent.py b/examples/blackbox_recipes/mini_swe_agent/run_agent.py new file mode 100644 index 00000000..c5a4b165 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/run_agent.py @@ -0,0 +1,106 @@ +#!/opt/mini-swe-agent/bin/python +"""Run mini-swe-agent inside the sandbox. + +Input: task config JSON from **stdin** + - task: str — the issue description for the agent to solve + - gateway_url: str — LLM gateway endpoint (tunnel URL for OpenYuanRong sandbox) + - agent: dict — agent config (e.g. step_limit) + +Output: agent result JSON to **stdout**, or error JSON on failure +""" + +from __future__ import annotations + +import json +import os +import sys + +DEFAULT_ACTION_TIMEOUT = 600 + + +def _fail(msg: str, exit_status: str = "error") -> None: + """Write error result to stdout and exit.""" + sys.stdout.write(json.dumps({"exit_status": exit_status, "submission": "", "error": msg})) + sys.stdout.write("\n") + sys.stdout.flush() + + +def main() -> None: + try: + # 1. Read task config from stdin + config = json.load(sys.stdin) + task = config["task"] + gateway_url = config["gateway_url"] + + # 2. Load swebench defaults + from minisweagent.config import builtin_config_dir, get_config_from_spec + + swebench_cfg = get_config_from_spec(str(builtin_config_dir / "benchmarks" / "swebench.yaml")) + + # 3. Create LocalEnvironment (use swebench defaults) + from minisweagent.environments.local import LocalEnvironment + + env_cfg = dict(swebench_cfg.get("environment", {})) + env_cfg.pop("environment_class", None) + env_cfg["timeout"] = DEFAULT_ACTION_TIMEOUT + env_cfg.setdefault("env", {}) + env_cfg["env"].setdefault("GIT_PAGER", "cat") + for key in ("image", "container_timeout", "run_args", "executable", "pull_timeout", + "forward_env", "interpreter"): + env_cfg.pop(key, None) + env = LocalEnvironment(**env_cfg) + + # 4. Create LitellmModel pointing at gateway + from minisweagent.models.litellm_model import LitellmModel + + model_defaults = dict(swebench_cfg.get("model", {})) + model_defaults.pop("model_name", None) + model_defaults.pop("model_kwargs", None) + model_cfg = model_defaults + model_cfg.update({ + "model_name": "openai/default", + "model_kwargs": { + "api_base": gateway_url, + "api_key": "not-needed", + "drop_params": True, + }, + "cost_tracking": "ignore_errors", + }) + model = LitellmModel(**model_cfg) + + # 5. Create DefaultAgent + from minisweagent.agents.default import DefaultAgent + + agent_defaults = dict(swebench_cfg.get("agent", {})) + agent_overrides = config.get("agent", {}) + agent_defaults.update(agent_overrides) + agent_cfg = agent_defaults + step_limit = agent_cfg.get("step_limit", 100) + agent_cfg["step_limit"] = step_limit + agent = DefaultAgent(model, env, **agent_cfg) + + # 6. Run agent + try: + info = agent.run(task=task) + except Exception as e: + info = {"exit_status": type(e).__name__, "submission": str(e)} + + # 7. Write result to stdout + result = { + "exit_status": info.get("exit_status", "unknown"), + "submission": info.get("submission", ""), + "model_stats": { + "instance_cost": agent.cost, + "api_calls": agent.n_calls, + }, + } + sys.stdout.write(json.dumps(result, ensure_ascii=False)) + sys.stdout.write("\n") + sys.stdout.flush() + + except Exception as e: + _fail(str(e), exit_status=type(e).__name__) + + +if __name__ == "__main__": + main() diff --git a/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py b/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py new file mode 100644 index 00000000..b03dc7d7 --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py @@ -0,0 +1,61 @@ +"""Ray-based subprocess runner for agent_runner execution. + +Launches agent_runner in a separate Ray worker process to prevent blocking +operations (sleep, sync I/O, etc.) from stalling the framework's event loop. +""" + +from __future__ import annotations + +import asyncio +import logging +from typing import Any + +import ray + +from uni_agent.trainer.framework.types import SessionHandle + +logger = logging.getLogger(__name__) + + +class _StubSessionRuntime: + """Captures reward_info from agent_runner's complete_session call.""" + + def __init__(self): + self.reward_info: dict[str, Any] | None = None + + async def complete_session(self, session_id: str, reward_info: dict[str, Any] | None = None): + self.reward_info = reward_info + + +@ray.remote(num_cpus=0) +def remote_agent_run( + agent_runner_fqn: str, + raw_prompt, + session_id: str, + base_url: str, + sample_index: int, + runner_kwargs: dict, +) -> dict[str, Any] | None: + """Run agent_runner in a dedicated Ray worker process.""" + from verl.utils.import_utils import load_class_from_fqn + + agent_runner = load_class_from_fqn(agent_runner_fqn) + stub_runtime = _StubSessionRuntime() + handle = SessionHandle(session_id=session_id, base_url=base_url) + + async def _run(): + try: + await agent_runner( + raw_prompt=raw_prompt, + session=handle, + sample_index=sample_index, + session_runtime=stub_runtime, + **runner_kwargs, + ) + return stub_runtime.reward_info + except Exception as e: + logger.error("remote_agent_run failed: session_id=%s, sample=%d, error=%s", + session_id, sample_index, e, exc_info=True) + raise + + return asyncio.run(_run()) diff --git a/examples/blackbox_recipes/sandbox/sandbox.py b/examples/blackbox_recipes/sandbox/sandbox.py new file mode 100644 index 00000000..fb21ac94 --- /dev/null +++ b/examples/blackbox_recipes/sandbox/sandbox.py @@ -0,0 +1,10 @@ +"""OpenYuanRong (AKernel) remote sandbox command execution. + +Uses ``akernel_sdk.Sandbox`` with sidecar ``Mount`` to inject the +mini-swe-agent tool image. Supports upstream tunnel so the agent +inside the sandbox can reach the gateway via ``http://127.0.0.1:``. +""" + +#TODO + + diff --git a/examples/blackbox_recipes/scripts/build_tool.sh b/examples/blackbox_recipes/scripts/build_tool.sh new file mode 100755 index 00000000..e5158629 --- /dev/null +++ b/examples/blackbox_recipes/scripts/build_tool.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Build a SWE blackbox sidecar tool image. +# +# Usage: +# bash examples/swe_agent_blackbox/build_tool.sh +# bash examples/swe_agent_blackbox/build_tool.sh --tool claude_code +# bash examples/swe_agent_blackbox/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ +# bash examples/swe_agent_blackbox/build_tool.sh --npm-registry https://registry.npmmirror.com +# bash examples/swe_agent_blackbox/build_tool.sh --tool-version latest +# bash examples/swe_agent_blackbox/build_tool.sh --registry reg.antgroup-inc.cn/myrepo +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +TOOL_KIND="${TOOL_KIND:-mini_swe}" +IMAGE_TAG="${TOOL_TAG:-latest}" +TOOL_VERSION="${TOOL_VERSION:-latest}" + +# Parse args +REGISTRY="" +PIP_INDEX_URL="${PIP_INDEX_URL:-}" +NPM_REGISTRY="${NPM_REGISTRY:-}" +while [[ $# -gt 0 ]]; do + case "$1" in + --tool) TOOL_KIND="$2"; shift 2 ;; + --registry) REGISTRY="$2"; shift 2 ;; + --pip-index) PIP_INDEX_URL="$2"; shift 2 ;; + --npm-registry) NPM_REGISTRY="$2"; shift 2 ;; + --tool-version) TOOL_VERSION="$2"; shift 2 ;; + *) echo "Unknown arg: $1"; exit 1 ;; + esac +done + +BUILD_ARGS=() +DOCKERFILE="${SCRIPT_DIR}/Dockerfile.mini-swe-agent-tool" +if [[ "${TOOL_KIND}" == "claude" ]]; then + TOOL_KIND="claude_code" +fi +if [[ "${TOOL_KIND}" == "claude_code" ]]; then + IMAGE_NAME="${TOOL_IMAGE:-claude-code-tool}" + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.claude-code-tool" + BUILD_ARGS+=(--build-arg "TOOL_VERSION=${TOOL_VERSION}") + if [[ -n "${NPM_REGISTRY}" ]]; then + BUILD_ARGS+=(--build-arg "NPM_REGISTRY=${NPM_REGISTRY}") + fi +elif [[ "${TOOL_KIND}" == "mini_swe" ]]; then + IMAGE_NAME="${TOOL_IMAGE:-mini-swe-agent-tool}" + if [[ -n "${PIP_INDEX_URL}" ]]; then + BUILD_ARGS+=(--build-arg PIP_INDEX_URL="${PIP_INDEX_URL}") + fi +else + echo "Unknown tool: ${TOOL_KIND}; expected mini_swe or claude_code" + exit 1 +fi + +echo "==> Building ${TOOL_KIND} tool image: ${IMAGE_NAME}:${IMAGE_TAG}" +docker build \ + -f "${DOCKERFILE}" \ + -t "${IMAGE_NAME}:${IMAGE_TAG}" \ + "${BUILD_ARGS[@]}" \ + "${SCRIPT_DIR}/" + +if [[ -n "${REGISTRY}" ]]; then + FULL_TAG="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" + echo "==> Tagging and pushing: ${FULL_TAG}" + docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${FULL_TAG}" + docker push "${FULL_TAG}" + echo " Pushed." +fi + +echo "" +echo "Tool image ready: ${IMAGE_NAME}:${IMAGE_TAG}" +if [[ -n "${REGISTRY}" ]]; then + echo " Remote sandbox: ${FULL_TAG}" +fi diff --git a/examples/blackbox_recipes/scripts/run_infer.sh b/examples/blackbox_recipes/scripts/run_infer.sh new file mode 100755 index 00000000..d5703aa6 --- /dev/null +++ b/examples/blackbox_recipes/scripts/run_infer.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# Parallel inference for the blackbox SWE-agent recipe. +# +# Usage: +# bash examples/swe_agent_blackbox/scripts/run_infer.sh + +set -euo pipefail + +# ── Model & data ───────────────────────────────────────────────────────── +MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}" +DATA_PATH="${DATA_PATH:-$HOME/data/swe_agent/swe_bench_verified.parquet}" + +# ── Inference parameters ───────────────────────────────────────────────── +MAX_SAMPLES="${MAX_SAMPLES:--1}" +PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" +RESPONSE_LENGTH="${RESPONSE_LENGTH:-65536}" +TEMPERATURE="${TEMPERATURE:-1.0}" +TOP_P="${TOP_P:-1.0}" +N="${N:-8}" +ENGINE="${ENGINE:-vllm}" +TP="${TP:-4}" +N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-8}" +GATEWAY_COUNT="${GATEWAY_COUNT:-1}" +MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-2}" + +# ── Agent parameters ───────────────────────────────────────────────────── +RUNNER="${RUNNER:-uniagent}" +AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" +export SWE_AGENT_MAX_TURNS="${SWE_AGENT_MAX_TURNS:-100}" +export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" +SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-}" +SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" + +# ── Logging ────────────────────────────────────────────────────────────── +export VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}" +export ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.5}" + +echo "=== SWE-Agent Blackbox Inference ===" +echo "Model: ${MODEL_PATH}" +echo "Data: ${DATA_PATH}" +echo "Max samples: ${MAX_SAMPLES}" +echo "Engine: ${ENGINE} (TP=${TP})" +echo "Runner: ${RUNNER}" +echo "Gateway count: ${GATEWAY_COUNT}" +echo "Max concurrent sessions: ${MAX_CONCURRENT_SESSIONS}" +echo "=====================================" + +python examples/swe_agent_blackbox/parallel_infer.py \ + --model-path "${MODEL_PATH}" \ + --data-path "${DATA_PATH}" \ + --max-samples "${MAX_SAMPLES}" \ + --prompt-length "${PROMPT_LENGTH}" \ + --response-length "${RESPONSE_LENGTH}" \ + --temperature "${TEMPERATURE}" \ + --top-p "${TOP_P}" \ + --n "${N}" \ + --engine "${ENGINE}" \ + --tensor-parallel-size "${TP}" \ + --max-turns "${SWE_AGENT_MAX_TURNS}" \ + --runner "${RUNNER}" \ + --agent-config-path "${AGENT_CONFIG_PATH}" \ + --n-gpus-per-node "${N_GPUS_PER_NODE}" \ + --gateway-count "${GATEWAY_COUNT}" \ + --max-concurrent-sessions "${MAX_CONCURRENT_SESSIONS}" \ + --tool-image "${SWE_AGENT_TOOL_IMAGE}" \ + --run-timeout "${SWE_AGENT_RUN_TIMEOUT}" diff --git a/examples/blackbox_recipes/scripts/run_train.sh b/examples/blackbox_recipes/scripts/run_train.sh new file mode 100755 index 00000000..cf08005d --- /dev/null +++ b/examples/blackbox_recipes/scripts/run_train.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# Training launch script for the blackbox SWE-agent recipe. +# +# Uses GRPO + AgentFrameworkRolloutAdapter with reward computed in-process +# by the agent runner, then passed through the reward worker's compute_score. +# +# Usage: +# bash examples/swe_agent_blackbox/scripts/run_train.sh +# +# All configurable via environment variables (see defaults below). + +set -euo pipefail + +# ── Model & data ───────────────────────────────────────────────────────── +MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3-Coder-30B-A3B-Instruct}" +TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}" +VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}" + +# ── Hardware ───────────────────────────────────────────────────────────── +NNODES="${NNODES:-1}" +NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}" + +# ── Training parameters ───────────────────────────────────────────────── +TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}" +PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" +RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" +ACTOR_LR="${ACTOR_LR:-1e-6}" +TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}" +SAVE_FREQ="${SAVE_FREQ:-10}" +TEST_FREQ="${TEST_FREQ:-10}" + +# ── Rollout parameters ────────────────────────────────────────────────── +ENGINE="${ENGINE:-vllm}" +TP="${TP:-4}" +ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" +N="${N:-8}" +TEMPERATURE="${TEMPERATURE:-1.0}" + +# ── Agent parameters ───────────────────────────────────────────────────── +RUNNER="${RUNNER:-mini_swe}" +MAX_TURNS="${MAX_TURNS:-100}" +AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" +COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}" +if [[ "${RUNNER}" == "claude_code" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner" + SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}" +elif [[ "${RUNNER}" == "mini_swe" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner" + SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" +elif [[ "${RUNNER}" == "uniagent" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner" + SWE_AGENT_TOOL_IMAGE="" +else + echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2 + exit 1 +fi +SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" +RUNNER_ARGS=( + "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}" +) +if [[ "${RUNNER}" != "uniagent" ]]; then + RUNNER_ARGS+=( + "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}" + "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}" + ) +fi + +# ── Logging ────────────────────────────────────────────────────────────── +PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}" +EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}" +VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}" + +export SWE_AGENT_MAX_TURNS="${MAX_TURNS}" +export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" +export VERL_LOGGING_LEVEL + +# ── Environment for NCCL ───────────────────────────────────────────────── +export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}" +export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}" + +echo "=== SWE-Agent Blackbox Training ===" +echo "Model: ${MODEL_PATH}" +echo "Train data: ${TRAIN_DATA}" +echo "Val data: ${VAL_DATA}" +echo "Engine: ${ENGINE} (TP=${TP})" +echo "Runner: ${RUNNER}" +echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}" +echo "Epochs: ${TOTAL_EPOCHS}" +echo "=====================================" + +python3 -m verl.trainer.main_ppo_sync \ + --config-name=swe_agent_blackbox \ + --config-path="$(pwd)/examples/swe_agent_blackbox/config" \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + data.train_files="['${TRAIN_DATA}']" \ + data.val_files="['${VAL_DATA}']" \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ + data.max_prompt_length=${PROMPT_LENGTH} \ + data.max_response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.name=${ENGINE} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \ + actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \ + actor_rollout_ref.rollout.n=${N} \ + actor_rollout_ref.rollout.temperature=${TEMPERATURE} \ + actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \ + actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH + 1024)) \ + actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \ + actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \ + actor_rollout_ref.rollout.nnodes=${NNODES} \ + actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \ + trainer.nnodes=${NNODES} \ + trainer.n_gpus_per_node=${NGPUS_PER_NODE} \ + trainer.total_epochs=${TOTAL_EPOCHS} \ + trainer.save_freq=${SAVE_FREQ} \ + trainer.test_freq=${TEST_FREQ} \ + trainer.project_name=${PROJECT_NAME} \ + trainer.experiment_name=${EXPERIMENT_NAME} \ + actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \ + actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \ + "${RUNNER_ARGS[@]}" \ + "$@" diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_async.sh b/examples/blackbox_recipes/scripts/run_train_megatron_async.sh new file mode 100755 index 00000000..db3a8264 --- /dev/null +++ b/examples/blackbox_recipes/scripts/run_train_megatron_async.sh @@ -0,0 +1,199 @@ +#!/usr/bin/env bash +# Megatron + TQ fully-async training for the blackbox SWE-agent recipe. +# +# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend. +# Data flows through TransferQueue (zero-copy) with ReplayBuffer flow control. +# +# Usage: +# bash examples/swe_agent_blackbox/scripts/run_train_megatron_async.sh +# +# All configurable via environment variables (see defaults below). + +set -euo pipefail + +# ── Model & data ───────────────────────────────────────────────────────── +MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}" +TRAIN_DATA="${TRAIN_DATA:-${HOME}/data/swe_agent/swe_rebench_filtered.parquet}" +VAL_DATA="${VAL_DATA:-${HOME}/data/swe_agent/swe_bench_verified.parquet}" +RUNTIME_ENV="${RUNTIME_ENV:-}" + +# ── Hardware ───────────────────────────────────────────────────────────── +NNODES_TRAIN="${NNODES_TRAIN:-1}" +NNODES_ROLLOUT="${NNODES_ROLLOUT:-1}" +NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}" + +# ── Algorithm ──────────────────────────────────────────────────────────── +CLIP_RATIO_LOW="${CLIP_RATIO_LOW:-0.2}" +CLIP_RATIO_HIGH="${CLIP_RATIO_HIGH:-0.28}" +ACTOR_LR="${ACTOR_LR:-1e-6}" + +# ── Sequence lengths ───────────────────────────────────────────────────── +PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" +RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" +MAX_MODEL_LEN=$((PROMPT_LENGTH + RESPONSE_LENGTH)) + +# ── Rollout parameters ─────────────────────────────────────────────────── +ENGINE="${ENGINE:-vllm}" +GEN_TP="${GEN_TP:-2}" +N="${N:-8}" +TEMPERATURE="${TEMPERATURE:-1.0}" +ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" + +# ── Megatron training parallelism ──────────────────────────────────────── +TRAIN_TP="${TRAIN_TP:-8}" +TRAIN_PP="${TRAIN_PP:-1}" +TRAIN_CP="${TRAIN_CP:-1}" +OFFLOAD="${OFFLOAD:-True}" +OPTIMIZER_OFFLOAD_FRACTION="${OFFLOAD_FRACTION:-1.0}" +USE_MBRIDGE="${USE_MBRIDGE:-True}" +PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}" + +# ── Agent parameters ───────────────────────────────────────────────────── +RUNNER="${RUNNER:-mini_swe}" +MAX_TURNS="${MAX_TURNS:-100}" +AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" +COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}" +if [[ "${RUNNER}" == "claude_code" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner" + SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}" +elif [[ "${RUNNER}" == "mini_swe" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner" + SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" +elif [[ "${RUNNER}" == "uniagent" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner" + SWE_AGENT_TOOL_IMAGE="" +else + echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2 + exit 1 +fi +SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" +CONDA_ENV="${CONDA_ENV:-testbed}" +RUNNER_ARGS=( + "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}" +) +if [[ "${RUNNER}" != "uniagent" ]]; then + RUNNER_ARGS+=( + "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}" + "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}" + "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.conda_env=${CONDA_ENV}" + ) +fi + +# ── OpenYuanRong (YR remote sandbox) ───────────────────────────────────── +OPENYUANRONG_SERVER_ADDRESS="${OPENYUANRONG_SERVER_ADDRESS:-}" +OPENYUANRONG_TOKEN="${OPENYUANRONG_TOKEN:-}" +OPENYUANRONG_TUNNEL_SSL_VERIFY="${OPENYUANRONG_TUNNEL_SSL_VERIFY:-0}" + +# ── Async training ─────────────────────────────────────────────────────── +TOTAL_ROLLOUT_STEPS="${TOTAL_ROLLOUT_STEPS:-100000}" +STALENESS_THRESHOLD="${STALENESS_THRESHOLD:-1.0}" +TRIGGER_SYNC_STEP="${TRIGGER_SYNC_STEP:-4}" +PARTIAL_ROLLOUT="${PARTIAL_ROLLOUT:-True}" + +# ── Logging & checkpointing ────────────────────────────────────────────── +PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}" +EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}" +SAVE_FREQ="${SAVE_FREQ:-10}" +TEST_FREQ="${TEST_FREQ:-10}" +CKPTS_DIR="${CKPTS_DIR:-checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}" + +export SWE_AGENT_MAX_TURNS="${MAX_TURNS}" +export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" +export OPENYUANRONG_SERVER_ADDRESS +export OPENYUANRONG_TOKEN +export OPENYUANRONG_TUNNEL_SSL_VERIFY + +echo "=== SWE-Agent Blackbox Megatron Async Training ===" +echo "Model: ${MODEL_PATH}" +echo "Train data: ${TRAIN_DATA}" +echo "Val data: ${VAL_DATA}" +echo "Engine: ${ENGINE} (gen_tp=${GEN_TP}, train_tp=${TRAIN_TP})" +echo "Runner: ${RUNNER}" +echo "Batch: n=${N}, mini_bsz=${PPO_MINI_BATCH_SIZE}" +echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}" +echo "Nodes: train=${NNODES_TRAIN}, rollout=${NNODES_ROLLOUT}" +echo "===================================================" + +# ── Compute derived parameters ─────────────────────────────────────────── +ACTOR_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP )) +INFER_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP )) + +RUNTIME_ENV_ARGS=() +if [ -n "${RUNTIME_ENV}" ]; then + RUNTIME_ENV_ARGS=(--runtime-env "${RUNTIME_ENV}") +fi + +# ── Ensure Ray is running ──────────────────────────────────────────────── +TOTAL_GPUS=$(( (NNODES_TRAIN + NNODES_ROLLOUT) * NGPUS_PER_NODE )) +if ! ray status &>/dev/null; then + echo "Starting Ray cluster (${TOTAL_GPUS} GPUs)..." + ray start --head --num-gpus="${TOTAL_GPUS}" --disable-usage-stats +else + echo "Ray cluster already running." +fi + +# ── Launch ──────────────────────────────────────────────────────────────── +WORKING_DIR="${WORKING_DIR:-$(pwd)}" + +ray job submit --no-wait --working-dir="${WORKING_DIR}" "${RUNTIME_ENV_ARGS[@]}" \ + -- python3 -m verl.experimental.fully_async_policy.fully_async_main \ + --config-name=swe_agent_blackbox_megatron_async \ + --config-path="$(pwd)/examples/swe_agent_blackbox/config" \ + hydra.searchpath=[pkg://verl.trainer.config] \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + data.train_files="['${TRAIN_DATA}']" \ + data.val_files="['${VAL_DATA}']" \ + data.max_prompt_length=${PROMPT_LENGTH} \ + data.max_response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.n=${N} \ + actor_rollout_ref.rollout.name=${ENGINE} \ + actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \ + actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.max_model_len=${MAX_MODEL_LEN} \ + actor_rollout_ref.rollout.max_num_batched_tokens=${MAX_MODEL_LEN} \ + actor_rollout_ref.rollout.temperature=${TEMPERATURE} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP} \ + actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \ + actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \ + actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \ + actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \ + "${RUNNER_ARGS[@]}" \ + actor_rollout_ref.actor.clip_ratio_low=${CLIP_RATIO_LOW} \ + actor_rollout_ref.actor.clip_ratio_high=${CLIP_RATIO_HIGH} \ + actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ACTOR_PPO_MAX_TOKEN_LEN} \ + actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \ + actor_rollout_ref.actor.optim.lr_decay_steps=${TOTAL_ROLLOUT_STEPS} \ + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${OPTIMIZER_OFFLOAD_FRACTION} \ + +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \ + +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \ + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \ + actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \ + actor_rollout_ref.ref.megatron.param_offload=${OFFLOAD} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TRAIN_TP} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${TRAIN_CP} \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \ + trainer.project_name="${PROJECT_NAME}" \ + trainer.experiment_name="${EXPERIMENT_NAME}" \ + trainer.save_freq=${SAVE_FREQ} \ + trainer.test_freq=${TEST_FREQ} \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.nnodes=${NNODES_TRAIN} \ + trainer.n_gpus_per_node=${NGPUS_PER_NODE} \ + rollout.nnodes=${NNODES_ROLLOUT} \ + rollout.n_gpus_per_node=${NGPUS_PER_NODE} \ + rollout.total_rollout_steps=${TOTAL_ROLLOUT_STEPS} \ + async_training.staleness_threshold=${STALENESS_THRESHOLD} \ + async_training.trigger_parameter_sync_step=${TRIGGER_SYNC_STEP} \ + async_training.partial_rollout=${PARTIAL_ROLLOUT} \ + "$@" diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh b/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh new file mode 100755 index 00000000..1a0c19d3 --- /dev/null +++ b/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +# Megatron sync training for the blackbox SWE-agent recipe. +# +# Uses main_ppo_sync + Megatron backend with the same blackbox agent infrastructure +# (AgentFrameworkRolloutAdapter, subprocess_runner, SWEAgentFramework). +# +# Usage: +# bash examples/swe_agent_blackbox/scripts/run_train_megatron_sync.sh +# +# All configurable via environment variables (see defaults below). + +set -euo pipefail + +# ── Model & data ───────────────────────────────────────────────────────── +MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}" +TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_rebench_filtered.parquet}" +VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}" + +# ── Hardware ───────────────────────────────────────────────────────────── +NNODES="${NNODES:-1}" +NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}" + +# ── Training parameters ───────────────────────────────────────────────── +TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}" +PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" +RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" +ACTOR_LR="${ACTOR_LR:-1e-6}" +TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}" +SAVE_FREQ="${SAVE_FREQ:-10}" +TEST_FREQ="${TEST_FREQ:-10}" +PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}" + +# ── Rollout parameters ────────────────────────────────────────────────── +ENGINE="${ENGINE:-vllm}" +TP="${TP:-4}" +ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" +N="${N:-8}" +TEMPERATURE="${TEMPERATURE:-1.0}" + +# ── Megatron parallelism ──────────────────────────────────────────────── +TRAIN_TP="${TRAIN_TP:-8}" +TRAIN_PP="${TRAIN_PP:-1}" +TRAIN_CP="${TRAIN_CP:-1}" +OFFLOAD="${OFFLOAD:-true}" +USE_MBRIDGE="${USE_MBRIDGE:-true}" + +# ── Agent parameters ───────────────────────────────────────────────────── +RUNNER="${RUNNER:-mini_swe}" +MAX_TURNS="${MAX_TURNS:-100}" +AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" +COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}" +if [[ "${RUNNER}" == "claude_code" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner" + SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}" +elif [[ "${RUNNER}" == "mini_swe" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner" + SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" +elif [[ "${RUNNER}" == "uniagent" ]]; then + AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner" + SWE_AGENT_TOOL_IMAGE="" +else + echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2 + exit 1 +fi +SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" +RUNNER_ARGS=( + "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}" +) +if [[ "${RUNNER}" != "uniagent" ]]; then + RUNNER_ARGS+=( + "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}" + "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}" + ) +fi + +# ── Logging ────────────────────────────────────────────────────────────── +PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}" +EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}" +VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}" + +export SWE_AGENT_MAX_TURNS="${MAX_TURNS}" +export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" +export VERL_LOGGING_LEVEL + +# ── Environment for NCCL ──────────────────────────────────────────────── +export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}" +export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}" + +echo "=== SWE-Agent Blackbox Megatron Sync Training ===" +echo "Model: ${MODEL_PATH}" +echo "Train data: ${TRAIN_DATA}" +echo "Val data: ${VAL_DATA}" +echo "Engine: ${ENGINE} (gen_tp=${TP}, train_tp=${TRAIN_TP})" +echo "Runner: ${RUNNER}" +echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}" +echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}" +echo "===============================================" + +python3 -m verl.trainer.main_ppo_sync \ + --config-name=swe_agent_blackbox_megatron_sync \ + --config-path="$(pwd)/examples/swe_agent_blackbox/config" \ + hydra.searchpath=[pkg://verl.trainer.config] \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + data.train_files="['${TRAIN_DATA}']" \ + data.val_files="['${VAL_DATA}']" \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ + data.max_prompt_length=${PROMPT_LENGTH} \ + data.max_response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.name=${ENGINE} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \ + actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \ + actor_rollout_ref.rollout.n=${N} \ + actor_rollout_ref.rollout.temperature=${TEMPERATURE} \ + actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \ + actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH)) \ + actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \ + actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \ + actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \ + actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \ + actor_rollout_ref.rollout.nnodes=${NNODES} \ + actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \ + trainer.nnodes=${NNODES} \ + trainer.n_gpus_per_node=${NGPUS_PER_NODE} \ + trainer.total_epochs=${TOTAL_EPOCHS} \ + trainer.save_freq=${SAVE_FREQ} \ + trainer.test_freq=${TEST_FREQ} \ + trainer.project_name=${PROJECT_NAME} \ + trainer.experiment_name=${EXPERIMENT_NAME} \ + actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \ + actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \ + "${RUNNER_ARGS[@]}" \ + "$@" From a28540a4452be30d2d5284b2c71c7c0b3f3354e9 Mon Sep 17 00:00:00 2001 From: sheng Date: Fri, 26 Jun 2026 14:18:43 +0800 Subject: [PATCH 2/3] feat: add openyuanrong sandbox --- examples/blackbox_recipes/sandbox/sandbox.py | 180 ++++++++++++++++++- 1 file changed, 179 insertions(+), 1 deletion(-) diff --git a/examples/blackbox_recipes/sandbox/sandbox.py b/examples/blackbox_recipes/sandbox/sandbox.py index fb21ac94..e6e46a8d 100644 --- a/examples/blackbox_recipes/sandbox/sandbox.py +++ b/examples/blackbox_recipes/sandbox/sandbox.py @@ -5,6 +5,184 @@ inside the sandbox can reach the gateway via ``http://127.0.0.1:``. """ -#TODO +from __future__ import annotations +import asyncio +import logging +import os +from dataclasses import dataclass +from typing import Any +from urllib.parse import urlparse + +@dataclass +class CommandResult: + """Result of a command executed inside a sandbox.""" + + stdout: str + stderr: str + exit_code: int + +logger = logging.getLogger(__name__) + +DEFAULT_PROXY_PORT = 38197 + + +def _configure_akernel_env() -> None: + """Map OPENYUANRONG_* env vars to AKERNEL_* before importing akernel_sdk.""" + server = os.getenv("OPENYUANRONG_SERVER_ADDRESS") + token = os.getenv("OPENYUANRONG_TOKEN") + tunnel_ssl_verify = os.getenv("OPENYUANRONG_TUNNEL_SSL_VERIFY", "0") + if not server or not token: + raise ValueError( + "OPENYUANRONG_SERVER_ADDRESS and OPENYUANRONG_TOKEN " + "environment variables must be set for YR sandbox" + ) + os.environ["AKERNEL_SERVER_ADDRESS"] = server + os.environ["AKERNEL_TOKEN"] = token + os.environ["TUNNEL_SSL_VERIFY"] = tunnel_ssl_verify + + +def extract_upstream(gateway_url: str) -> str: + """Extract host:port from a gateway URL for upstream tunnel config. + + Example: "http://8.92.9.155:40169/sessions/abc/v1" -> "8.92.9.155:40169" + """ + parsed = urlparse(gateway_url) + return f"{parsed.hostname}:{parsed.port}" + + +def rewrite_gateway_url( + gateway_url: str, + proxy_port: int = DEFAULT_PROXY_PORT, + *, + strip_v1: bool = False, +) -> str: + """Rewrite gateway URL to use the sandbox-internal tunnel. + + Replaces host:port with 127.0.0.1:, keeps path intact. + + Example: + "http://8.92.9.155:40169/sessions/abc/v1" + -> "http://127.0.0.1:8766/sessions/abc/v1" + """ + parsed = urlparse(gateway_url) + path = parsed.path.removesuffix("/v1") if strip_v1 else parsed.path + return f"http://127.0.0.1:{proxy_port}{path}" + + +class YRSandbox: + """Command execution via OpenYuanRong (AKernel) remote sandbox.""" + + def __init__(self, sandbox: Any) -> None: + self._sandbox = sandbox + + @property + def sandbox_id(self) -> str: + return getattr(self._sandbox, "sandbox_id", "unknown") + + + @classmethod + async def create( + cls, + *, + image: str, + sidecar_image: str, + upstream: str = "", + proxy_port: int = DEFAULT_PROXY_PORT, + env: dict[str, str] | None = None, + cpu: int = 1000, + memory: int = 2048, + cpu_limit: int = 4000, + mem_limit: int = 8192, + idle_timeout: int = 7200, + sidecar_target: str = "/opt/mini-swe-agent", + max_retries: int = 5, + **sandbox_kwargs: Any, + ) -> "YRSandbox": + """Create an OpenYuanRong sandbox with sidecar tool mounted. + + The sidecar image is mounted at ``sidecar_target`` inside the + sandbox via ``akernel_sdk.Mount``. + + If ``upstream`` is provided, a tunnel is set up so the sandbox can + reach the local gateway via ``http://127.0.0.1:``. + """ + _configure_akernel_env() + from akernel_sdk import Mount, Sandbox + + sb_kwargs: dict[str, Any] = { + "image": image, + "cpu": cpu, + "memory": memory, + "cpu_limit": cpu_limit, + "mem_limit": mem_limit, + "idle_timeout": idle_timeout, + "mounts": [ + Mount(target=sidecar_target, image_url=sidecar_image), + ], + } + if upstream: + sb_kwargs["upstream"] = upstream + sb_kwargs["proxy_port"] = proxy_port + if env: + sb_kwargs["env"] = env + sb_kwargs.update(sandbox_kwargs) + + logger.info( + "Creating YR sandbox (image=%s, cpu=%d, memory=%d, sidecar=%s:%s, upstream=%s)", + image, cpu, memory, sidecar_image, sidecar_target, upstream or "none", + ) + last_error: Exception | None = None + for retry in range(max_retries): + sandbox = None + try: + sandbox = await asyncio.to_thread(lambda: Sandbox(**sb_kwargs)) + logger.info("YR sandbox created: %s", getattr(sandbox, "sandbox_id", "?")) + return cls(sandbox=sandbox) + except Exception as exc: + last_error = exc + sandbox_id = getattr(sandbox, "sandbox_id", None) + logger.critical( + "Failed to create YR sandbox (sandbox_id=%s): %s", + sandbox_id or "n/a", exc, + ) + if sandbox is not None: + try: + await asyncio.to_thread(sandbox.kill) + except Exception: + pass + if retry < max_retries - 1: + sleep_time = min(30, 2 ** retry) + logger.info("Retrying YR sandbox creation in %d seconds...", sleep_time) + await asyncio.sleep(sleep_time) + + raise RuntimeError(f"Failed to create YR sandbox after {max_retries} retries") from last_error + + async def run(self, cmd: str, *, timeout: int = 600) -> CommandResult: + """Execute *cmd* inside the OpenYuanRong sandbox via ``sandbox.commands.run``.""" + try: + result = await asyncio.to_thread( + self._sandbox.commands.run, cmd, timeout=timeout, + ) + return CommandResult( + stdout=getattr(result, "stdout", ""), + stderr=getattr(result, "stderr", ""), + exit_code=getattr(result, "exit_code", -1), + ) + except Exception as e: + return CommandResult(stdout="", stderr=str(e), exit_code=-1) + + async def cleanup(self) -> None: + """Kill the OpenYuanRong sandbox if still running.""" + if self._sandbox is not None: + sandbox_id = getattr(self._sandbox, "sandbox_id", "?") + try: + if self._sandbox.is_running(): + await asyncio.to_thread(self._sandbox.kill) + logger.info("YR sandbox %s killed", sandbox_id) + else: + logger.info("YR sandbox %s already stopped", sandbox_id) + except Exception as e: + logger.warning("Failed to kill YR sandbox %s: %s", sandbox_id, e) + self._sandbox = None From 6de203b8a6a1fe1b07137c6dce8c81e73508267e Mon Sep 17 00:00:00 2001 From: zhaizhiqiang <584508161@qq.com> Date: Mon, 29 Jun 2026 03:30:36 +0000 Subject: [PATCH 3/3] 9235bf603757c85c41deef22fe93490a1b5f0921 --- .../claude_code/Dockerfile.claude-code-tool | 21 - .../claude_code/claude_code_runner.py | 232 --------- .../claude_code/config/claude_code.yaml | 1 - .../blackbox_recipes/mini_swe_agent/README.md | 248 ++------- .../mini_swe_agent/build_tool.sh | 56 ++ .../mini_swe_agent/config/agent_config.yaml | 36 -- .../config/agent_config_openyuanrong.yaml | 37 -- .../mini_swe_agent/config/parallel_infer.yaml | 31 -- .../config/swe_agent_blackbox.yaml | 123 ----- .../swe_agent_blackbox_megatron_sync.yaml | 129 ----- ...ml => swe_agent_blackbox_megatron_v1.yaml} | 70 +-- .../mini_swe_agent/dataset.py | 1 - .../mini_swe_agent/framework.py | 105 ---- .../mini_swe_agent/mini_swe_agent_runner.py | 69 ++- .../mini_swe_agent/parallel_infer.py | 486 ++++++++---------- .../blackbox_recipes/mini_swe_agent/reward.py | 2 +- .../mini_swe_agent/run_agent.py | 34 +- .../mini_swe_agent/run_infer.sh | 80 +++ .../mini_swe_agent/run_train.sh | 300 +++++++++++ .../mini_swe_agent/subprocess_runner.py | 61 --- .../{sandbox/sandbox.py => sandbox_client.py} | 85 +-- .../blackbox_recipes/scripts/build_tool.sh | 75 --- .../blackbox_recipes/scripts/run_infer.sh | 66 --- .../blackbox_recipes/scripts/run_train.sh | 122 ----- .../scripts/run_train_megatron_async.sh | 199 ------- .../scripts/run_train_megatron_sync.sh | 138 ----- .../r2e_gym_subset_filtered.py | 7 + .../data_preprocess/swe_bench_verified.py | 9 + examples/data_preprocess/swe_rebench.py | 8 + uni_agent/gateway/session/codec.py | 4 +- uni_agent/interaction/model.py | 4 +- verl | 2 +- 32 files changed, 881 insertions(+), 1960 deletions(-) delete mode 100644 examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool delete mode 100644 examples/blackbox_recipes/claude_code/claude_code_runner.py delete mode 100644 examples/blackbox_recipes/claude_code/config/claude_code.yaml create mode 100755 examples/blackbox_recipes/mini_swe_agent/build_tool.sh delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml rename examples/blackbox_recipes/mini_swe_agent/config/{swe_agent_blackbox_megatron_async.yaml => swe_agent_blackbox_megatron_v1.yaml} (62%) delete mode 100644 examples/blackbox_recipes/mini_swe_agent/framework.py create mode 100755 examples/blackbox_recipes/mini_swe_agent/run_infer.sh create mode 100755 examples/blackbox_recipes/mini_swe_agent/run_train.sh delete mode 100644 examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py rename examples/blackbox_recipes/{sandbox/sandbox.py => sandbox_client.py} (66%) delete mode 100755 examples/blackbox_recipes/scripts/build_tool.sh delete mode 100755 examples/blackbox_recipes/scripts/run_infer.sh delete mode 100755 examples/blackbox_recipes/scripts/run_train.sh delete mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_async.sh delete mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_sync.sh diff --git a/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool b/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool deleted file mode 100644 index 3d12af4c..00000000 --- a/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool +++ /dev/null @@ -1,21 +0,0 @@ -# Claude Code sidecar tool image. -# -# Mounted at /opt/claude-code inside the SWE-bench sandbox. - -FROM node:20-bookworm-slim AS builder - -ARG TOOL_VERSION="latest" -ARG NPM_REGISTRY="" - -ENV DISABLE_AUTOUPDATER=1 \ - IS_SANDBOX=1 \ - npm_config_audit=false \ - npm_config_fund=false \ - npm_config_update_notifier=false - -RUN if [ -n "${NPM_REGISTRY}" ]; then npm config set registry "${NPM_REGISTRY}"; fi \ - && npm install -g --prefix /opt/claude-code "@anthropic-ai/claude-code@${TOOL_VERSION}" \ - && /opt/claude-code/bin/claude --version - -FROM scratch -COPY --from=builder /opt/claude-code / diff --git a/examples/blackbox_recipes/claude_code/claude_code_runner.py b/examples/blackbox_recipes/claude_code/claude_code_runner.py deleted file mode 100644 index bee41aaf..00000000 --- a/examples/blackbox_recipes/claude_code/claude_code_runner.py +++ /dev/null @@ -1,232 +0,0 @@ -"""Claude Code runner for the blackbox SWE-agent recipe.""" - -from __future__ import annotations - -import json -import logging -import os -import shlex -import time - -from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime - -logger = logging.getLogger(__name__) - -DEFAULT_TOOL_IMAGE = "claude-code-tool:latest" -TOOL_TARGET = "/opt/claude-code" - - -def extract_task(raw_prompt) -> str: - if isinstance(raw_prompt, str): - return raw_prompt - return next( - (m["content"] for m in raw_prompt if isinstance(m, dict) and m.get("role") == "user"), - str(raw_prompt), - ) - - -def _extract_issue_text(task: str) -> str: - start = task.find("") - end = task.find("") - if start >= 0 and end > start: - return task[start + len(""):end].strip() - marker = "\nFollow these steps to resolve the issue:" - if marker in task: - return task.split(marker, 1)[0].strip() - return task.strip() - - -def _decode_metadata_list(value) -> list[str]: - if not value: - return [] - if isinstance(value, list): - return [str(item) for item in value] - if isinstance(value, str): - try: - parsed = json.loads(value) - except json.JSONDecodeError: - return [value] - if isinstance(parsed, list): - return [str(item) for item in parsed] - return [str(value)] - - -def build_claude_task(raw_prompt, tools_kwargs: dict | None = None) -> str: - tools_kwargs = tools_kwargs or {} - task = extract_task(raw_prompt) - metadata = ((tools_kwargs.get("reward") or {}).get("metadata") or {}) - issue = metadata.get("problem_statement") or _extract_issue_text(task) - tests = _decode_metadata_list(metadata.get("FAIL_TO_PASS")) - if not tests: - tests = _decode_metadata_list(metadata.get("PASS_TO_PASS"))[:3] - tests_block = "\n".join(f"- {test}" for test in tests) if tests else "- Run the closest relevant tests you identify." - - return ( - "You are fixing a SWE-bench task in /testbed.\n\n" - "Issue:\n" - f"{issue}\n\n" - "Rules:\n" - "- Edit source files only. Do not modify tests.\n" - "- The development environment is already installed; do not install packages unless a test command proves it is necessary.\n" - "- There is no submit tool in this environment. Do not try to submit.\n" - "- Do not create extra edge-case test files after the relevant tests pass.\n" - "- Do not run `pytest --collect-only`, `git log`, or any other command that does not directly validate the fix.\n" - "- Do not analyze unrelated `is_separable` behavior.\n" - "- Do not run additional ad-hoc verification after the listed relevant pytest command passes.\n" - "- Do not commit.\n" - "- After the minimal fix is applied and a relevant pytest command passes, print a one-line summary and exit immediately.\n\n" - "Relevant tests to run after the fix:\n" - f"{tests_block}\n" - ) - - -def build_claude_command( - *, - task: str, - base_url: str, - max_turns: int, - model: str = "default", - permission_mode: str = "bypassPermissions", - conda_env: str | None = "testbed", - disable_web_tools: bool = True, - disable_slash_commands: bool = True, -) -> str: - env = { - "ANTHROPIC_BASE_URL": base_url, - "ANTHROPIC_API_KEY": "not-needed", - "ANTHROPIC_MODEL": model, - "ANTHROPIC_DEFAULT_HAIKU_MODEL": model, - "ANTHROPIC_DEFAULT_SONNET_MODEL": model, - "ANTHROPIC_DEFAULT_OPUS_MODEL": model, - "ANTHROPIC_SMALL_FAST_MODEL": model, - "CLAUDE_CODE_DISABLE_BACKGROUND_TASKS": "1", - "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", - "CLAUDE_CODE_FORK_SUBAGENT": "0", - "CLAUDE_CODE_SUBAGENT_MODEL": model, - "DISABLE_AUTOUPDATER": "1", - "IS_SANDBOX": "1", - } - env_assignments = [f"{key}={shlex.quote(value)}" for key, value in env.items()] - if conda_env: - conda_prefix = f"/opt/miniconda3/envs/{conda_env}" - env_assignments.extend( - [ - f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)}", - f"CONDA_PREFIX={shlex.quote(conda_prefix)}", - f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH", - ] - ) - env_prefix = " ".join(env_assignments) - argv = [ - "/opt/claude-code/bin/claude", - "-p", - task, - "--model", - model, - "--max-turns", - str(max_turns), - "--permission-mode", - permission_mode, - ] - if disable_slash_commands: - argv.append("--disable-slash-commands") - if disable_web_tools: - argv.extend(["--disallowedTools", "Agent", "Task", "WebFetch", "WebSearch"]) - return ( - "unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy; " - "cd /testbed; " - f"{env_prefix} " - + shlex.join(argv) - ) - - -async def _create_claude_sandbox( - *, - image: str, - sidecar_image: str, - gateway_url: str, -): - from examples.swe_agent_blackbox.sandbox import YRSandbox, extract_upstream - - upstream = extract_upstream(gateway_url) if gateway_url else "" - return await YRSandbox.create( - image=image, - sidecar_image=sidecar_image, - sidecar_target=TOOL_TARGET, - upstream=upstream, - ) - - -async def claude_code_runner( - *, - raw_prompt, - session: SessionHandle, - sample_index: int, - session_runtime: SessionRuntime, - tools_kwargs: dict | None = None, - tool_image: str = DEFAULT_TOOL_IMAGE, - run_timeout: int = 7200, - **kwargs, -) -> None: - from examples.swe_agent_blackbox.dataset import extract_image - from examples.swe_agent_blackbox.mini_swe_agent_runner import SandboxEnvForReward - from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env - - tools_kwargs = tools_kwargs or {} - task = build_claude_task(raw_prompt, tools_kwargs) - env_config = tools_kwargs.get("env", {}) - image = extract_image(env_config) - if not image: - raise ValueError(f"No Docker image found in tools_kwargs.env for sample {sample_index}") - - gateway_url = session.base_url - if not gateway_url: - raise ValueError(f"gateway_url is empty for sample {sample_index}") - - sandbox = await _create_claude_sandbox( - image=image, - sidecar_image=tool_image, - gateway_url=gateway_url, - ) - - try: - post_setup_cmd = env_config.get("post_setup_cmd", "") - if post_setup_cmd: - setup_result = await sandbox.run(post_setup_cmd, timeout=120) - if setup_result.exit_code != 0: - logger.warning("post_setup_cmd failed rc=%s: %.300s", setup_result.exit_code, setup_result.stdout + setup_result.stderr) - - from examples.swe_agent_blackbox.sandbox import rewrite_gateway_url - - claude_base_url = rewrite_gateway_url(gateway_url, strip_v1=True) - max_turns = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100")) - agent_cmd = build_claude_command( - task=task, - base_url=claude_base_url, - max_turns=max_turns, - ) - - started_at = time.perf_counter() - result = await sandbox.run(agent_cmd, timeout=int(run_timeout)) - elapsed = time.perf_counter() - started_at - logger.info("[sample %d] claude-code finished rc=%s elapsed=%.1fs", sample_index, result.exit_code, elapsed) - if result.exit_code != 0: - logger.warning( - "[sample %d] claude-code failed stdout_tail=%r stderr_tail=%r", - sample_index, - (result.stdout or "")[-4000:], - (result.stderr or "")[-4000:], - ) - - metadata, eval_timeout = build_reward_context(tools_kwargs) - score, eval_result = await evaluate_in_env(SandboxEnvForReward(sandbox), metadata, eval_timeout) - logger.info("[sample %d] reward done score=%s resolved=%s", sample_index, score, eval_result.get("resolved")) - - reward_info = { - "reward_score": score, - "claude_code_exit_code": result.exit_code, - **eval_result, - } - await session_runtime.complete_session(session.session_id, reward_info=reward_info) - finally: - await sandbox.cleanup() diff --git a/examples/blackbox_recipes/claude_code/config/claude_code.yaml b/examples/blackbox_recipes/claude_code/config/claude_code.yaml deleted file mode 100644 index 503fa1da..00000000 --- a/examples/blackbox_recipes/claude_code/config/claude_code.yaml +++ /dev/null @@ -1 +0,0 @@ -#TODO \ No newline at end of file diff --git a/examples/blackbox_recipes/mini_swe_agent/README.md b/examples/blackbox_recipes/mini_swe_agent/README.md index b32a637a..436d3c65 100644 --- a/examples/blackbox_recipes/mini_swe_agent/README.md +++ b/examples/blackbox_recipes/mini_swe_agent/README.md @@ -2,268 +2,118 @@ ## Overview -`mini_swe` and `claude_code` both run inside the SWE-bench sandbox through a -sidecar tool image. The external runner creates the sandbox, mounts the selected -tool image, starts the agent process, and evaluates the reward in the same -sandbox. - -For `mini_swe`, the agent executes commands through `LocalEnvironment` (local -bash) inside the sandbox and calls the LLM through the gateway URL passed in via -stdin. For `claude_code`, the runner starts the Claude Code CLI from the sidecar -image and points it at the same Anthropic-compatible gateway. - -The `mini_swe` tool image uses +`mini-swe-agent` runs inside the SWE-bench sandbox through a sidecar tool image. +The external runner creates the sandbox, mounts the tool image at +`/opt/mini-swe-agent`, starts the agent process, and evaluates the reward in the +same sandbox. + +The agent executes commands through `LocalEnvironment` (local bash) inside the +sandbox and calls the LLM through the gateway URL passed in via stdin. The +`mini_swe` tool image uses [python-build-standalone](https://github.com/astral-sh/python-build-standalone) -to build an isolated Python environment. The Claude Code tool image uses a Node -builder to install the Claude Code npm package. Both images use a minimal +to build an isolated Python environment, then copies the result into a minimal `FROM scratch` final stage, so the sandbox base image does not need to provide -Python, Node, or npm for the sidecar tool runtime. +Python for the sidecar tool runtime. + +**This recipe is self-contained.** It shares only +[`../sandbox_client.py`](../sandbox_client.py) with the claude-code recipe; +everything else (`dataset.py`, `reward.py`, `run_agent.py`, `build_tool.sh`, +`run_train.sh`, config) lives in this directory and does not depend on +`claude_code/`. **Supported runners:** | runner | Description | |--------|-------------| -| `uniagent` | Original SWE-agent runner | | `mini_swe` | mini-swe-agent sidecar runner | -| `claude_code` | Claude Code sidecar runner; reward is returned through `complete_session(reward_info)` without writing a separate reward JSON file | **Supported sandbox types:** | Type | Description | |------|-------------| -| OpenYuanRong (`"openyuanrong"`) | Uses `akernel_sdk.Mount` and `sandbox.commands.run()` | - -At runtime, the selected runner depends directly on its tool image. The tool -image does not need to be extracted into a host directory ahead of time. +| openyuanrong | Uses `akernel_sdk.Mount` and `sandbox.commands.run()` | ## Architecture ```text -[Rollouter Host: mini_swe_agent_runner / claude_code_runner] +[Rollouter Host: mini_swe_agent_runner] | - |-- _create_sandbox(image, sidecar_image) - | `-- openyuanrong: Sandbox(mounts=[Mount(target="/opt/", ...)]) + |-- SandboxClient.create(image, sidecar_image, sidecar_target="/opt/mini-swe-agent") + | `-- akernel: Sandbox(mounts=[Mount(target="/opt/mini-swe-agent", ...)]) | |-- sandbox.run("") | `-- [Inside Sandbox] - | /opt/mini-swe-agent/bin/python3.12 or /opt/claude-code/bin/claude + | /opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py | stdin <- task config JSON (task, gateway_url, agent) | commands run inside the SWE-bench sandbox - | stdout -> runner-specific execution result + | stdout -> agent execution result JSON | |-- parse agent result |-- SandboxEnvForReward(sandbox) -> evaluate_in_env() - `-- session_runtime.complete_session(reward_info) + `-- POST session.reward_info_url ``` ## Prerequisites -1. **OpenYuanRong** - set `OPENYUANRONG_SERVER_ADDRESS` and `OPENYUANRONG_TOKEN`. -2. **Runner tool image** - build the selected tool image and push it to a remote +1. **AKernel** — set `AKERNEL_SERVER_ADDRESS` and `AKERNEL_TOKEN`. +2. **Tool image** — build the mini-swe-agent tool image and push it to a remote registry if the sandbox service cannot access local Docker images. ## 1. Build Tool Image -`mini_swe` and `claude_code` are both injected into the SWE-bench sandbox as -sidecar tool images, but they differ in image contents, mount paths, and -accelerator/mirror options. Use `build_tool.sh` for both runners, and select the -target runner with `--tool` or `TOOL_KIND`. +`mini_swe` is injected into the SWE-bench sandbox as a sidecar tool image. Use +`build_tool.sh` to build it. -| runner | Default tool image | Dockerfile | Sandbox mount path | Image contents | Mirror option | -|--------|--------------------|------------|--------------------|----------------|---------------| -| `mini_swe` | `mini-swe-agent-tool:latest` | `Dockerfile.mini-swe-agent-tool` | `/opt/mini-swe-agent` | Standalone Python 3.12, `mini-swe-agent`, `litellm`, and `run_agent.py` | `--pip-index` / `PIP_INDEX_URL` | -| `claude_code` | `claude-code-tool:latest` | `Dockerfile.claude-code-tool` | `/opt/claude-code` | Claude Code npm package installed by a Node 20 builder | `--npm-registry` / `NPM_REGISTRY` | - -### mini_swe Tool Image - -`mini_swe` is the default build target: +| Default tool image | Dockerfile | Sandbox mount path | Image contents | +|--------------------|------------|--------------------|----------------| +| `mini-swe-agent-tool:latest` | `Dockerfile.mini-swe-agent-tool` | `/opt/mini-swe-agent` | Standalone Python 3.12, `mini-swe-agent`, `litellm`, and `run_agent.py` | ```bash # Use the default PyPI source. -bash examples/swe_agent_blackbox/build_tool.sh +bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh # Use a custom PyPI mirror. -bash examples/swe_agent_blackbox/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ +bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ # Build and push to a remote registry. -bash examples/swe_agent_blackbox/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong -``` - -The `mini_swe` image uses `python-build-standalone` to build an isolated Python -runtime. The final `FROM scratch` image contains only the files needed under -`/opt/mini-swe-agent`, and it does not depend on the Python version installed in -the sandbox base image. - -After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`: - -```bash -SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \ -RUNNER=mini_swe \ -bash examples/swe_agent_blackbox/scripts/run_infer.sh -``` - -### Claude Code Tool Image - -Claude Code must be selected explicitly with `--tool claude_code`: - -```bash -# Use the default npm registry. -bash examples/swe_agent_blackbox/build_tool.sh --tool claude_code - -# Use a custom npm registry. -bash examples/swe_agent_blackbox/build_tool.sh \ - --tool claude_code \ - --npm-registry https://registry.npmmirror.com - -# Select the Claude Code npm package version. -bash examples/swe_agent_blackbox/build_tool.sh \ - --tool claude_code \ - --tool-version latest - -# Build and push the Claude Code sidecar image. -bash examples/swe_agent_blackbox/build_tool.sh \ - --tool claude_code \ - --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong -``` - -The Claude Code image uses `node:20-bookworm-slim` as the builder stage and -installs `@anthropic-ai/claude-code` into `/opt/claude-code`. The final image is -also a `FROM scratch` sidecar image. At runtime, the runner mounts it into the -sandbox at `/opt/claude-code` and invokes `/opt/claude-code/bin/claude`. - -After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`: - -```bash -SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \ -RUNNER=claude_code \ -bash examples/swe_agent_blackbox/scripts/run_infer.sh -``` - -### Combined Build Options - -`--tool`, image tags, mirrors, and registries can be combined: - -```bash -bash examples/swe_agent_blackbox/build_tool.sh \ - --tool mini_swe \ - --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ \ - --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong +bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong ``` -The build script: - -1. Selects the Dockerfile and default image name from `--tool`: - - `mini_swe` -> `mini-swe-agent-tool:latest` - - `claude_code` -> `claude-code-tool:latest` -2. Tags and pushes the image when `--registry` is provided. - -Both tool images are sidecar runtime dependencies, not SWE-bench task base -images. The `mini_swe` Python runtime is fully isolated from the sandbox -container's Python. The `claude_code` Node/npm dependencies live only under -`/opt/claude-code`, so the sandbox base image does not need Node installed. +The `mini_swe` Python runtime is fully isolated from the sandbox container's +Python. ### Build Environment Variables | Variable | Default | Description | |----------|---------|-------------| -| `TOOL_IMAGE` | `mini-swe-agent-tool` / `claude-code-tool` | Image name; the default changes with `TOOL_KIND` | +| `TOOL_IMAGE` | `mini-swe-agent-tool` | Image name | | `TOOL_TAG` | `latest` | Image tag | -| `TOOL_VERSION` | `latest` | Tool package version; for `claude_code`, this selects the `@anthropic-ai/claude-code` npm package version | -| `PIP_INDEX_URL` | unset, use PyPI | pip index URL; equivalent to `--pip-index` | -| `TOOL_KIND` | `mini_swe` | Tool kind: `mini_swe` or `claude_code` | -| `NPM_REGISTRY` | unset, use npm default | npm registry URL; equivalent to `--npm-registry` | +| `PIP_INDEX_URL` | unset, use PyPI | pip index URL (`--pip-index`) | -## 2. Inference With OpenYuanRong Sandbox +After pushing, point training at it with `SWE_AGENT_TOOL_IMAGE`. -### Using run_infer.sh +## 2. Training (Fully Async) ```bash -cd "$(git rev-parse --show-toplevel)" - -RUNNER=mini_swe \ +AKERNEL_SERVER_ADDRESS="6.2.179.37:8888" \ +AKERNEL_TOKEN="" \ SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \ -MODEL_PATH=$HOME/models/Qwen3.5-9B \ -DATA_PATH=$HOME/data/swe_agent/r2e_gym.parquet \ -MAX_SAMPLES=1 \ -TP=1 \ -bash examples/swe_agent_blackbox/scripts/run_infer.sh -``` - -### Calling Python Directly - -```bash -python examples/swe_agent_blackbox/parallel_infer.py \ - --model-path ~/models/Qwen3.5-9B \ - --data-path ~/data/swe_agent/r2e_gym.parquet \ - --max-samples 1 \ - --runner mini_swe \ - --max-turns 100 \ - --tensor-parallel-size 1 -``` - -## 3. Inference - -### Environment Variables - -```bash -export OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" -export OPENYUANRONG_TOKEN="" -export DEPLOYMENT=openyuanrong -``` - -### Run mini_swe - -```bash -RUNNER=mini_swe \ -OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \ -OPENYUANRONG_TOKEN="" \ -DEPLOYMENT=openyuanrong \ -SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \ -bash examples/swe_agent_blackbox/scripts/run_infer.sh -``` - -### Run Claude Code - -```bash -RUNNER=claude_code \ -OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \ -OPENYUANRONG_TOKEN="" \ -DEPLOYMENT=openyuanrong \ -SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \ -SWE_AGENT_MAX_TURNS=50 \ -SWE_AGENT_RUN_TIMEOUT=7200 \ -bash examples/swe_agent_blackbox/scripts/run_infer.sh -``` - -## 4. Training (Fully Async) - -```bash -OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \ -OPENYUANRONG_TOKEN="" \ MODEL_PATH=~/models/Qwen3.5-9B \ -bash examples/swe_agent_blackbox/scripts/run_train_megatron_async.sh +bash examples/blackbox_recipes/mini_swe_agent/run_train.sh ``` -The training YAML keeps `mini_swe` as the default runner: +The training YAML keeps `mini_swe` as the only runner: ```yaml -agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner -``` - -To run training with Claude Code, keep the YAML unchanged and override the runner -FQN from the launch command: - -```bash -python3 -m verl.experimental.fully_async_policy.fully_async_main \ - --config-path examples/swe_agent_blackbox/config \ - --config-name swe_agent_blackbox_megatron_async \ - actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=examples.swe_agent_blackbox.claude_code_runner.claude_code_runner +agent_runner_fqn: examples.blackbox_recipes.mini_swe_agent.mini_swe_agent_runner.mini_swe_agent_runner ``` -## 5. Configuration +## 3. Configuration | Variable | Default | Description | |----------|---------|-------------| -| `SWE_AGENT_MAX_TURNS` | `100` | Max agent steps | +| `AGENT_MAX_TURNS` | `100` | mini-swe-agent `step_limit` (the agent's turn budget); read by the runner from the `AGENT_MAX_TURNS` env var | +| `SWE_AGENT_EVAL_TIMEOUT` | `600` | Reward evaluation timeout (seconds) | +| `SWE_AGENT_RUN_TIMEOUT` | `7200` | Max wall time for the agent process in the sandbox | | `SWE_AGENT_TOOL_IMAGE` | `swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest` | Sidecar tool image | -| `DEBUG_MODE` | (unset) | Set to 1 to enable debug logging | +| `CONDA_ENV` | `testbed` | Conda env activated inside the sandbox before running the agent | diff --git a/examples/blackbox_recipes/mini_swe_agent/build_tool.sh b/examples/blackbox_recipes/mini_swe_agent/build_tool.sh new file mode 100755 index 00000000..7bcdbe1f --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/build_tool.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Build the mini-swe-agent sidecar tool image. +# +# The image uses python-build-standalone to build an isolated Python runtime +# with mini-swe-agent + litellm + run_agent.py, copied into a minimal +# `FROM scratch` final stage rooted at /opt/mini-swe-agent. It is mounted into +# the SWE-bench sandbox at /opt/mini-swe-agent, so the sandbox base image does +# not need Python for the sidecar tool runtime. +# +# Usage: +# bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh +# bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ +# bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +IMAGE_NAME="${TOOL_IMAGE:-mini-swe-agent-tool}" +IMAGE_TAG="${TOOL_TAG:-latest}" + +# Parse args +REGISTRY="" +PIP_INDEX_URL="${PIP_INDEX_URL:-}" +while [[ $# -gt 0 ]]; do + case "$1" in + --registry) REGISTRY="$2"; shift 2 ;; + --pip-index) PIP_INDEX_URL="$2"; shift 2 ;; + *) echo "Unknown arg: $1"; exit 1 ;; + esac +done + +BUILD_ARGS=() +if [[ -n "${PIP_INDEX_URL}" ]]; then + BUILD_ARGS+=(--build-arg PIP_INDEX_URL="${PIP_INDEX_URL}") +fi + +echo "==> Building mini_swe tool image: ${IMAGE_NAME}:${IMAGE_TAG}" +docker build \ + -f "${SCRIPT_DIR}/Dockerfile.mini-swe-agent-tool" \ + -t "${IMAGE_NAME}:${IMAGE_TAG}" \ + "${BUILD_ARGS[@]}" \ + "${SCRIPT_DIR}/" + +if [[ -n "${REGISTRY}" ]]; then + FULL_TAG="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" + echo "==> Tagging and pushing: ${FULL_TAG}" + docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${FULL_TAG}" + docker push "${FULL_TAG}" + echo " Pushed." +fi + +echo "" +echo "Tool image ready: ${IMAGE_NAME}:${IMAGE_TAG}" +if [[ -n "${REGISTRY}" ]]; then + echo " Remote sandbox: ${FULL_TAG}" +fi diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml deleted file mode 100644 index b7352b72..00000000 --- a/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml +++ /dev/null @@ -1,36 +0,0 @@ -- name: swe_agent - - _target_: uni_agent.agent_loop.UniAgentLoop - concurrency: 64 - log_dir: /tmp/swebench_qwen3_coder - mask_abnormal_exit_traj: false - - interaction: - action_timeout: 300 - max_turns: 100 - - env: - deployment: - type: local - command: /usr/bin/python3 -m swerex.server --auth-token {token} - timeout: 600 - startup_timeout: 600 - container_runtime: docker - env_variables: - PIP_PROGRESS_BAR: "off" - PIP_CACHE_DIR: "~/.cache/pip" - PAGER: "cat" - MANPAGER: "cat" - LESS: "-R" - TQDM_DISABLE: "1" - GIT_PAGER: "cat" - - tool_parser: qwen3_coder - - tools: - - name: str_replace_editor - - name: execute_bash - - name: submit - - reward: - eval_timeout: 600 diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml deleted file mode 100644 index b298c676..00000000 --- a/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml +++ /dev/null @@ -1,37 +0,0 @@ -- name: swe_agent - - _target_: uni_agent.agent_loop.UniAgentLoop - concurrency: 64 - log_dir: /tmp/swebench_qwen3_coder - mask_abnormal_exit_traj: false - - interaction: - action_timeout: 300 - max_turns: 100 - - env: - deployment: - type: openyuanrong - command: /opt/swe-rex/bin/python /opt/swe-rex/bin/swerex-remote --host 0.0.0.0 --port {port} --auth-token {token} - timeout: 600 - startup_timeout: 600 - swerex_runtime_image: swr.cn-east-3.myhuaweicloud.com/openyuanrong/swerex-runtime:1.4.0 - swerex_runtime_target: /opt/swe-rex - env_variables: - PIP_PROGRESS_BAR: "off" - PIP_CACHE_DIR: "~/.cache/pip" - PAGER: "cat" - MANPAGER: "cat" - LESS: "-R" - TQDM_DISABLE: "1" - GIT_PAGER: "cat" - - tool_parser: qwen3_coder - - tools: - - name: str_replace_editor - - name: execute_bash - - name: submit - - reward: - eval_timeout: 600 diff --git a/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml b/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml deleted file mode 100644 index 0829fdcd..00000000 --- a/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Parallel inference config for the blackbox SWE-agent recipe. -# Composes verl's base configs with inference-specific overrides. - -defaults: - - model_engine: dp - - actor@actor_rollout_ref.actor: ${model_engine}_actor - - rollout@actor_rollout_ref.rollout: rollout - - model@actor_rollout_ref.model: hf_model - - reward: reward - - _self_ - -hydra: - searchpath: - - pkg://verl.trainer.config - -actor_rollout_ref: - hybrid_engine: true - nccl_timeout: 600 - model: {} - rollout: - agent: {} - -trainer: - nnodes: 1 - n_gpus_per_node: 8 - logger: - - console - device: cuda - total_epochs: 1 - total_training_steps: null - balance_batch: false diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml deleted file mode 100644 index 62b73da1..00000000 --- a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml +++ /dev/null @@ -1,123 +0,0 @@ -# PPO trainer config for the blackbox SWE-agent recipe (v2). -# Uses the generic AgentFrameworkRolloutAdapter + SWEAgentFramework subclass. - -hydra: - searchpath: - - pkg://verl.trainer.config - -defaults: - - ppo_trainer - - _self_ - -actor_rollout_ref: - hybrid_engine: true - nccl_timeout: 600 - - model: - path: ??? - enable_gradient_checkpointing: true - - rollout: - name: vllm - mode: async - prompt_length: 4096 - response_length: 131072 - max_model_len: 135168 - temperature: 1.0 - top_p: 1.0 - n: 8 - tensor_model_parallel_size: 4 - gpu_memory_utilization: 0.7 - calculate_log_probs: true - enable_sleep_mode: true - free_cache_engine: true - - multi_turn: - enable: true - max_assistant_turns: 1 - max_parallel_calls: 1 - format: qwen3_coder - - agent: - num_workers: 8 - agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter - - custom: - agent_framework: - framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework - agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner - gateway_count: 1 - completion_timeout_seconds: 600 - max_concurrent_sessions: 32 - agent_runner_kwargs: - agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml - - actor: - use_dynamic_bsz: true - ppo_mini_batch_size: 16 - use_kl_loss: false - kl_loss_coef: 0.0 - clip_ratio_low: 0.2 - clip_ratio_high: 0.28 - loss_agg_mode: token-mean - optim: - lr: 1e-6 - weight_decay: 0.1 - clip_grad: 1.0 - fsdp_config: - param_offload: true - optimizer_offload: true - grad_offload: true - -data: - train_files: ??? - val_files: ??? - max_prompt_length: 4096 - max_response_length: 131072 - train_batch_size: 128 - val_batch_size: 128 - return_raw_chat: true - trust_remote_code: true - custom_cls: - path: pkg://examples.swe_agent_blackbox.dataset - name: SWEBenchDataset - -algorithm: - gamma: 1.0 - lam: 1.0 - adv_estimator: grpo - use_kl_in_reward: false - kl_ctrl: - type: fixed - kl_coef: 0.0 - -reward: - custom_reward_function: - path: pkg://examples/swe_agent_blackbox.reward - name: compute_score - -trainer: - use_legacy_worker_impl: disable - nnodes: 1 - n_gpus_per_node: 8 - total_epochs: 10 - project_name: swe_agent_blackbox - experiment_name: swe_agent - logger: - - console - device: cuda - balance_batch: false - val_before_train: true - val_only: false - save_freq: 10 - test_freq: 10 - default_local_dir: checkpoints/swe_agent_blackbox - resume_mode: disable - -ray_kwargs: - ray_init: - runtime_env: - env_vars: - TRANSFER_QUEUE_ENABLE: "" - NCCL_P2P_DISABLE: "1" - NCCL_SHM_DISABLE: "1" diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml deleted file mode 100644 index 65b09b1a..00000000 --- a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml +++ /dev/null @@ -1,129 +0,0 @@ -# Megatron sync training config for the blackbox SWE-agent recipe. -# Uses main_ppo_sync + Megatron backend, same blackbox infrastructure as FSDP. -# -# Entry point: python3 -m verl.trainer.main_ppo_sync - -hydra: - searchpath: - - pkg://verl.trainer.config - -defaults: - - ppo_megatron_trainer - - _self_ - -actor_rollout_ref: - hybrid_engine: true - nccl_timeout: 600 - - model: - path: ??? - enable_gradient_checkpointing: true - - rollout: - name: vllm - mode: async - prompt_length: 4096 - response_length: 131072 - max_model_len: 135168 - temperature: 1.0 - top_p: 1.0 - n: 8 - tensor_model_parallel_size: 4 - gpu_memory_utilization: 0.7 - calculate_log_probs: true - enable_sleep_mode: true - free_cache_engine: true - - multi_turn: - enable: true - max_assistant_turns: 1 - max_parallel_calls: 1 - format: qwen3_coder - - agent: - num_workers: 8 - agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter - - custom: - agent_framework: - framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework - agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner - gateway_count: 1 - completion_timeout_seconds: 600 - max_concurrent_sessions: 32 - agent_runner_kwargs: - agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml - - actor: - use_dynamic_bsz: true - ppo_mini_batch_size: 16 - use_kl_loss: false - kl_loss_coef: 0.0 - clip_ratio_low: 0.2 - clip_ratio_high: 0.28 - loss_agg_mode: token-mean - optim: - lr: 1e-6 - weight_decay: 0.1 - clip_grad: 1.0 - megatron: - param_offload: true - grad_offload: true - optimizer_offload: true - tensor_model_parallel_size: 8 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - use_mbridge: true - -data: - train_files: ??? - val_files: ??? - max_prompt_length: 4096 - max_response_length: 131072 - train_batch_size: 128 - val_batch_size: 128 - return_raw_chat: true - trust_remote_code: true - custom_cls: - path: pkg://examples.swe_agent_blackbox.dataset - name: SWEBenchDataset - -algorithm: - gamma: 1.0 - lam: 1.0 - adv_estimator: grpo - use_kl_in_reward: false - kl_ctrl: - type: fixed - kl_coef: 0.0 - -reward: - custom_reward_function: - path: pkg://examples.swe_agent_blackbox.reward - name: compute_score - -trainer: - use_legacy_worker_impl: disable - nnodes: 1 - n_gpus_per_node: 8 - total_epochs: 10 - project_name: swe_agent_blackbox - experiment_name: swe_agent - logger: - - console - device: cuda - balance_batch: false - val_before_train: true - val_only: false - save_freq: 10 - test_freq: 10 - default_local_dir: checkpoints/swe_agent_blackbox - resume_mode: disable - -ray_kwargs: - ray_init: - runtime_env: - env_vars: - TRANSFER_QUEUE_ENABLE: "" - NCCL_P2P_DISABLE: "1" - NCCL_SHM_DISABLE: "1" diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_v1.yaml similarity index 62% rename from examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml rename to examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_v1.yaml index d25fcce5..ad2c719d 100644 --- a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml +++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_v1.yaml @@ -1,8 +1,8 @@ -# Megatron + TQ fully-async training config for the blackbox SWE-agent recipe. -# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend. +# Megatron + V1 unified trainer config for the blackbox mini-swe recipe. # -# Entry point: python3 -m verl.experimental.fully_async_policy.fully_async_main -# Requires: transfer_queue.enable=true (selects TQ path in FullyAsyncTaskRunner) +# Entry point: python3 -m verl.trainer.main_ppo +# Default trainer mode is separate_async. On a single 8-GPU node this recipe +# uses 4 GPUs for trainer and 4 GPUs for standalone rollout. hydra: searchpath: @@ -13,7 +13,7 @@ defaults: - _self_ actor_rollout_ref: - hybrid_engine: false + hybrid_engine: true nccl_timeout: 9600 model: @@ -22,13 +22,16 @@ actor_rollout_ref: rollout: name: vllm mode: async + nnodes: 1 + n_gpus_per_node: 4 prompt_length: 4096 response_length: 131072 max_model_len: 135168 temperature: 1.0 top_p: 1.0 + top_k: -1 n: 8 - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 4 gpu_memory_utilization: 0.7 calculate_log_probs: true enable_sleep_mode: true @@ -37,26 +40,29 @@ actor_rollout_ref: max_num_batched_tokens: 135168 checkpoint_engine: backend: nccl + update_weights_bucket_megabytes: 2048 multi_turn: enable: true - max_assistant_turns: 1 max_parallel_calls: 1 format: qwen3_coder agent: num_workers: 8 - agent_loop_manager_class: uni_agent.trainer.framework.entry.FullyAsyncAgentFrameworkRolloutAdapter + agent_loop_manager_class: uni_agent.framework.entry.AgentFrameworkRolloutAdapter custom: agent_framework: - framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework - agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner gateway_count: 1 - completion_timeout_seconds: 600 - max_concurrent_sessions: 32 - agent_runner_kwargs: - agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml + agent_runners: + swe_agent: + runner_fqn: examples.blackbox_recipes.mini_swe_agent.mini_swe_agent_runner.mini_swe_agent_runner + dispatch_mode: ray_task + max_concurrent_sessions: 32 + runner_kwargs: + tool_image: swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest + run_timeout: 3600 + conda_env: testbed actor: use_dynamic_bsz: true @@ -78,16 +84,17 @@ actor_rollout_ref: param_offload: true grad_offload: true optimizer_offload: true - tensor_model_parallel_size: 8 + tensor_model_parallel_size: 4 pipeline_model_parallel_size: 1 context_parallel_size: 1 use_mbridge: true use_remove_padding: false ref: + log_prob_micro_batch_size_per_gpu: 1 megatron: param_offload: false - tensor_model_parallel_size: 8 + tensor_model_parallel_size: 4 pipeline_model_parallel_size: 1 context_parallel_size: 1 @@ -98,12 +105,14 @@ data: truncation: left max_prompt_length: 4096 max_response_length: 131072 - train_batch_size: 0 + train_batch_size: 1 + val_batch_size: 1 gen_batch_size: 1 return_raw_chat: true trust_remote_code: true + dataloader_num_workers: 0 custom_cls: - path: pkg://examples.swe_agent_blackbox.dataset + path: pkg://examples.blackbox_recipes.mini_swe_agent.dataset name: SWEBenchDataset algorithm: @@ -119,13 +128,14 @@ algorithm: reward: custom_reward_function: - path: pkg://examples.swe_agent_blackbox.reward + path: pkg://examples.blackbox_recipes.mini_swe_agent.reward name: compute_score trainer: nnodes: 1 - n_gpus_per_node: 8 + n_gpus_per_node: 4 total_epochs: 10 + total_training_steps: null project_name: swe_agent_blackbox experiment_name: swe_agent logger: @@ -137,18 +147,14 @@ trainer: test_freq: 10 default_local_dir: checkpoints/swe_agent_blackbox resume_mode: auto - -rollout: - nnodes: 1 - n_gpus_per_node: 8 - total_rollout_steps: 100000 - -async_training: - use_trainer_do_validate: false - staleness_threshold: 1.0 - trigger_parameter_sync_step: 4 - require_batches: 1 - partial_rollout: true + use_v1: true + v1: + trainer_mode: separate_async + colocate_async: + num_warmup_batches: 1 + separate_async: + num_warmup_batches: 4 + parameter_sync_step: 4 transfer_queue: enable: true diff --git a/examples/blackbox_recipes/mini_swe_agent/dataset.py b/examples/blackbox_recipes/mini_swe_agent/dataset.py index 89d65129..e7781c03 100644 --- a/examples/blackbox_recipes/mini_swe_agent/dataset.py +++ b/examples/blackbox_recipes/mini_swe_agent/dataset.py @@ -21,7 +21,6 @@ def extract_image(env_config: dict) -> str: class SWEBenchDataset(RLHFDataset): - def __getitem__(self, item): row_dict = super().__getitem__(item) extra_info = row_dict.get("extra_info", {}) diff --git a/examples/blackbox_recipes/mini_swe_agent/framework.py b/examples/blackbox_recipes/mini_swe_agent/framework.py deleted file mode 100644 index 7c5c027c..00000000 --- a/examples/blackbox_recipes/mini_swe_agent/framework.py +++ /dev/null @@ -1,105 +0,0 @@ -"""SWE-agent specific framework subclass. - -Injects reward_info (from agent_runner's complete_session call) -into sample_fields["extra_info"] so the reward worker's -compute_score can access it via extra_info. - -Overrides _run_session to execute agent_runner in a separate Ray worker -process, preventing blocking operations from stalling the event loop. -""" - -from __future__ import annotations - -import asyncio -import functools -import logging -from dataclasses import replace -from uuid import uuid4 - -import ray - -from uni_agent.trainer.framework.framework import OpenAICompatibleAgentFramework - -from examples.swe_agent_blackbox.subprocess_runner import remote_agent_run - -logger = logging.getLogger(__name__) - - -class SWEAgentFramework(OpenAICompatibleAgentFramework): - - async def _score_trajectories(self, session_trajectories, sample_fields): - if session_trajectories and session_trajectories[-1].reward_info: - reward_info = session_trajectories[-1].reward_info - extra_info = dict(sample_fields.get("extra_info") or {}) - sample_fields = {**sample_fields, "extra_info": {**extra_info, **reward_info}} - return await super()._score_trajectories(session_trajectories, sample_fields) - - def _resolve_runner(self) -> tuple[str, dict]: - """Extract FQN and pre-bound kwargs from self.agent_runner. - - self.agent_runner may be a functools.partial (from_config wraps it), - so we unpack the original function and its keywords. - """ - fn = self.agent_runner - kwargs = {} - if isinstance(fn, functools.partial): - kwargs = dict(fn.keywords) - fn = fn.func - fqn = f"{fn.__module__}.{fn.__qualname__}" - return fqn, kwargs - - async def _run_session( - self, - *, - prompts, - raw_prompt, - sample_index: int, - session_id: str | None = None, - runner_kwargs: dict | None = None, - ): - """Run agent_runner in a Ray worker process instead of in-process.""" - session_id = session_id or f"session-{sample_index}-0-{uuid4().hex}" - sample_fields = self._extract_sample_fields(prompts=prompts, sample_index=sample_index) - session = await self.session_runtime.create_session(session_id) - agent_runner_fqn, resolved_kwargs = self._resolve_runner() - - try: - if runner_kwargs: - resolved_kwargs = {**resolved_kwargs, **runner_kwargs} - - ref = remote_agent_run.remote( - agent_runner_fqn=agent_runner_fqn, - raw_prompt=raw_prompt, - session_id=session_id, - base_url=session.base_url, - sample_index=sample_index, - runner_kwargs=resolved_kwargs, - ) - loop = asyncio.get_running_loop() - reward_info = await loop.run_in_executor(None, ray.get, ref) - - await self.session_runtime.complete_session( - session_id, reward_info=reward_info, - ) - session_trajectories = await self.session_runtime.finalize_session(session_id) - - except Exception as e: - logger.error("_run_session failed: session=%s, sample=%d, runner=%s: %s", - session_id, sample_index, agent_runner_fqn, e, exc_info=True) - await self.session_runtime.abort_session(session_id) - raise - - if not self.reward_loop_worker_handles or not session_trajectories: - return session_trajectories, sample_fields - - annotations = await self._score_trajectories(session_trajectories, sample_fields) - scored_trajectories = [] - for traj, (score, extra) in zip(session_trajectories, annotations, strict=True): - scored_trajectories.append( - replace( - traj, - reward_score=score, - extra_fields={**traj.extra_fields, "reward_extra_info": extra}, - ) - ) - return scored_trajectories, sample_fields diff --git a/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py index 33882bc8..2b16099a 100644 --- a/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py +++ b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py @@ -1,6 +1,6 @@ """Mini-swe-agent runner for the blackbox SWE-agent recipe. -Agent runs inside a OpenYuanRong remote sandbox via sidecar tool image mount. +Agent runs inside a remote sandbox via sidecar tool image mount. The runner creates the sandbox, pipes task config via stdin, parses the result from stdout, and evaluates reward in the same sandbox. """ @@ -15,21 +15,24 @@ import time from pathlib import Path -from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime +import httpx -from examples.swe_agent_blackbox.dataset import extract_image -from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env -from examples.swe_agent_blackbox.sandbox import CommandResult, YRSandbox, extract_upstream, rewrite_gateway_url +from examples.blackbox_recipes.mini_swe_agent.dataset import extract_image +from examples.blackbox_recipes.mini_swe_agent.reward import build_reward_context, evaluate_in_env +from examples.blackbox_recipes.sandbox_client import ( + SandboxClient, + extract_upstream, + rewrite_gateway_url, +) +from uni_agent.gateway.session import SessionHandle logger = logging.getLogger(__name__) -if os.environ.get("DEBUG_MODE"): - logger.setLevel(logging.DEBUG) DEFAULT_TOOL_IMAGE = "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest" class SandboxEnvForReward: - """Adapts :class:`YRSandbox` to the async env interface used by + """Adapts :class:`Sandbox` to the async env interface used by reward specs (``communicate``, ``write_file``, ``read_file``). """ @@ -67,7 +70,7 @@ def _build_task_config( ) -> dict: """Build the task config passed to run_agent.py via stdin.""" agent_gateway_url = rewrite_gateway_url(gateway_url) - step_limit = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100")) + step_limit = int(os.environ.get("AGENT_MAX_TURNS", "100")) return { "task": task, "gateway_url": agent_gateway_url, @@ -84,15 +87,20 @@ def build_agent_command( ) -> str: """Build the command that runs run_agent.py inside the sandbox.""" conda_prefix = f"/opt/miniconda3/envs/{conda_env}" - env_prefix = ( + run_agent_env = ( f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)} " f"CONDA_PREFIX={shlex.quote(conda_prefix)} " - f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH" + f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH " + "PIP_DISABLE_PIP_VERSION_CHECK=1 " + "PIP_PROGRESS_BAR=off" ) return ( "unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy; " - f"{env_prefix} " - f"echo {config_b64} | base64 -d | " + f"env {run_agent_env} sh -c 'echo \"[mini_swe] shell env: CONDA_DEFAULT_ENV=$CONDA_DEFAULT_ENV " + 'CONDA_PREFIX=$CONDA_PREFIX PATH=$PATH" >&2; ' + 'echo "[mini_swe] python=$(command -v python) pip=$(command -v pip)" >&2\' ; ' + f"printf %s {shlex.quote(config_b64)} | base64 -d | " + f"env {run_agent_env} " "/opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py" ) @@ -102,21 +110,21 @@ async def mini_swe_agent_runner( raw_prompt, session: SessionHandle, sample_index: int, - session_runtime: SessionRuntime, tools_kwargs: dict | None = None, tool_image: str = DEFAULT_TOOL_IMAGE, run_timeout: int = 7200, conda_env: str = "testbed", + sandbox_max_retries: int = 10, **kwargs, ) -> None: """Run mini-swe-agent inside a sandbox with sidecar tool mount. Flow: - 1. Create OpenYuanRong remote sandbox with mini-swe-agent sidecar + 1. Create remote sandbox with mini-swe-agent sidecar 2. Pipe task config to run_agent.py via stdin 3. Parse agent result from stdout 4. Evaluate reward in the same sandbox - 5. Complete session with reward_info + 5. Post reward_info for the framework reward path """ tools_kwargs = tools_kwargs or {} logger.info("mini_swe_agent_runner called, sample_index=%d", sample_index) @@ -130,14 +138,17 @@ async def mini_swe_agent_runner( if not image: raise ValueError(f"No sandbox image found in tools_kwargs.env for sample {sample_index}") - # Gateway URL — extract upstream for OpenYuanRong tunnel + # Gateway URL — extract upstream for tunnel gateway_url = session.base_url if not gateway_url: raise ValueError(f"gateway_url is empty for sample {sample_index}") upstream = extract_upstream(gateway_url) - sandbox = await YRSandbox.create( - image=image, sidecar_image=tool_image, upstream=upstream, + sandbox = await SandboxClient.create( + image=image, + sidecar_image=tool_image, + upstream=upstream, + max_retries=int(sandbox_max_retries), ) sandbox_id = sandbox.sandbox_id logger.info("Sandbox created (image=%s, sandbox_id=%s)", image, sandbox_id) @@ -168,14 +179,17 @@ async def mini_swe_agent_runner( elapsed = time.perf_counter() - t0 logger.debug( "[sample %d] agent process finished: rc=%d (%.1fs)", - sample_index, agent_result.exit_code, elapsed, + sample_index, + agent_result.exit_code, + elapsed, ) # Parse agent result from stdout agent_info = _parse_agent_result(agent_result.stdout, sample_index) logger.info( "[sample %d] agent: exit_status=%s, submission=%d chars", - sample_index, agent_info.get("exit_status"), + sample_index, + agent_info.get("exit_status"), len(agent_info.get("submission", "")), ) @@ -186,11 +200,18 @@ async def mini_swe_agent_runner( score, eval_result = await evaluate_in_env(reward_env, metadata, eval_timeout) logger.debug( "[sample %d] reward done: score=%s, resolved=%s (%.1fs)", - sample_index, score, eval_result.get("resolved"), time.perf_counter() - t0, + sample_index, + score, + eval_result.get("resolved"), + time.perf_counter() - t0, ) reward_info = {"reward_score": score, **eval_result} - await session_runtime.complete_session(session.session_id, reward_info=reward_info) + if not session.reward_info_url: + raise ValueError(f"reward_info_url is empty for session {session.session_id}") + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post(session.reward_info_url, json={"reward_info": reward_info}) + response.raise_for_status() except Exception as e: logger.warning("Mini-swe-agent runner failed for sample %d (sandbox_id=%s): %s", sample_index, sandbox_id, e) @@ -212,7 +233,7 @@ def _parse_agent_result(stdout: str, sample_index: int) -> dict: if not stdout: return {"exit_status": "error", "submission": ""} # Try the last line that looks like JSON first - lines = [l.strip() for l in stdout.split("\n") if l.strip()] + lines = [ln.strip() for ln in stdout.split("\n") if ln.strip()] for line in reversed(lines): if line.startswith("{"): try: diff --git a/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py index c74765e0..cd792cd5 100644 --- a/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py +++ b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py @@ -1,11 +1,16 @@ -"""Parallel inference runner for the blackbox SWE-agent recipe (v2). +"""Standalone inference runner for the blackbox mini-swe-agent recipe. -Creates an LLM server, GatewayServingRuntime, and SWEAgentFramework, -then runs agent sessions in parallel and reports resolve rate. +Spins up vLLM + gateway + a reward worker, runs agent sessions in parallel, +and reports resolve rate. Does NOT start the Megatron trainer. -Usage (CLI): - python examples/swe_agent_blackbox/parallel_infer.py \ - --model-path ~/models/Qwen3-Coder-30B-A3B-Instruct \ +Reuses the recipe's existing training config +(config/swe_agent_blackbox_megatron_v1.yaml); its megatron/optimizer sections +are inert here since this driver never builds the actor worker group — only +the rollout, agent_framework, model, and reward sections are read. + +Usage: + python examples/blackbox_recipes/mini_swe_agent/parallel_infer.py \ + --model-path ~/models/Qwen3.5-9B \ --data-path ~/data/swe_agent/swe_bench_verified.parquet \ --max-samples 10 """ @@ -14,32 +19,20 @@ import argparse import asyncio -import json import logging import os -from functools import partial from typing import Any from uuid import uuid4 import numpy as np import ray -from verl import DataProto -from verl.protocol import pad_dataproto_to_divisor -from verl.utils import hf_tokenizer -from verl.utils.transferqueue_utils import tq as _tq_mock +from verl.experimental.reward_loop.reward_loop import RewardLoopWorker +from verl.utils import tensordict_utils as tu +from verl.utils.transferqueue_utils import tq from verl.workers.rollout.llm_server import LLMServerManager -from uni_agent.trainer.gateway.runtime import GatewayServingRuntime - -from examples.swe_agent_blackbox.framework import SWEAgentFramework -from examples.swe_agent_blackbox.agent_runner import swe_agent_runner -from examples.swe_agent_blackbox.claude_code_runner import claude_code_runner - -try: - from examples.swe_agent_blackbox.mini_swe_agent_runner import mini_swe_agent_runner -except ImportError: - mini_swe_agent_runner = None +from uni_agent.framework.entry import build_agent_framework, build_gateway_manager logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", @@ -48,9 +41,14 @@ ) logger = logging.getLogger(__name__) +# ── Recipe-specific constants (only these two differ between recipes) ────── +_CONFIG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config") +_CONFIG_NAME = "swe_agent_blackbox_megatron_v1" +_DEFAULT_TOOL_IMAGE = "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest" + # ===================================================================== -# Dataset loading (inlined from dataset.py — only used here) +# Dataset loading (inlined; keeps the driver self-contained) # ===================================================================== @@ -83,7 +81,6 @@ def _remap_sample_images(sample: dict[str, Any]) -> dict[str, Any]: def _inject_reward_fields(sample: dict[str, Any]) -> None: - """Inject verl-standard data_source and reward_model from extra_info.tools_kwargs.reward.""" extra_info = sample.get("extra_info", {}) tools_kwargs = extra_info.get("tools_kwargs", {}) reward_config = tools_kwargs.get("reward", {}) @@ -91,334 +88,272 @@ def _inject_reward_fields(sample: dict[str, Any]) -> None: sample.setdefault("reward_model", {"ground_truth": {}}) -def load_swe_dataset(data_path: str | list[str], max_samples: int = -1) -> list[dict[str, Any]]: +def load_swe_dataset(data_path: str, max_samples: int = -1) -> list[dict[str, Any]]: import pyarrow.parquet as pq - if isinstance(data_path, list): - paths = [os.path.expanduser(p) for p in data_path] - else: - paths = os.path.expanduser(data_path) - - logger.info("Loading dataset from: %s", data_path) - if isinstance(paths, list): - import pyarrow as pa - tables = [pq.read_table(p) for p in paths] - table = pa.concat_tables(tables) - else: - table = pq.read_table(paths) - samples = table.to_pylist() - + path = os.path.expanduser(data_path) + logger.info("Loading dataset from: %s", path) + samples = pq.read_table(path).to_pylist() for i, sample in enumerate(samples): samples[i] = _remap_sample_images(sample) _inject_reward_fields(samples[i]) - if max_samples > 0: samples = samples[:max_samples] - logger.info("Using first %d samples (max_samples=%d)", len(samples), max_samples) - - logger.info("Loaded %d samples from %s", len(samples), data_path) + logger.info("Loaded %d samples", len(samples)) return samples -class _MockReplayBuffer: - """Minimal replay buffer for inference mode (no actual training).""" - - def add(self, partition_id, items): - pass +# ===================================================================== +# Config +# ===================================================================== -def run_inference( +def _load_config( *, model_path: str, - data_path: str, - prompt_length: int = 4096, - response_length: int = 65536, - temperature: float = 0.8, - top_p: float = 0.9, - n: int = 1, - max_samples: int = -1, - engine: str = "vllm", - nnodes: int = 1, - n_gpus_per_node: int = 8, - tensor_parallel_size: int = 4, - gateway_count: int = 1, - max_concurrent_sessions: int = 2, - completion_timeout: float = 600.0, - tool_parser: str | None = None, - agent_config_path: str | None = None, - runner: str = "uniagent", - tool_image: str | None = None, - run_timeout: int = 7200, -) -> dict[str, Any]: - """Run parallel SWE-agent inference using the blackbox framework.""" - if runner == "mini_swe": - if mini_swe_agent_runner is None: - raise ImportError("mini-swe-agent is required for --runner mini_swe. Install with: pip install mini-swe-agent") - _agent_runner = partial( - mini_swe_agent_runner, - tool_image=tool_image or "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest", - run_timeout=run_timeout, - ) - elif runner == "claude_code": - _agent_runner = partial( - claude_code_runner, - tool_image=tool_image or "claude-code-tool:latest", - run_timeout=run_timeout, - ) - else: - _agent_runner = swe_agent_runner + engine: str, + prompt_length: int, + response_length: int, + temperature: float, + top_p: float, + n: int, + nnodes: int, + n_gpus_per_node: int, + tensor_parallel_size: int, + gateway_count: int, + max_concurrent_sessions: int, + tool_image: str | None, + run_timeout: int, +) -> Any: + """Compose the recipe's training config and override inference fields. - if not ray.is_initialized(): - ray.init() + The megatron/actor/optimizer sections are left untouched and never read. + """ + from hydra import compose, initialize_config_dir + from omegaconf import OmegaConf - # 1. Init Hydra config - config = _init_hydra_config( - model_path=model_path, - engine=engine, - prompt_length=prompt_length, - response_length=response_length, - temperature=temperature, - top_p=top_p, - n=n, - nnodes=nnodes, - n_gpus_per_node=n_gpus_per_node, - tensor_parallel_size=tensor_parallel_size, - ) + with initialize_config_dir(config_dir=_CONFIG_DIR, version_base=None): + config = compose(config_name=_CONFIG_NAME) - # 2. Load dataset - samples = load_swe_dataset(data_path, max_samples=max_samples) - logger.info( - "Loaded %d samples, %d rollout(s) each, runner=%s, gateway_count=%d, max_concurrent_sessions=%d", - len(samples), - n, - runner, - gateway_count, - max_concurrent_sessions, - ) + OmegaConf.set_struct(config, False) - if not samples: - raise ValueError("No samples to process") + config.actor_rollout_ref.model.path = os.path.expanduser(model_path) - # 3. Create LLM server - logger.info("Initializing LLM server manager...") - llm_server_manager = LLMServerManager.create(config=config) + ro = config.actor_rollout_ref.rollout + ro.name = engine + ro.mode = "async" + ro.prompt_length = prompt_length + ro.response_length = response_length + ro.max_model_len = prompt_length + response_length + 1024 + ro.max_num_batched_tokens = ro.max_model_len + ro.n = n + ro.temperature = temperature + ro.top_p = top_p + ro.tensor_model_parallel_size = tensor_parallel_size + ro.gpu_memory_utilization = float(os.getenv("ROLLOUT_GPU_MEM_UTIL", "0.7")) + ro.nnodes = nnodes + ro.n_gpus_per_node = n_gpus_per_node + ro.calculate_log_probs = True + ro.enable_sleep_mode = False + + af = ro.custom.agent_framework + af.gateway_count = gateway_count + runner_name = next(iter(af.agent_runners.keys())) + runner_cfg = af.agent_runners[runner_name] + runner_cfg.max_concurrent_sessions = max_concurrent_sessions + if tool_image: + runner_cfg.runner_kwargs.tool_image = tool_image + runner_cfg.runner_kwargs.run_timeout = run_timeout - # 4. Create GatewayServingRuntime - logger.info("Using tool_parser=%r", tool_parser) + config.trainer.nnodes = nnodes + config.trainer.n_gpus_per_node = n_gpus_per_node - llm_client = llm_server_manager.get_client() - gateway_actor_kwargs = { - "tokenizer": hf_tokenizer(os.path.expanduser(model_path)), - "base_sampling_params": {"temperature": temperature, "top_p": top_p, "max_tokens": response_length}, - } - if tool_parser: - gateway_actor_kwargs["tool_parser_name"] = tool_parser - - gateway_runtime = GatewayServingRuntime( - llm_client=llm_client, - gateway_count=gateway_count, - gateway_actor_kwargs=gateway_actor_kwargs, - ) + OmegaConf.set_struct(config, True) + return config - # 5. Create RewardLoopWorker for compute_score - from verl.experimental.reward_loop.reward_loop import RewardLoopWorker - reward_worker = ray.remote(RewardLoopWorker).remote(config, None) - # 6. Create framework - framework = SWEAgentFramework( - session_runtime=gateway_runtime, - agent_runner=_agent_runner, - replay_buffer=_MockReplayBuffer(), - rollout_config={"n": n, "val_kwargs": {"n": n}}, - completion_timeout=completion_timeout, - wait_for_completion_after_agent_run=True, - max_concurrent_sessions=max_concurrent_sessions, - reward_loop_worker_handles=[reward_worker], - ) - - # 6. Build batch data and run - _tools_kwargs_list = [] - for sample in samples: - tk = (sample.get("extra_info") or {}).get("tools_kwargs", {}) - if runner == "uniagent" and agent_config_path: - tk["agent_config_path"] = agent_config_path - tk["model_path"] = os.path.expanduser(model_path) - _tools_kwargs_list.append(tk) +# ===================================================================== +# Batch + score capture +# ===================================================================== - from tensordict import TensorDict - from verl.utils import tensordict_utils as _tu +def _build_prompts(samples: list[dict[str, Any]]) -> tuple[Any, list[str]]: raw_prompts = [sample["prompt"] for sample in samples] uids = [str(uuid4()) for _ in samples] - td = TensorDict({"uid": uids, "global_steps": [0] * len(samples)}, batch_size=[len(samples)]) - _tu.assign_non_tensor_stack(td, "raw_prompt", raw_prompts) - _tu.assign_non_tensor_stack(td, "tools_kwargs", _tools_kwargs_list) - _tu.assign_non_tensor_stack(td, "data_source", [sample["data_source"] for sample in samples]) - _tu.assign_non_tensor_stack(td, "reward_model", [sample["reward_model"] for sample in samples]) + tools_kwargs_list = [dict((sample.get("extra_info") or {}).get("tools_kwargs", {})) for sample in samples] + prompts = tu.get_tensordict( + tensor_dict={ + "raw_prompt": raw_prompts, + "uid": uids, + "data_source": [sample["data_source"] for sample in samples], + "reward_model": [sample["reward_model"] for sample in samples], + "tools_kwargs": tools_kwargs_list, + }, + non_tensor_dict={"global_steps": 0}, + ) + return prompts, uids - batch = DataProto(batch=td, meta_info={}).repeat(n) - size_divisor = gateway_count - batch_padded, pad_size = pad_dataproto_to_divisor(batch, size_divisor) - logger.info("Starting %d agent session(s)...", len(batch_padded)) +def _install_tq_capture() -> tuple[dict[str, float], dict[str, str]]: + """Monkeypatch the process-local TransferQueue to capture rm_scores in-memory. - _tq_store: dict[str, Any] = {} + Runner dispatch is a Ray task, but session finalize/score/TQ-writes happen + in this driver process, so patching ``tq`` here captures every write. + """ + captured_scores: dict[str, float] = {} + uid_status: dict[str, str] = {} - async def _dummy_kv_put(key, partition_id=None, tag=None, **kwargs): - _tq_store[key] = tag + async def _fake_put(*, key, partition_id=None, tag=None, **kwargs): + if isinstance(tag, dict) and "status" in tag: + uid_status[str(key)] = str(tag["status"]) - async def _dummy_kv_batch_put(keys=None, fields=None, tags=None, partition_id=None, **kwargs): + async def _fake_batch_put(*, keys=None, fields=None, tags=None, partition_id=None, **kwargs): + if fields is None or keys is None or "rm_scores" not in fields: + return + rm = fields["rm_scores"] # nested tensor; rm[i] is trajectory i's response scores for i, key in enumerate(keys): - _tq_store[key] = {"fields": fields, "tag": tags[i] if tags else None} + row = rm[i] + captured_scores[str(key)] = float(row[-1].item()) if row.numel() else 0.0 - _tq_mock.async_kv_put = _dummy_kv_put - _tq_mock.async_kv_batch_put = _dummy_kv_batch_put + tq.async_kv_put = _fake_put + tq.async_kv_batch_put = _fake_batch_put + return captured_scores, uid_status - async def _generate(): - return await framework.generate_sequences(batch_padded.batch) - try: - stats = asyncio.run(_generate()) - except RuntimeError as e: - logger.warning("generate_sequences failed: %s", e) - stats = {} - - # 7. Collect scores - uid_to_sample_idx = {uid: i for i, uid in enumerate(uids)} - per_sample_scores = [0.0] * len(samples) - sample_trajectory_counts = [0] * len(samples) - for key, value in _tq_store.items(): - if not isinstance(value, dict) or "fields" not in value: - continue - fields = value["fields"] - rm_scores = fields.get("rm_scores", None) - if rm_scores is None: - continue - # Key format: {uid}_{session_index}_{index} +def _report(samples, uids, captured_scores) -> dict[str, Any]: + uid_to_index = {uid: i for i, uid in enumerate(uids)} + per_sample_sum = [0.0] * len(samples) + per_sample_cnt = [0] * len(samples) + for key, score in captured_scores.items(): + # key format: {uid}_{session_index}_{index} uid = key.rsplit("_", 2)[0] - sample_idx = uid_to_sample_idx.get(uid) - if sample_idx is None: + idx = uid_to_index.get(uid) + if idx is None: continue - score = float(rm_scores.float()[-1, -1].item()) - per_sample_scores[sample_idx] += score - sample_trajectory_counts[sample_idx] += 1 - - for i in range(len(samples)): - if sample_trajectory_counts[i] > 0: - per_sample_scores[i] /= sample_trajectory_counts[i] - - resolved_count = sum(1 for s in per_sample_scores if s > 0) - overall_mean = float(np.mean(per_sample_scores)) if per_sample_scores else 0.0 + per_sample_sum[idx] += score + per_sample_cnt[idx] += 1 + per_sample_scores = [ + per_sample_sum[i] / per_sample_cnt[i] if per_sample_cnt[i] else 0.0 for i in range(len(samples)) + ] + resolved = sum(1 for s in per_sample_scores if s > 0) + mean = float(np.mean(per_sample_scores)) if per_sample_scores else 0.0 logger.info( "Resolved %d / %d samples (%.2f%%), mean score: %.4f", - resolved_count, len(samples), 100.0 * resolved_count / max(len(samples), 1), overall_mean, + resolved, len(samples), 100.0 * resolved / max(len(samples), 1), mean, ) - - # 8. Cleanup - asyncio.run(gateway_runtime.shutdown()) - - return { - "stats": stats, - "mean_score": overall_mean, - "per_sample_scores": per_sample_scores, - } + return {"resolved": resolved, "total": len(samples), "mean_score": mean, "per_sample_scores": per_sample_scores} # ===================================================================== -# Helpers +# Runner # ===================================================================== -def _init_hydra_config( +def run_inference( *, model_path: str, - engine: str, + data_path: str, prompt_length: int, response_length: int, temperature: float, top_p: float, n: int, + max_samples: int, + engine: str, nnodes: int, n_gpus_per_node: int, tensor_parallel_size: int, -) -> Any: - """Initialize Hydra config with rollout/model settings.""" - from hydra import compose, initialize_config_dir - from omegaconf import OmegaConf + gateway_count: int, + max_concurrent_sessions: int, + tool_image: str | None, + run_timeout: int, +) -> dict[str, Any]: + if not ray.is_initialized(): + ray.init() - config_dir = os.path.abspath("examples/swe_agent_blackbox/config") - with initialize_config_dir(config_dir=config_dir, version_base=None): - config = compose(config_name="parallel_infer") + config = _load_config( + model_path=model_path, + engine=engine, + prompt_length=prompt_length, + response_length=response_length, + temperature=temperature, + top_p=top_p, + n=n, + nnodes=nnodes, + n_gpus_per_node=n_gpus_per_node, + tensor_parallel_size=tensor_parallel_size, + gateway_count=gateway_count, + max_concurrent_sessions=max_concurrent_sessions, + tool_image=tool_image, + run_timeout=run_timeout, + ) - config.actor_rollout_ref.model.path = os.path.expanduser(model_path) - config.actor_rollout_ref.rollout.name = engine - config.actor_rollout_ref.rollout.mode = "async" - config.actor_rollout_ref.rollout.prompt_length = prompt_length - config.actor_rollout_ref.rollout.response_length = response_length - config.actor_rollout_ref.rollout.max_model_len = prompt_length + response_length + 1024 - config.actor_rollout_ref.rollout.n = n - config.actor_rollout_ref.rollout.tensor_model_parallel_size = tensor_parallel_size - config.actor_rollout_ref.rollout.gpu_memory_utilization = float(os.getenv("ROLLOUT_GPU_MEM_UTIL", "0.5")) - config.actor_rollout_ref.rollout.temperature = temperature - config.actor_rollout_ref.rollout.top_p = top_p - config.actor_rollout_ref.rollout.val_kwargs.temperature = temperature - config.actor_rollout_ref.rollout.val_kwargs.top_p = top_p - config.actor_rollout_ref.rollout.calculate_log_probs = True - config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns = 100 - config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 1 - config.actor_rollout_ref.rollout.nnodes = nnodes - config.actor_rollout_ref.rollout.n_gpus_per_node = n_gpus_per_node - config.trainer.nnodes = nnodes - config.trainer.n_gpus_per_node = n_gpus_per_node + samples = load_swe_dataset(data_path, max_samples=max_samples) + if not samples: + raise ValueError("No samples to process") - config.reward.custom_reward_function.path = "pkg://examples.swe_agent_blackbox.reward" - config.reward.custom_reward_function.name = "compute_score" - config.reward.num_workers = 1 + logger.info("Initializing LLM server manager...") + llm_server_manager = LLMServerManager.create(config=config) + llm_client = llm_server_manager.get_client() - OmegaConf.set_struct(config.actor_rollout_ref.rollout, False) - config.actor_rollout_ref.rollout.enable_sleep_mode = False - config.actor_rollout_ref.rollout.enforce_eager = os.getenv("ROLLOUT_ENFORCE_EAGER", "0") == "1" - OmegaConf.set_struct(config.actor_rollout_ref.rollout, True) - return config + gateway_manager = build_gateway_manager(config=config, llm_client=llm_client) + reward_worker = ray.remote(RewardLoopWorker).remote(config, None) + framework = build_agent_framework( + config=config, + gateway_manager=gateway_manager, + reward_loop_worker_handles=[reward_worker], + ) + + prompts, uids = _build_prompts(samples) + captured_scores, _uid_status = _install_tq_capture() + + logger.info("Starting %d sample(s), %d session(s) each...", len(samples), n) + try: + asyncio.run(framework.generate_sequences(prompts)) + except RuntimeError as exc: + logger.warning("generate_sequences failed: %s", exc) + + if not captured_scores: + logger.warning( + "No trajectory scores captured — all rollouts may have failed (see the " + "generate_sequences summary above), or the TransferQueue monkeypatch did not " + "reach the writer; resolve rate will be reported as 0." + ) + + result = _report(samples, uids, captured_scores) + + asyncio.run(gateway_manager.shutdown()) + return result # ===================================================================== -# CLI entry point +# CLI # ===================================================================== def main(): - parser = argparse.ArgumentParser(description="SWE-Agent Blackbox Parallel Inference") + parser = argparse.ArgumentParser(description="Blackbox mini-swe-agent standalone inference") + parser.add_argument("--model-path", "--model", type=str, default="~/models/Qwen3.5-9B") parser.add_argument("--data-path", type=str, default="~/data/swe_agent/swe_bench_verified.parquet") - parser.add_argument("--model-path", "--model", type=str, default="~/models/Qwen3-Coder-30B-A3B-Instruct") - parser.add_argument("--max-turns", type=int, default=100) + parser.add_argument("--max-samples", type=int, default=-1) parser.add_argument("--prompt-length", type=int, default=4096) - parser.add_argument("--response-length", type=int, default=65536) - parser.add_argument("--temperature", type=float, default=0.8) - parser.add_argument("--top-p", type=float, default=0.9) + parser.add_argument("--response-length", type=int, default=131072) + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--top-p", type=float, default=1.0) parser.add_argument("--n", type=int, default=1) - parser.add_argument("--max-samples", type=int, default=-1) parser.add_argument("--engine", type=str, default="vllm", choices=["vllm", "sglang"]) + parser.add_argument("--tensor-parallel-size", "--tp", type=int, default=4) parser.add_argument("--nnodes", type=int, default=1) parser.add_argument("--n-gpus-per-node", type=int, default=8) - parser.add_argument("--tensor-parallel-size", "--tp", type=int, default=4) parser.add_argument("--gateway-count", type=int, default=1) - parser.add_argument("--max-concurrent-sessions", type=int, default=2) - parser.add_argument("--tool-parser", type=str, default="qwen3_coder") - parser.add_argument("--tool-image", type=str, default=None) + parser.add_argument("--max-concurrent-sessions", type=int, default=8) + parser.add_argument("--tool-image", type=str, default=_DEFAULT_TOOL_IMAGE) parser.add_argument("--run-timeout", type=int, default=7200) - parser.add_argument( - "--runner", type=str, default="uniagent", choices=["uniagent", "mini_swe", "claude_code"], - help="Agent runner: 'uniagent', 'mini_swe', or 'claude_code'.", - ) - parser.add_argument( - "--agent-config-path", type=str, - default="examples/swe_agent_blackbox/config/agent_config.yaml", - help="Path to agent config YAML.", - ) + parser.add_argument("--max-turns", type=int, default=100) args = parser.parse_args() - os.environ["SWE_AGENT_MAX_TURNS"] = str(args.max_turns) + # Set before ray.init so runner Ray tasks inherit it. + os.environ["AGENT_MAX_TURNS"] = str(args.max_turns) run_inference( model_path=args.model_path, @@ -435,9 +370,6 @@ def main(): tensor_parallel_size=args.tensor_parallel_size, gateway_count=args.gateway_count, max_concurrent_sessions=args.max_concurrent_sessions, - tool_parser=args.tool_parser, - agent_config_path=args.agent_config_path, - runner=args.runner, tool_image=args.tool_image, run_timeout=args.run_timeout, ) diff --git a/examples/blackbox_recipes/mini_swe_agent/reward.py b/examples/blackbox_recipes/mini_swe_agent/reward.py index 61da218b..267cfea5 100644 --- a/examples/blackbox_recipes/mini_swe_agent/reward.py +++ b/examples/blackbox_recipes/mini_swe_agent/reward.py @@ -27,7 +27,7 @@ def build_reward_context(tools_kwargs: dict) -> tuple[dict[str, Any], int]: def compute_score(data_source: str, solution_str: str, ground_truth: str, extra_info=None) -> dict: - """Read reward_score from extra_info, injected by SWEAgentFramework.""" + """Read reward_score from extra_info, injected by the agent runner.""" score = 0.0 if extra_info and "reward_score" in extra_info: score = float(extra_info["reward_score"]) diff --git a/examples/blackbox_recipes/mini_swe_agent/run_agent.py b/examples/blackbox_recipes/mini_swe_agent/run_agent.py index c5a4b165..68406803 100644 --- a/examples/blackbox_recipes/mini_swe_agent/run_agent.py +++ b/examples/blackbox_recipes/mini_swe_agent/run_agent.py @@ -3,7 +3,7 @@ Input: task config JSON from **stdin** - task: str — the issue description for the agent to solve - - gateway_url: str — LLM gateway endpoint (tunnel URL for OpenYuanRong sandbox) + - gateway_url: str — LLM gateway endpoint (tunnel URL for remote sandbox) - agent: dict — agent config (e.g. step_limit) Output: agent result JSON to **stdout**, or error JSON on failure @@ -12,7 +12,6 @@ from __future__ import annotations import json -import os import sys DEFAULT_ACTION_TIMEOUT = 600 @@ -45,8 +44,15 @@ def main() -> None: env_cfg["timeout"] = DEFAULT_ACTION_TIMEOUT env_cfg.setdefault("env", {}) env_cfg["env"].setdefault("GIT_PAGER", "cat") - for key in ("image", "container_timeout", "run_args", "executable", "pull_timeout", - "forward_env", "interpreter"): + for key in ( + "image", + "container_timeout", + "run_args", + "executable", + "pull_timeout", + "forward_env", + "interpreter", + ): env_cfg.pop(key, None) env = LocalEnvironment(**env_cfg) @@ -57,15 +63,17 @@ def main() -> None: model_defaults.pop("model_name", None) model_defaults.pop("model_kwargs", None) model_cfg = model_defaults - model_cfg.update({ - "model_name": "openai/default", - "model_kwargs": { - "api_base": gateway_url, - "api_key": "not-needed", - "drop_params": True, - }, - "cost_tracking": "ignore_errors", - }) + model_cfg.update( + { + "model_name": "openai/default", + "model_kwargs": { + "api_base": gateway_url, + "api_key": "not-needed", + "drop_params": True, + }, + "cost_tracking": "ignore_errors", + } + ) model = LitellmModel(**model_cfg) # 5. Create DefaultAgent diff --git a/examples/blackbox_recipes/mini_swe_agent/run_infer.sh b/examples/blackbox_recipes/mini_swe_agent/run_infer.sh new file mode 100755 index 00000000..2dfa997d --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/run_infer.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# Standalone inference for the blackbox mini-swe-agent recipe. +# Runs rollout + reward only (no Megatron trainer) and reports resolve rate. +# +# Usage: +# bash examples/blackbox_recipes/mini_swe_agent/run_infer.sh +# +# All configurable via environment variables (see defaults below). + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/../../.." && pwd)}" +cd "${REPO_ROOT}" + +# ── Model & data ───────────────────────────────────────────────────────── +MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}" +DATA_PATH="${DATA_PATH:-${HOME}/data/swe_agent/swe_bench_verified.parquet}" + +# ── Inference parameters ───────────────────────────────────────────────── +MAX_SAMPLES="${MAX_SAMPLES:--1}" +PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" +RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" +TEMPERATURE="${TEMPERATURE:-1.0}" +TOP_P="${TOP_P:-1.0}" +N="${N:-1}" +ENGINE="${ENGINE:-vllm}" +TP="${TP:-4}" +NNODES="${NNODES:-1}" +N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-8}" +GATEWAY_COUNT="${GATEWAY_COUNT:-1}" +MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-8}" + +# ── Agent parameters ───────────────────────────────────────────────────── +AGENT_MAX_TURNS="${AGENT_MAX_TURNS:-100}" +SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" +SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" + +# ── AKernel (remote sandbox) ───────────────────────────────────────────── +export AKERNEL_SERVER_ADDRESS="${AKERNEL_SERVER_ADDRESS:-}" +export AKERNEL_TOKEN="${AKERNEL_TOKEN:-}" +export AKERNEL_TUNNEL_SSL_VERIFY="${AKERNEL_TUNNEL_SSL_VERIFY:-0}" + +# ── Logging & env ──────────────────────────────────────────────────────── +export VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}" +export ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" +export AGENT_MAX_TURNS +export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" +export PYTHONPATH="${REPO_ROOT}:${REPO_ROOT}/verl:${PYTHONPATH:-}" + +echo "=== Mini-SWE-Agent Blackbox Inference ===" +echo "Model: ${MODEL_PATH}" +echo "Data: ${DATA_PATH}" +echo "Max samples: ${MAX_SAMPLES}" +echo "Engine: ${ENGINE} (TP=${TP})" +echo "Tool image: ${SWE_AGENT_TOOL_IMAGE}" +echo "Batch: n=${N}, gateway=${GATEWAY_COUNT}, max_sessions=${MAX_CONCURRENT_SESSIONS}" +if [[ -n "${GATEWAY_MESSAGE_JSONL_PATH}" ]]; then + echo "Messages: ${GATEWAY_MESSAGE_JSONL_PATH}" +fi +echo "=========================================" + +python examples/blackbox_recipes/mini_swe_agent/parallel_infer.py \ + --model-path "${MODEL_PATH}" \ + --data-path "${DATA_PATH}" \ + --max-samples "${MAX_SAMPLES}" \ + --prompt-length "${PROMPT_LENGTH}" \ + --response-length "${RESPONSE_LENGTH}" \ + --temperature "${TEMPERATURE}" \ + --top-p "${TOP_P}" \ + --n "${N}" \ + --engine "${ENGINE}" \ + --tensor-parallel-size "${TP}" \ + --nnodes "${NNODES}" \ + --n-gpus-per-node "${N_GPUS_PER_NODE}" \ + --gateway-count "${GATEWAY_COUNT}" \ + --max-concurrent-sessions "${MAX_CONCURRENT_SESSIONS}" \ + --tool-image "${SWE_AGENT_TOOL_IMAGE}" \ + --run-timeout "${SWE_AGENT_RUN_TIMEOUT}" \ + --max-turns "${AGENT_MAX_TURNS}" diff --git a/examples/blackbox_recipes/mini_swe_agent/run_train.sh b/examples/blackbox_recipes/mini_swe_agent/run_train.sh new file mode 100755 index 00000000..75a042ba --- /dev/null +++ b/examples/blackbox_recipes/mini_swe_agent/run_train.sh @@ -0,0 +1,300 @@ +#!/usr/bin/env bash +# Megatron + V1 async training for the blackbox mini-swe recipe. +# +# Uses verl.trainer.main_ppo with the V1 unified trainer. The default mode is +# separate_async, which uses separate trainer and rollout GPU pools. +# +# Usage: +# bash examples/blackbox_recipes/mini_swe_agent/run_train.sh +# +# All configurable via environment variables (see defaults below). + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/../../.." && pwd)}" +cd "${REPO_ROOT}" + +# ── Model & data ───────────────────────────────────────────────────────── +MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}" +TRAIN_DATA="${TRAIN_DATA:-${HOME}/data/swe_agent/swe_rebench_filtered.parquet}" +VAL_DATA="${VAL_DATA:-${HOME}/data/swe_agent/swe_bench_verified.parquet}" +RUNTIME_ENV="${RUNTIME_ENV:-}" + +# ── V1 trainer ─────────────────────────────────────────────────────────── +TRAINER_MODE="${TRAINER_MODE:-separate_async}" +NUM_WARMUP_BATCHES="${NUM_WARMUP_BATCHES:-1}" +SEPARATE_NUM_WARMUP_BATCHES="${SEPARATE_NUM_WARMUP_BATCHES:-${NUM_WARMUP_BATCHES}}" +PARAMETER_SYNC_STEP="${PARAMETER_SYNC_STEP:-4}" +RAY_SUBMIT_MODE="${RAY_SUBMIT_MODE:-job}" +RAY_INIT_ADDRESS="${RAY_INIT_ADDRESS:-auto}" +RAY_STATUS_TIMEOUT="${RAY_STATUS_TIMEOUT:-5}" +CONFIG_NAME="${CONFIG_NAME:-swe_agent_blackbox_megatron_v1}" + +# ── Hardware ───────────────────────────────────────────────────────────── +NNODES="${NNODES:-${NNODES_TRAIN:-1}}" +PHYSICAL_GPUS_PER_NODE="${PHYSICAL_GPUS_PER_NODE:-8}" +if [[ "${TRAINER_MODE}" == "separate_async" ]]; then + N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-${TRAIN_NGPUS_PER_NODE:-4}}" + ROLLOUT_NNODES="${ROLLOUT_NNODES:-${NNODES_ROLLOUT:-${NNODES}}}" + ROLLOUT_NGPUS_PER_NODE="${ROLLOUT_NGPUS_PER_NODE:-${NGPUS_PER_NODE_ROLLOUT:-4}}" +else + N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-${TRAIN_NGPUS_PER_NODE:-${PHYSICAL_GPUS_PER_NODE}}}" + ROLLOUT_NNODES="${ROLLOUT_NNODES:-${NNODES_ROLLOUT:-0}}" + ROLLOUT_NGPUS_PER_NODE="${ROLLOUT_NGPUS_PER_NODE:-${NGPUS_PER_NODE_ROLLOUT:-${N_GPUS_PER_NODE}}}" +fi + +# ── Algorithm ──────────────────────────────────────────────────────────── +CLIP_RATIO_LOW="${CLIP_RATIO_LOW:-0.2}" +CLIP_RATIO_HIGH="${CLIP_RATIO_HIGH:-0.28}" +ACTOR_LR="${ACTOR_LR:-1e-6}" + +# ── Sequence lengths ───────────────────────────────────────────────────── +PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" +RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" +MAX_MODEL_LEN=$((PROMPT_LENGTH + RESPONSE_LENGTH)) + +# ── Rollout parameters ─────────────────────────────────────────────────── +ENGINE="${ENGINE:-vllm}" +if [[ "${TRAINER_MODE}" == "separate_async" ]]; then + GEN_TP="${GEN_TP:-${TP:-${ROLLOUT_NGPUS_PER_NODE}}}" +else + GEN_TP="${GEN_TP:-${TP:-2}}" +fi +N="${N:-8}" +TEMPERATURE="${TEMPERATURE:-1.0}" +TOP_P="${TOP_P:-1.0}" +TOP_K="${TOP_K:--1}" +ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" +UPDATE_WEIGHTS_BUCKET_MB="${UPDATE_WEIGHTS_BUCKET_MB:-2048}" + +# ── Megatron training parallelism ──────────────────────────────────────── +if [[ "${TRAINER_MODE}" == "separate_async" ]]; then + TRAIN_TP="${TRAIN_TP:-${TP:-${N_GPUS_PER_NODE}}}" +else + TRAIN_TP="${TRAIN_TP:-${TP:-8}}" +fi +TRAIN_PP="${TRAIN_PP:-1}" +TRAIN_CP="${TRAIN_CP:-1}" +OFFLOAD="${OFFLOAD:-True}" +OPTIMIZER_OFFLOAD_FRACTION="${OFFLOAD_FRACTION:-1.0}" +USE_MBRIDGE="${USE_MBRIDGE:-True}" +PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}" + +# ── Agent parameters ───────────────────────────────────────────────────── +# AGENT_MAX_TURNS is the agent's turn budget inside the sandbox: it becomes the +# mini-swe-agent step_limit (read by the runner via the AGENT_MAX_TURNS env var). +# Note: the trainer's multi_turn.max_assistant_turns is NOT enforced on the +# blackbox rollout path (AgentFrameworkRolloutAdapter), so it is not exposed here. +RUNNER="${RUNNER:-mini_swe}" +AGENT_MAX_TURNS="${AGENT_MAX_TURNS:-100}" +if [[ "${RUNNER}" == "mini_swe" ]]; then + AGENT_RUNNER_FQN="examples.blackbox_recipes.mini_swe_agent.mini_swe_agent_runner.mini_swe_agent_runner" + SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" +else + echo "Unknown RUNNER=${RUNNER}; this recipe currently supports mini_swe only" >&2 + exit 1 +fi +SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" +CONDA_ENV="${CONDA_ENV:-testbed}" +GATEWAY_COUNT="${GATEWAY_COUNT:-1}" +MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-32}" +NUM_AGENT_WORKERS="${NUM_AGENT_WORKERS:-8}" +RUNNER_ARGS=( + "actor_rollout_ref.rollout.agent.agent_loop_manager_class=uni_agent.framework.entry.AgentFrameworkRolloutAdapter" + "actor_rollout_ref.rollout.custom.agent_framework.gateway_count=${GATEWAY_COUNT}" + "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_fqn=${AGENT_RUNNER_FQN}" + "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.dispatch_mode=ray_task" + "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.max_concurrent_sessions=${MAX_CONCURRENT_SESSIONS}" + "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}" + "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}" + "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_kwargs.conda_env=${CONDA_ENV}" +) + +# ── AKernel (remote sandbox) ───────────────────────────────────────────── +AKERNEL_SERVER_ADDRESS="${AKERNEL_SERVER_ADDRESS:-}" +AKERNEL_TOKEN="${AKERNEL_TOKEN:-}" +AKERNEL_TUNNEL_SSL_VERIFY="${AKERNEL_TUNNEL_SSL_VERIFY:-0}" + +# ── Logging & checkpointing ────────────────────────────────────────────── +PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}" +EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}" +SAVE_FREQ="${SAVE_FREQ:-10}" +TEST_FREQ="${TEST_FREQ:-10}" +TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}" +TOTAL_TRAINING_STEPS="${TOTAL_TRAINING_STEPS:-}" +VAL_BEFORE_TRAIN="${VAL_BEFORE_TRAIN:-true}" +CKPTS_DIR="${CKPTS_DIR:-checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}" +TRAIN_MAX_SAMPLES="${TRAIN_MAX_SAMPLES:-${MAX_SAMPLES:--1}}" +VAL_MAX_SAMPLES="${VAL_MAX_SAMPLES:-${MAX_SAMPLES:--1}}" +TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-${PPO_MINI_BATCH_SIZE}}" +VAL_BATCH_SIZE="${VAL_BATCH_SIZE:-${TRAIN_BATCH_SIZE}}" + +export AGENT_MAX_TURNS +export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" +export SWE_AGENT_TOOL_IMAGE +export SWE_AGENT_RUN_TIMEOUT +export CONDA_ENV +export GATEWAY_COUNT +export AKERNEL_SERVER_ADDRESS +export AKERNEL_TOKEN +export AKERNEL_TUNNEL_SSL_VERIFY +export PYTHONPATH="${REPO_ROOT}:${REPO_ROOT}/verl:${PYTHONPATH:-}" + +echo "=== SWE-Agent Blackbox Megatron Async Training ===" +echo "Model: ${MODEL_PATH}" +echo "Train data: ${TRAIN_DATA}" +echo "Val data: ${VAL_DATA}" +echo "Engine: ${ENGINE} (gen_tp=${GEN_TP}, train_tp=${TRAIN_TP})" +echo "Runner: ${RUNNER}" +echo "Turns: agent_max_turns=${AGENT_MAX_TURNS}" +echo "Batch: n=${N}, mini_bsz=${PPO_MINI_BATCH_SIZE}" +echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}" +echo "Trainer: V1 ${TRAINER_MODE}" +if [[ "${TRAINER_MODE}" == "separate_async" ]]; then + echo "Resources: trainer=${NNODES}x${N_GPUS_PER_NODE}, rollout=${ROLLOUT_NNODES}x${ROLLOUT_NGPUS_PER_NODE}" +else + echo "Resources: colocated=${NNODES}x${N_GPUS_PER_NODE}" +fi +echo "Samples: train_max=${TRAIN_MAX_SAMPLES}, val_max=${VAL_MAX_SAMPLES}" +echo "===================================================" + +# ── Compute derived parameters ─────────────────────────────────────────── +ACTOR_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP )) +INFER_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP )) + +RUNTIME_ENV_ARGS=() +if [ -n "${RUNTIME_ENV}" ]; then + RUNTIME_ENV_ARGS=(--runtime-env "${RUNTIME_ENV}") +else + RUNTIME_ENV_JSON="$( + python3 - <<'PY' +import json +import os + +env_vars = { + key: value + for key in ( + "PYTHONPATH", + "AKERNEL_SERVER_ADDRESS", + "AKERNEL_TOKEN", + "AKERNEL_TUNNEL_SSL_VERIFY", + "AGENT_MAX_TURNS", + "SWE_AGENT_EVAL_TIMEOUT", + "SWE_AGENT_TOOL_IMAGE", + "SWE_AGENT_RUN_TIMEOUT", + "CONDA_ENV", + "GATEWAY_COUNT", + ) + if (value := os.environ.get(key)) is not None +} +env_vars.setdefault("TRANSFER_QUEUE_ENABLE", "") +env_vars.setdefault("NCCL_P2P_DISABLE", "1") +env_vars.setdefault("NCCL_SHM_DISABLE", "1") +print(json.dumps({"env_vars": env_vars})) +PY + )" + RUNTIME_ENV_ARGS=(--runtime-env-json "${RUNTIME_ENV_JSON}") +fi + +# ── Ensure Ray is running ──────────────────────────────────────────────── +if [[ "${TRAINER_MODE}" == "separate_async" ]]; then + TOTAL_GPUS=$(( NNODES * N_GPUS_PER_NODE + ROLLOUT_NNODES * ROLLOUT_NGPUS_PER_NODE )) +else + TOTAL_GPUS=$(( NNODES * N_GPUS_PER_NODE )) +fi +if ! timeout "${RAY_STATUS_TIMEOUT}" ray status &>/dev/null; then + echo "Starting Ray cluster (${TOTAL_GPUS} GPUs)..." + ray start --head --num-gpus="${TOTAL_GPUS}" --disable-usage-stats +else + echo "Ray cluster already running." +fi + +# ── Launch ──────────────────────────────────────────────────────────────── +WORKING_DIR="${WORKING_DIR:-$(pwd)}" + +MAIN_CMD=( + python3 -m verl.trainer.main_ppo + --config-name="${CONFIG_NAME}" \ + --config-path="${REPO_ROOT}/examples/blackbox_recipes/mini_swe_agent/config" \ + hydra.searchpath=[pkg://verl.trainer.config] \ + +ray_kwargs.ray_init.address="${RAY_INIT_ADDRESS}" \ + trainer.use_v1=True \ + trainer.v1.trainer_mode="${TRAINER_MODE}" \ + trainer.v1.colocate_async.num_warmup_batches=${NUM_WARMUP_BATCHES} \ + trainer.v1.separate_async.num_warmup_batches=${SEPARATE_NUM_WARMUP_BATCHES} \ + trainer.v1.separate_async.parameter_sync_step=${PARAMETER_SYNC_STEP} \ + transfer_queue.enable=True \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + data.train_files="['${TRAIN_DATA}']" \ + data.val_files="['${VAL_DATA}']" \ + data.train_max_samples=${TRAIN_MAX_SAMPLES} \ + data.val_max_samples=${VAL_MAX_SAMPLES} \ + data.train_batch_size=${TRAIN_BATCH_SIZE} \ + data.val_batch_size=${VAL_BATCH_SIZE} \ + data.max_prompt_length=${PROMPT_LENGTH} \ + data.max_response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.n=${N} \ + actor_rollout_ref.rollout.name=${ENGINE} \ + actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \ + actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \ + actor_rollout_ref.rollout.max_model_len=${MAX_MODEL_LEN} \ + actor_rollout_ref.rollout.max_num_batched_tokens=${MAX_MODEL_LEN} \ + actor_rollout_ref.rollout.temperature=${TEMPERATURE} \ + actor_rollout_ref.rollout.top_p=${TOP_P} \ + actor_rollout_ref.rollout.top_k=${TOP_K} \ + actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=${UPDATE_WEIGHTS_BUCKET_MB} \ + actor_rollout_ref.rollout.nnodes=${ROLLOUT_NNODES} \ + actor_rollout_ref.rollout.n_gpus_per_node=${ROLLOUT_NGPUS_PER_NODE} \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP} \ + actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \ + actor_rollout_ref.rollout.agent.num_workers=${NUM_AGENT_WORKERS} \ + "${RUNNER_ARGS[@]}" \ + actor_rollout_ref.actor.clip_ratio_low=${CLIP_RATIO_LOW} \ + actor_rollout_ref.actor.clip_ratio_high=${CLIP_RATIO_HIGH} \ + actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ACTOR_PPO_MAX_TOKEN_LEN} \ + actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \ + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${OPTIMIZER_OFFLOAD_FRACTION} \ + +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \ + +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \ + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \ + actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.optimizer_offload=${OFFLOAD} \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \ + actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \ + actor_rollout_ref.ref.megatron.param_offload=${OFFLOAD} \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TRAIN_TP} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${TRAIN_CP} \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \ + trainer.project_name="${PROJECT_NAME}" \ + trainer.experiment_name="${EXPERIMENT_NAME}" \ + trainer.total_epochs=${TOTAL_EPOCHS} \ + trainer.val_before_train=${VAL_BEFORE_TRAIN} \ + trainer.save_freq=${SAVE_FREQ} \ + trainer.test_freq=${TEST_FREQ} \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.nnodes=${NNODES} \ + trainer.n_gpus_per_node=${N_GPUS_PER_NODE} \ + "$@" +) + +if [[ -n "${TOTAL_TRAINING_STEPS}" ]]; then + MAIN_CMD+=(trainer.total_training_steps=${TOTAL_TRAINING_STEPS}) +fi + +if [[ "${RAY_SUBMIT_MODE}" == "job" ]]; then + ray job submit --no-wait --working-dir="${WORKING_DIR}" "${RUNTIME_ENV_ARGS[@]}" -- "${MAIN_CMD[@]}" +elif [[ "${RAY_SUBMIT_MODE}" == "local" ]]; then + "${MAIN_CMD[@]}" +else + echo "Unknown RAY_SUBMIT_MODE=${RAY_SUBMIT_MODE}; expected job or local" >&2 + exit 1 +fi diff --git a/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py b/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py deleted file mode 100644 index b03dc7d7..00000000 --- a/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Ray-based subprocess runner for agent_runner execution. - -Launches agent_runner in a separate Ray worker process to prevent blocking -operations (sleep, sync I/O, etc.) from stalling the framework's event loop. -""" - -from __future__ import annotations - -import asyncio -import logging -from typing import Any - -import ray - -from uni_agent.trainer.framework.types import SessionHandle - -logger = logging.getLogger(__name__) - - -class _StubSessionRuntime: - """Captures reward_info from agent_runner's complete_session call.""" - - def __init__(self): - self.reward_info: dict[str, Any] | None = None - - async def complete_session(self, session_id: str, reward_info: dict[str, Any] | None = None): - self.reward_info = reward_info - - -@ray.remote(num_cpus=0) -def remote_agent_run( - agent_runner_fqn: str, - raw_prompt, - session_id: str, - base_url: str, - sample_index: int, - runner_kwargs: dict, -) -> dict[str, Any] | None: - """Run agent_runner in a dedicated Ray worker process.""" - from verl.utils.import_utils import load_class_from_fqn - - agent_runner = load_class_from_fqn(agent_runner_fqn) - stub_runtime = _StubSessionRuntime() - handle = SessionHandle(session_id=session_id, base_url=base_url) - - async def _run(): - try: - await agent_runner( - raw_prompt=raw_prompt, - session=handle, - sample_index=sample_index, - session_runtime=stub_runtime, - **runner_kwargs, - ) - return stub_runtime.reward_info - except Exception as e: - logger.error("remote_agent_run failed: session_id=%s, sample=%d, error=%s", - session_id, sample_index, e, exc_info=True) - raise - - return asyncio.run(_run()) diff --git a/examples/blackbox_recipes/sandbox/sandbox.py b/examples/blackbox_recipes/sandbox_client.py similarity index 66% rename from examples/blackbox_recipes/sandbox/sandbox.py rename to examples/blackbox_recipes/sandbox_client.py index e6e46a8d..631c8722 100644 --- a/examples/blackbox_recipes/sandbox/sandbox.py +++ b/examples/blackbox_recipes/sandbox_client.py @@ -1,5 +1,7 @@ -"""OpenYuanRong (AKernel) remote sandbox command execution. +"""AKernel remote sandbox command execution. +AKernel is an agent sandbox infra collaboratively developed by the +OpenYuanrong team and the Ant AKernel team. Uses ``akernel_sdk.Sandbox`` with sidecar ``Mount`` to inject the mini-swe-agent tool image. Supports upstream tunnel so the agent inside the sandbox can reach the gateway via ``http://127.0.0.1:``. @@ -10,6 +12,7 @@ import asyncio import logging import os +import uuid from dataclasses import dataclass from typing import Any from urllib.parse import urlparse @@ -23,24 +26,31 @@ class CommandResult: stderr: str exit_code: int + logger = logging.getLogger(__name__) DEFAULT_PROXY_PORT = 38197 def _configure_akernel_env() -> None: - """Map OPENYUANRONG_* env vars to AKERNEL_* before importing akernel_sdk.""" - server = os.getenv("OPENYUANRONG_SERVER_ADDRESS") - token = os.getenv("OPENYUANRONG_TOKEN") - tunnel_ssl_verify = os.getenv("OPENYUANRONG_TUNNEL_SSL_VERIFY", "0") + """Validate AKernel credentials and map the tunnel SSL flag for akernel_sdk. + + ``akernel_sdk`` reads ``AKERNEL_SERVER_ADDRESS`` / ``AKERNEL_TOKEN`` directly, + so only the tunnel SSL flag needs to be translated to ``TUNNEL_SSL_VERIFY``. + """ + server = os.getenv("AKERNEL_SERVER_ADDRESS") + token = os.getenv("AKERNEL_TOKEN") if not server or not token: - raise ValueError( - "OPENYUANRONG_SERVER_ADDRESS and OPENYUANRONG_TOKEN " - "environment variables must be set for YR sandbox" - ) - os.environ["AKERNEL_SERVER_ADDRESS"] = server - os.environ["AKERNEL_TOKEN"] = token - os.environ["TUNNEL_SSL_VERIFY"] = tunnel_ssl_verify + raise ValueError("AKERNEL_SERVER_ADDRESS and AKERNEL_TOKEN environment variables must be set for sandbox") + os.environ["TUNNEL_SSL_VERIFY"] = os.getenv("AKERNEL_TUNNEL_SSL_VERIFY", "0") + + +def _resolve_sandbox_name() -> str | None: + """Return ``{prefix}{random}`` when ``SANDBOX_NAME_PREFIX`` env is set.""" + prefix = os.getenv("SANDBOX_NAME_PREFIX") + if not prefix: + return None + return f"{prefix}{uuid.uuid4().hex[:8]}" def extract_upstream(gateway_url: str) -> str: @@ -71,8 +81,8 @@ def rewrite_gateway_url( return f"http://127.0.0.1:{proxy_port}{path}" -class YRSandbox: - """Command execution via OpenYuanRong (AKernel) remote sandbox.""" +class SandboxClient: + """Command execution via remote sandbox.""" def __init__(self, sandbox: Any) -> None: self._sandbox = sandbox @@ -81,7 +91,6 @@ def __init__(self, sandbox: Any) -> None: def sandbox_id(self) -> str: return getattr(self._sandbox, "sandbox_id", "unknown") - @classmethod async def create( cls, @@ -97,10 +106,10 @@ async def create( mem_limit: int = 8192, idle_timeout: int = 7200, sidecar_target: str = "/opt/mini-swe-agent", - max_retries: int = 5, + max_retries: int = 10, **sandbox_kwargs: Any, - ) -> "YRSandbox": - """Create an OpenYuanRong sandbox with sidecar tool mounted. + ) -> SandboxClient: + """Create an sandbox client with sidecar tool mounted. The sidecar image is mounted at ``sidecar_target`` inside the sandbox via ``akernel_sdk.Mount``. @@ -127,25 +136,35 @@ async def create( sb_kwargs["proxy_port"] = proxy_port if env: sb_kwargs["env"] = env + name = _resolve_sandbox_name() + if name is not None: + sb_kwargs["name"] = name sb_kwargs.update(sandbox_kwargs) logger.info( - "Creating YR sandbox (image=%s, cpu=%d, memory=%d, sidecar=%s:%s, upstream=%s)", - image, cpu, memory, sidecar_image, sidecar_target, upstream or "none", + "Creating sandbox (image=%s, cpu=%d, memory=%d, sidecar=%s:%s, upstream=%s, name=%s)", + image, + cpu, + memory, + sidecar_image, + sidecar_target, + upstream or "none", + name or "auto", ) last_error: Exception | None = None for retry in range(max_retries): sandbox = None try: sandbox = await asyncio.to_thread(lambda: Sandbox(**sb_kwargs)) - logger.info("YR sandbox created: %s", getattr(sandbox, "sandbox_id", "?")) + logger.info("sandbox created: %s", getattr(sandbox, "sandbox_id", "?")) return cls(sandbox=sandbox) except Exception as exc: last_error = exc sandbox_id = getattr(sandbox, "sandbox_id", None) logger.critical( - "Failed to create YR sandbox (sandbox_id=%s): %s", - sandbox_id or "n/a", exc, + "Failed to create sandbox (sandbox_id=%s): %s", + sandbox_id or "n/a", + exc, ) if sandbox is not None: try: @@ -153,17 +172,19 @@ async def create( except Exception: pass if retry < max_retries - 1: - sleep_time = min(30, 2 ** retry) - logger.info("Retrying YR sandbox creation in %d seconds...", sleep_time) + sleep_time = min(30, 2**retry) + logger.info("Retrying sandbox creation in %d seconds...", sleep_time) await asyncio.sleep(sleep_time) - raise RuntimeError(f"Failed to create YR sandbox after {max_retries} retries") from last_error + raise RuntimeError(f"Failed to create sandbox after {max_retries} retries") from last_error async def run(self, cmd: str, *, timeout: int = 600) -> CommandResult: - """Execute *cmd* inside the OpenYuanRong sandbox via ``sandbox.commands.run``.""" + """Execute *cmd* inside the sandbox via ``sandbox.commands.run``.""" try: result = await asyncio.to_thread( - self._sandbox.commands.run, cmd, timeout=timeout, + self._sandbox.commands.run, + cmd, + timeout=timeout, ) return CommandResult( stdout=getattr(result, "stdout", ""), @@ -174,15 +195,15 @@ async def run(self, cmd: str, *, timeout: int = 600) -> CommandResult: return CommandResult(stdout="", stderr=str(e), exit_code=-1) async def cleanup(self) -> None: - """Kill the OpenYuanRong sandbox if still running.""" + """Kill the sandbox if still running.""" if self._sandbox is not None: sandbox_id = getattr(self._sandbox, "sandbox_id", "?") try: if self._sandbox.is_running(): await asyncio.to_thread(self._sandbox.kill) - logger.info("YR sandbox %s killed", sandbox_id) + logger.info("sandbox %s killed", sandbox_id) else: - logger.info("YR sandbox %s already stopped", sandbox_id) + logger.info("sandbox %s already stopped", sandbox_id) except Exception as e: - logger.warning("Failed to kill YR sandbox %s: %s", sandbox_id, e) + logger.warning("Failed to kill sandbox %s: %s", sandbox_id, e) self._sandbox = None diff --git a/examples/blackbox_recipes/scripts/build_tool.sh b/examples/blackbox_recipes/scripts/build_tool.sh deleted file mode 100755 index e5158629..00000000 --- a/examples/blackbox_recipes/scripts/build_tool.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash -# Build a SWE blackbox sidecar tool image. -# -# Usage: -# bash examples/swe_agent_blackbox/build_tool.sh -# bash examples/swe_agent_blackbox/build_tool.sh --tool claude_code -# bash examples/swe_agent_blackbox/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ -# bash examples/swe_agent_blackbox/build_tool.sh --npm-registry https://registry.npmmirror.com -# bash examples/swe_agent_blackbox/build_tool.sh --tool-version latest -# bash examples/swe_agent_blackbox/build_tool.sh --registry reg.antgroup-inc.cn/myrepo -# -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -TOOL_KIND="${TOOL_KIND:-mini_swe}" -IMAGE_TAG="${TOOL_TAG:-latest}" -TOOL_VERSION="${TOOL_VERSION:-latest}" - -# Parse args -REGISTRY="" -PIP_INDEX_URL="${PIP_INDEX_URL:-}" -NPM_REGISTRY="${NPM_REGISTRY:-}" -while [[ $# -gt 0 ]]; do - case "$1" in - --tool) TOOL_KIND="$2"; shift 2 ;; - --registry) REGISTRY="$2"; shift 2 ;; - --pip-index) PIP_INDEX_URL="$2"; shift 2 ;; - --npm-registry) NPM_REGISTRY="$2"; shift 2 ;; - --tool-version) TOOL_VERSION="$2"; shift 2 ;; - *) echo "Unknown arg: $1"; exit 1 ;; - esac -done - -BUILD_ARGS=() -DOCKERFILE="${SCRIPT_DIR}/Dockerfile.mini-swe-agent-tool" -if [[ "${TOOL_KIND}" == "claude" ]]; then - TOOL_KIND="claude_code" -fi -if [[ "${TOOL_KIND}" == "claude_code" ]]; then - IMAGE_NAME="${TOOL_IMAGE:-claude-code-tool}" - DOCKERFILE="${SCRIPT_DIR}/Dockerfile.claude-code-tool" - BUILD_ARGS+=(--build-arg "TOOL_VERSION=${TOOL_VERSION}") - if [[ -n "${NPM_REGISTRY}" ]]; then - BUILD_ARGS+=(--build-arg "NPM_REGISTRY=${NPM_REGISTRY}") - fi -elif [[ "${TOOL_KIND}" == "mini_swe" ]]; then - IMAGE_NAME="${TOOL_IMAGE:-mini-swe-agent-tool}" - if [[ -n "${PIP_INDEX_URL}" ]]; then - BUILD_ARGS+=(--build-arg PIP_INDEX_URL="${PIP_INDEX_URL}") - fi -else - echo "Unknown tool: ${TOOL_KIND}; expected mini_swe or claude_code" - exit 1 -fi - -echo "==> Building ${TOOL_KIND} tool image: ${IMAGE_NAME}:${IMAGE_TAG}" -docker build \ - -f "${DOCKERFILE}" \ - -t "${IMAGE_NAME}:${IMAGE_TAG}" \ - "${BUILD_ARGS[@]}" \ - "${SCRIPT_DIR}/" - -if [[ -n "${REGISTRY}" ]]; then - FULL_TAG="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" - echo "==> Tagging and pushing: ${FULL_TAG}" - docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${FULL_TAG}" - docker push "${FULL_TAG}" - echo " Pushed." -fi - -echo "" -echo "Tool image ready: ${IMAGE_NAME}:${IMAGE_TAG}" -if [[ -n "${REGISTRY}" ]]; then - echo " Remote sandbox: ${FULL_TAG}" -fi diff --git a/examples/blackbox_recipes/scripts/run_infer.sh b/examples/blackbox_recipes/scripts/run_infer.sh deleted file mode 100755 index d5703aa6..00000000 --- a/examples/blackbox_recipes/scripts/run_infer.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env bash -# Parallel inference for the blackbox SWE-agent recipe. -# -# Usage: -# bash examples/swe_agent_blackbox/scripts/run_infer.sh - -set -euo pipefail - -# ── Model & data ───────────────────────────────────────────────────────── -MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}" -DATA_PATH="${DATA_PATH:-$HOME/data/swe_agent/swe_bench_verified.parquet}" - -# ── Inference parameters ───────────────────────────────────────────────── -MAX_SAMPLES="${MAX_SAMPLES:--1}" -PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" -RESPONSE_LENGTH="${RESPONSE_LENGTH:-65536}" -TEMPERATURE="${TEMPERATURE:-1.0}" -TOP_P="${TOP_P:-1.0}" -N="${N:-8}" -ENGINE="${ENGINE:-vllm}" -TP="${TP:-4}" -N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-8}" -GATEWAY_COUNT="${GATEWAY_COUNT:-1}" -MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-2}" - -# ── Agent parameters ───────────────────────────────────────────────────── -RUNNER="${RUNNER:-uniagent}" -AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" -export SWE_AGENT_MAX_TURNS="${SWE_AGENT_MAX_TURNS:-100}" -export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" -SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-}" -SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" - -# ── Logging ────────────────────────────────────────────────────────────── -export VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}" -export ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.5}" - -echo "=== SWE-Agent Blackbox Inference ===" -echo "Model: ${MODEL_PATH}" -echo "Data: ${DATA_PATH}" -echo "Max samples: ${MAX_SAMPLES}" -echo "Engine: ${ENGINE} (TP=${TP})" -echo "Runner: ${RUNNER}" -echo "Gateway count: ${GATEWAY_COUNT}" -echo "Max concurrent sessions: ${MAX_CONCURRENT_SESSIONS}" -echo "=====================================" - -python examples/swe_agent_blackbox/parallel_infer.py \ - --model-path "${MODEL_PATH}" \ - --data-path "${DATA_PATH}" \ - --max-samples "${MAX_SAMPLES}" \ - --prompt-length "${PROMPT_LENGTH}" \ - --response-length "${RESPONSE_LENGTH}" \ - --temperature "${TEMPERATURE}" \ - --top-p "${TOP_P}" \ - --n "${N}" \ - --engine "${ENGINE}" \ - --tensor-parallel-size "${TP}" \ - --max-turns "${SWE_AGENT_MAX_TURNS}" \ - --runner "${RUNNER}" \ - --agent-config-path "${AGENT_CONFIG_PATH}" \ - --n-gpus-per-node "${N_GPUS_PER_NODE}" \ - --gateway-count "${GATEWAY_COUNT}" \ - --max-concurrent-sessions "${MAX_CONCURRENT_SESSIONS}" \ - --tool-image "${SWE_AGENT_TOOL_IMAGE}" \ - --run-timeout "${SWE_AGENT_RUN_TIMEOUT}" diff --git a/examples/blackbox_recipes/scripts/run_train.sh b/examples/blackbox_recipes/scripts/run_train.sh deleted file mode 100755 index cf08005d..00000000 --- a/examples/blackbox_recipes/scripts/run_train.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env bash -# Training launch script for the blackbox SWE-agent recipe. -# -# Uses GRPO + AgentFrameworkRolloutAdapter with reward computed in-process -# by the agent runner, then passed through the reward worker's compute_score. -# -# Usage: -# bash examples/swe_agent_blackbox/scripts/run_train.sh -# -# All configurable via environment variables (see defaults below). - -set -euo pipefail - -# ── Model & data ───────────────────────────────────────────────────────── -MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3-Coder-30B-A3B-Instruct}" -TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}" -VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}" - -# ── Hardware ───────────────────────────────────────────────────────────── -NNODES="${NNODES:-1}" -NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}" - -# ── Training parameters ───────────────────────────────────────────────── -TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}" -PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" -RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" -ACTOR_LR="${ACTOR_LR:-1e-6}" -TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}" -SAVE_FREQ="${SAVE_FREQ:-10}" -TEST_FREQ="${TEST_FREQ:-10}" - -# ── Rollout parameters ────────────────────────────────────────────────── -ENGINE="${ENGINE:-vllm}" -TP="${TP:-4}" -ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" -N="${N:-8}" -TEMPERATURE="${TEMPERATURE:-1.0}" - -# ── Agent parameters ───────────────────────────────────────────────────── -RUNNER="${RUNNER:-mini_swe}" -MAX_TURNS="${MAX_TURNS:-100}" -AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" -COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}" -if [[ "${RUNNER}" == "claude_code" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner" - SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}" -elif [[ "${RUNNER}" == "mini_swe" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner" - SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" -elif [[ "${RUNNER}" == "uniagent" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner" - SWE_AGENT_TOOL_IMAGE="" -else - echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2 - exit 1 -fi -SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" -RUNNER_ARGS=( - "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}" -) -if [[ "${RUNNER}" != "uniagent" ]]; then - RUNNER_ARGS+=( - "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}" - "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}" - ) -fi - -# ── Logging ────────────────────────────────────────────────────────────── -PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}" -EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}" -VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}" - -export SWE_AGENT_MAX_TURNS="${MAX_TURNS}" -export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" -export VERL_LOGGING_LEVEL - -# ── Environment for NCCL ───────────────────────────────────────────────── -export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}" -export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}" - -echo "=== SWE-Agent Blackbox Training ===" -echo "Model: ${MODEL_PATH}" -echo "Train data: ${TRAIN_DATA}" -echo "Val data: ${VAL_DATA}" -echo "Engine: ${ENGINE} (TP=${TP})" -echo "Runner: ${RUNNER}" -echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}" -echo "Epochs: ${TOTAL_EPOCHS}" -echo "=====================================" - -python3 -m verl.trainer.main_ppo_sync \ - --config-name=swe_agent_blackbox \ - --config-path="$(pwd)/examples/swe_agent_blackbox/config" \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - data.train_files="['${TRAIN_DATA}']" \ - data.val_files="['${VAL_DATA}']" \ - data.train_batch_size=${TRAIN_BATCH_SIZE} \ - data.max_prompt_length=${PROMPT_LENGTH} \ - data.max_response_length=${RESPONSE_LENGTH} \ - actor_rollout_ref.rollout.name=${ENGINE} \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \ - actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \ - actor_rollout_ref.rollout.n=${N} \ - actor_rollout_ref.rollout.temperature=${TEMPERATURE} \ - actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \ - actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \ - actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH + 1024)) \ - actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \ - actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \ - actor_rollout_ref.rollout.nnodes=${NNODES} \ - actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \ - trainer.nnodes=${NNODES} \ - trainer.n_gpus_per_node=${NGPUS_PER_NODE} \ - trainer.total_epochs=${TOTAL_EPOCHS} \ - trainer.save_freq=${SAVE_FREQ} \ - trainer.test_freq=${TEST_FREQ} \ - trainer.project_name=${PROJECT_NAME} \ - trainer.experiment_name=${EXPERIMENT_NAME} \ - actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \ - actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \ - "${RUNNER_ARGS[@]}" \ - "$@" diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_async.sh b/examples/blackbox_recipes/scripts/run_train_megatron_async.sh deleted file mode 100755 index db3a8264..00000000 --- a/examples/blackbox_recipes/scripts/run_train_megatron_async.sh +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env bash -# Megatron + TQ fully-async training for the blackbox SWE-agent recipe. -# -# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend. -# Data flows through TransferQueue (zero-copy) with ReplayBuffer flow control. -# -# Usage: -# bash examples/swe_agent_blackbox/scripts/run_train_megatron_async.sh -# -# All configurable via environment variables (see defaults below). - -set -euo pipefail - -# ── Model & data ───────────────────────────────────────────────────────── -MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}" -TRAIN_DATA="${TRAIN_DATA:-${HOME}/data/swe_agent/swe_rebench_filtered.parquet}" -VAL_DATA="${VAL_DATA:-${HOME}/data/swe_agent/swe_bench_verified.parquet}" -RUNTIME_ENV="${RUNTIME_ENV:-}" - -# ── Hardware ───────────────────────────────────────────────────────────── -NNODES_TRAIN="${NNODES_TRAIN:-1}" -NNODES_ROLLOUT="${NNODES_ROLLOUT:-1}" -NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}" - -# ── Algorithm ──────────────────────────────────────────────────────────── -CLIP_RATIO_LOW="${CLIP_RATIO_LOW:-0.2}" -CLIP_RATIO_HIGH="${CLIP_RATIO_HIGH:-0.28}" -ACTOR_LR="${ACTOR_LR:-1e-6}" - -# ── Sequence lengths ───────────────────────────────────────────────────── -PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" -RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" -MAX_MODEL_LEN=$((PROMPT_LENGTH + RESPONSE_LENGTH)) - -# ── Rollout parameters ─────────────────────────────────────────────────── -ENGINE="${ENGINE:-vllm}" -GEN_TP="${GEN_TP:-2}" -N="${N:-8}" -TEMPERATURE="${TEMPERATURE:-1.0}" -ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" - -# ── Megatron training parallelism ──────────────────────────────────────── -TRAIN_TP="${TRAIN_TP:-8}" -TRAIN_PP="${TRAIN_PP:-1}" -TRAIN_CP="${TRAIN_CP:-1}" -OFFLOAD="${OFFLOAD:-True}" -OPTIMIZER_OFFLOAD_FRACTION="${OFFLOAD_FRACTION:-1.0}" -USE_MBRIDGE="${USE_MBRIDGE:-True}" -PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}" - -# ── Agent parameters ───────────────────────────────────────────────────── -RUNNER="${RUNNER:-mini_swe}" -MAX_TURNS="${MAX_TURNS:-100}" -AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" -COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}" -if [[ "${RUNNER}" == "claude_code" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner" - SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}" -elif [[ "${RUNNER}" == "mini_swe" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner" - SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" -elif [[ "${RUNNER}" == "uniagent" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner" - SWE_AGENT_TOOL_IMAGE="" -else - echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2 - exit 1 -fi -SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" -CONDA_ENV="${CONDA_ENV:-testbed}" -RUNNER_ARGS=( - "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}" -) -if [[ "${RUNNER}" != "uniagent" ]]; then - RUNNER_ARGS+=( - "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}" - "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}" - "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.conda_env=${CONDA_ENV}" - ) -fi - -# ── OpenYuanRong (YR remote sandbox) ───────────────────────────────────── -OPENYUANRONG_SERVER_ADDRESS="${OPENYUANRONG_SERVER_ADDRESS:-}" -OPENYUANRONG_TOKEN="${OPENYUANRONG_TOKEN:-}" -OPENYUANRONG_TUNNEL_SSL_VERIFY="${OPENYUANRONG_TUNNEL_SSL_VERIFY:-0}" - -# ── Async training ─────────────────────────────────────────────────────── -TOTAL_ROLLOUT_STEPS="${TOTAL_ROLLOUT_STEPS:-100000}" -STALENESS_THRESHOLD="${STALENESS_THRESHOLD:-1.0}" -TRIGGER_SYNC_STEP="${TRIGGER_SYNC_STEP:-4}" -PARTIAL_ROLLOUT="${PARTIAL_ROLLOUT:-True}" - -# ── Logging & checkpointing ────────────────────────────────────────────── -PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}" -EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}" -SAVE_FREQ="${SAVE_FREQ:-10}" -TEST_FREQ="${TEST_FREQ:-10}" -CKPTS_DIR="${CKPTS_DIR:-checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}" - -export SWE_AGENT_MAX_TURNS="${MAX_TURNS}" -export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" -export OPENYUANRONG_SERVER_ADDRESS -export OPENYUANRONG_TOKEN -export OPENYUANRONG_TUNNEL_SSL_VERIFY - -echo "=== SWE-Agent Blackbox Megatron Async Training ===" -echo "Model: ${MODEL_PATH}" -echo "Train data: ${TRAIN_DATA}" -echo "Val data: ${VAL_DATA}" -echo "Engine: ${ENGINE} (gen_tp=${GEN_TP}, train_tp=${TRAIN_TP})" -echo "Runner: ${RUNNER}" -echo "Batch: n=${N}, mini_bsz=${PPO_MINI_BATCH_SIZE}" -echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}" -echo "Nodes: train=${NNODES_TRAIN}, rollout=${NNODES_ROLLOUT}" -echo "===================================================" - -# ── Compute derived parameters ─────────────────────────────────────────── -ACTOR_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP )) -INFER_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP )) - -RUNTIME_ENV_ARGS=() -if [ -n "${RUNTIME_ENV}" ]; then - RUNTIME_ENV_ARGS=(--runtime-env "${RUNTIME_ENV}") -fi - -# ── Ensure Ray is running ──────────────────────────────────────────────── -TOTAL_GPUS=$(( (NNODES_TRAIN + NNODES_ROLLOUT) * NGPUS_PER_NODE )) -if ! ray status &>/dev/null; then - echo "Starting Ray cluster (${TOTAL_GPUS} GPUs)..." - ray start --head --num-gpus="${TOTAL_GPUS}" --disable-usage-stats -else - echo "Ray cluster already running." -fi - -# ── Launch ──────────────────────────────────────────────────────────────── -WORKING_DIR="${WORKING_DIR:-$(pwd)}" - -ray job submit --no-wait --working-dir="${WORKING_DIR}" "${RUNTIME_ENV_ARGS[@]}" \ - -- python3 -m verl.experimental.fully_async_policy.fully_async_main \ - --config-name=swe_agent_blackbox_megatron_async \ - --config-path="$(pwd)/examples/swe_agent_blackbox/config" \ - hydra.searchpath=[pkg://verl.trainer.config] \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - data.train_files="['${TRAIN_DATA}']" \ - data.val_files="['${VAL_DATA}']" \ - data.max_prompt_length=${PROMPT_LENGTH} \ - data.max_response_length=${RESPONSE_LENGTH} \ - actor_rollout_ref.rollout.n=${N} \ - actor_rollout_ref.rollout.name=${ENGINE} \ - actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \ - actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \ - actor_rollout_ref.rollout.max_model_len=${MAX_MODEL_LEN} \ - actor_rollout_ref.rollout.max_num_batched_tokens=${MAX_MODEL_LEN} \ - actor_rollout_ref.rollout.temperature=${TEMPERATURE} \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP} \ - actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \ - actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \ - actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \ - actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \ - "${RUNNER_ARGS[@]}" \ - actor_rollout_ref.actor.clip_ratio_low=${CLIP_RATIO_LOW} \ - actor_rollout_ref.actor.clip_ratio_high=${CLIP_RATIO_HIGH} \ - actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ACTOR_PPO_MAX_TOKEN_LEN} \ - actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \ - actor_rollout_ref.actor.optim.lr_decay_steps=${TOTAL_ROLLOUT_STEPS} \ - +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${OPTIMIZER_OFFLOAD_FRACTION} \ - +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \ - +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \ - +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \ - actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \ - actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \ - actor_rollout_ref.actor.megatron.optimizer_offload=${OFFLOAD} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \ - actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \ - actor_rollout_ref.ref.megatron.param_offload=${OFFLOAD} \ - actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TRAIN_TP} \ - actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ - actor_rollout_ref.ref.megatron.context_parallel_size=${TRAIN_CP} \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \ - trainer.project_name="${PROJECT_NAME}" \ - trainer.experiment_name="${EXPERIMENT_NAME}" \ - trainer.save_freq=${SAVE_FREQ} \ - trainer.test_freq=${TEST_FREQ} \ - trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.nnodes=${NNODES_TRAIN} \ - trainer.n_gpus_per_node=${NGPUS_PER_NODE} \ - rollout.nnodes=${NNODES_ROLLOUT} \ - rollout.n_gpus_per_node=${NGPUS_PER_NODE} \ - rollout.total_rollout_steps=${TOTAL_ROLLOUT_STEPS} \ - async_training.staleness_threshold=${STALENESS_THRESHOLD} \ - async_training.trigger_parameter_sync_step=${TRIGGER_SYNC_STEP} \ - async_training.partial_rollout=${PARTIAL_ROLLOUT} \ - "$@" diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh b/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh deleted file mode 100755 index 1a0c19d3..00000000 --- a/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env bash -# Megatron sync training for the blackbox SWE-agent recipe. -# -# Uses main_ppo_sync + Megatron backend with the same blackbox agent infrastructure -# (AgentFrameworkRolloutAdapter, subprocess_runner, SWEAgentFramework). -# -# Usage: -# bash examples/swe_agent_blackbox/scripts/run_train_megatron_sync.sh -# -# All configurable via environment variables (see defaults below). - -set -euo pipefail - -# ── Model & data ───────────────────────────────────────────────────────── -MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}" -TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_rebench_filtered.parquet}" -VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}" - -# ── Hardware ───────────────────────────────────────────────────────────── -NNODES="${NNODES:-1}" -NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}" - -# ── Training parameters ───────────────────────────────────────────────── -TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}" -PROMPT_LENGTH="${PROMPT_LENGTH:-4096}" -RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}" -ACTOR_LR="${ACTOR_LR:-1e-6}" -TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}" -SAVE_FREQ="${SAVE_FREQ:-10}" -TEST_FREQ="${TEST_FREQ:-10}" -PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}" - -# ── Rollout parameters ────────────────────────────────────────────────── -ENGINE="${ENGINE:-vllm}" -TP="${TP:-4}" -ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}" -N="${N:-8}" -TEMPERATURE="${TEMPERATURE:-1.0}" - -# ── Megatron parallelism ──────────────────────────────────────────────── -TRAIN_TP="${TRAIN_TP:-8}" -TRAIN_PP="${TRAIN_PP:-1}" -TRAIN_CP="${TRAIN_CP:-1}" -OFFLOAD="${OFFLOAD:-true}" -USE_MBRIDGE="${USE_MBRIDGE:-true}" - -# ── Agent parameters ───────────────────────────────────────────────────── -RUNNER="${RUNNER:-mini_swe}" -MAX_TURNS="${MAX_TURNS:-100}" -AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}" -COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}" -if [[ "${RUNNER}" == "claude_code" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner" - SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}" -elif [[ "${RUNNER}" == "mini_swe" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner" - SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}" -elif [[ "${RUNNER}" == "uniagent" ]]; then - AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner" - SWE_AGENT_TOOL_IMAGE="" -else - echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2 - exit 1 -fi -SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}" -RUNNER_ARGS=( - "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}" -) -if [[ "${RUNNER}" != "uniagent" ]]; then - RUNNER_ARGS+=( - "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}" - "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}" - ) -fi - -# ── Logging ────────────────────────────────────────────────────────────── -PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}" -EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}" -VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}" - -export SWE_AGENT_MAX_TURNS="${MAX_TURNS}" -export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}" -export VERL_LOGGING_LEVEL - -# ── Environment for NCCL ──────────────────────────────────────────────── -export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}" -export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}" - -echo "=== SWE-Agent Blackbox Megatron Sync Training ===" -echo "Model: ${MODEL_PATH}" -echo "Train data: ${TRAIN_DATA}" -echo "Val data: ${VAL_DATA}" -echo "Engine: ${ENGINE} (gen_tp=${TP}, train_tp=${TRAIN_TP})" -echo "Runner: ${RUNNER}" -echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}" -echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}" -echo "===============================================" - -python3 -m verl.trainer.main_ppo_sync \ - --config-name=swe_agent_blackbox_megatron_sync \ - --config-path="$(pwd)/examples/swe_agent_blackbox/config" \ - hydra.searchpath=[pkg://verl.trainer.config] \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - data.train_files="['${TRAIN_DATA}']" \ - data.val_files="['${VAL_DATA}']" \ - data.train_batch_size=${TRAIN_BATCH_SIZE} \ - data.max_prompt_length=${PROMPT_LENGTH} \ - data.max_response_length=${RESPONSE_LENGTH} \ - actor_rollout_ref.rollout.name=${ENGINE} \ - actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \ - actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \ - actor_rollout_ref.rollout.n=${N} \ - actor_rollout_ref.rollout.temperature=${TEMPERATURE} \ - actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \ - actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \ - actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH)) \ - actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \ - actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \ - actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \ - actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \ - actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \ - actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \ - actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \ - actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \ - actor_rollout_ref.rollout.nnodes=${NNODES} \ - actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \ - trainer.nnodes=${NNODES} \ - trainer.n_gpus_per_node=${NGPUS_PER_NODE} \ - trainer.total_epochs=${TOTAL_EPOCHS} \ - trainer.save_freq=${SAVE_FREQ} \ - trainer.test_freq=${TEST_FREQ} \ - trainer.project_name=${PROJECT_NAME} \ - trainer.experiment_name=${EXPERIMENT_NAME} \ - actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \ - actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \ - "${RUNNER_ARGS[@]}" \ - "$@" diff --git a/examples/data_preprocess/r2e_gym_subset_filtered.py b/examples/data_preprocess/r2e_gym_subset_filtered.py index c97afbdc..eeafbc0a 100644 --- a/examples/data_preprocess/r2e_gym_subset_filtered.py +++ b/examples/data_preprocess/r2e_gym_subset_filtered.py @@ -17,6 +17,13 @@ def get_image_name(dataset_id: str, instance_id: str) -> str: assert len(parts) == 2 instance_number = parts[1].lower() return PUB_VOLCES_IMG_URL_R2E.format(instance_number=instance_number) +elif impl == "openyuanrong": + + def get_image_name(dataset_id: str, instance_id: str) -> str: + parts = instance_id.split("__") + assert len(parts) == 2 + instance_number = parts[1].lower() + return f"swr.cn-east-3.myhuaweicloud.com/openyuanrong/r2e-gym-subset/{instance_number}:latest" else: raise ValueError(f"Invalid deployment implementation: {impl}") diff --git a/examples/data_preprocess/swe_bench_verified.py b/examples/data_preprocess/swe_bench_verified.py index 8f26ad16..3a56695e 100644 --- a/examples/data_preprocess/swe_bench_verified.py +++ b/examples/data_preprocess/swe_bench_verified.py @@ -18,6 +18,15 @@ def get_image_name(dataset_id: str, instance_id: str) -> str: project_name = parts[0].lower() instance_number = parts[1].lower() return f"swebench/sweb.eval.x86_64.{project_name}_1776_{instance_number}" +elif impl == "openyuanrong": + + def get_image_name(dataset_id: str, instance_id: str) -> str: + assert dataset_id == "swe-bench-verified" + parts = instance_id.split("__") + assert len(parts) == 2 + project_name = parts[0].lower() + instance_number = parts[1].lower() + return f"swr.cn-east-3.myhuaweicloud.com/openyuanrong/swe-bench-verified/sweb.eval.x86_64.{project_name}_1776_{instance_number}:v2" else: raise ValueError(f"Invalid deployment implementation: {impl}") diff --git a/examples/data_preprocess/swe_rebench.py b/examples/data_preprocess/swe_rebench.py index 3add8b28..1cb907df 100644 --- a/examples/data_preprocess/swe_rebench.py +++ b/examples/data_preprocess/swe_rebench.py @@ -17,6 +17,14 @@ def get_image_name(dataset_id, instance_id): project_name = parts[0].lower() instance_number = parts[1].lower() return f"swerebench/sweb.eval.x86_64.{project_name}_1776_{instance_number}" +elif impl == "openyuanrong": + + def get_image_name(dataset_id: str, instance_id: str) -> str: + parts = instance_id.split("__") + assert len(parts) == 2 + project_name = parts[0].lower() + instance_number = parts[1].lower() + return f"swr.cn-east-3.myhuaweicloud.com/openyuanrong/swe-rebench/{project_name}_1776_{instance_number}:latest" else: raise ValueError(f"Invalid deployment implementation: {impl}") diff --git a/uni_agent/gateway/session/codec.py b/uni_agent/gateway/session/codec.py index 747dbde8..b808183d 100644 --- a/uni_agent/gateway/session/codec.py +++ b/uni_agent/gateway/session/codec.py @@ -7,8 +7,8 @@ from uuid import uuid4 from verl.experimental.agent_loop.tool_parser import ToolParser -from verl.utils.chat_template import apply_chat_template as _apply_chat_template -from verl.utils.chat_template import initialize_system_prompt +from verl.utils.tokenizer.chat_template import apply_chat_template as _apply_chat_template +from verl.utils.tokenizer.chat_template import initialize_system_prompt from verl.utils.tokenizer import normalize_token_ids diff --git a/uni_agent/interaction/model.py b/uni_agent/interaction/model.py index bfa0651f..1184688f 100644 --- a/uni_agent/interaction/model.py +++ b/uni_agent/interaction/model.py @@ -138,7 +138,7 @@ async def query( return response_str, [], rollout_cache, generation_info async def _get_new_message_ids(self, new_messages: list[dict[str, Any]]) -> list[int]: - from verl.utils.chat_template import apply_chat_template + from verl.utils.tokenizer.chat_template import apply_chat_template from verl.utils.tokenizer import normalize_token_ids tokenized_prompt = await self.loop.run_in_executor( @@ -154,7 +154,7 @@ async def _get_new_message_ids(self, new_messages: list[dict[str, Any]]) -> list @cached_property def message_boundary_tokens(self) -> list[int]: - from verl.utils.chat_template import apply_chat_template + from verl.utils.tokenizer.chat_template import apply_chat_template from verl.utils.tokenizer import normalize_token_ids dummy_history = [ diff --git a/verl b/verl index 7aed6b23..6fef6a7a 160000 --- a/verl +++ b/verl @@ -1 +1 @@ -Subproject commit 7aed6b230776f963fa09509c10d9c3a767d1102c +Subproject commit 6fef6a7a699435cad84e8907e9121457e41eed04