diff --git a/.gitignore b/.gitignore index 2b914ad..7d3938d 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,4 @@ pyrightconfig.json # End of https://www.toptal.com/developers/gitignore/api/python swebench_repos/ +setup.txt \ No newline at end of file diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index ad55c20..0000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "software-agent-sdk"] - path = software-agent-sdk - url = https://github.com/OpenHands/software-agent-sdk.git -[submodule "prime-rl"] - path = prime-rl - url = https://github.com/PrimeIntellect-ai/prime-rl diff --git a/README_verifiers.md b/README_verifiers.md deleted file mode 100644 index b7c9771..0000000 --- a/README_verifiers.md +++ /dev/null @@ -1,30 +0,0 @@ -# Instructions for using the verifiers environment - -1. Install dependencies - -```bash -curl -LsSf https://astral.sh/uv/install.sh | sh -uv sync -``` - -2. Clone some repos from the SWE-bench dataset - -```bash -uv run scripts/clone_repos.py --output-dir ./swebench_repos --dataset princeton-nlp/SWE-bench_Lite --max-workers 10 -``` - -3. Run `vllm` and serve `Qwen3-8B` -```bash -vllm serve Qwen/Qwen3-8B --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1 -``` - -4. Install [ripgrep](https://github.com/BurntSushi/ripgrep?tab=readme-ov-file#installation) -```bash -sudo apt-get install ripgrep -y -``` - -5. 
Run the verifiers eval with your model of choice - -```bash -uv run vf-eval swe-grep-oss-env --api-base-url http://localhost:8000/v1 --model "Qwen/Qwen3-8B" --num-examples 1 --rollouts-per-example 1 -``` diff --git a/configs/reward_config_1.7b.yaml b/configs/reward_config_1.7b.yaml new file mode 100644 index 0000000..feced9a --- /dev/null +++ b/configs/reward_config_1.7b.yaml @@ -0,0 +1,3 @@ +reward: + - fn: multilevel_localization_f1_reward + weight: 1.0 \ No newline at end of file diff --git a/configs/reward_config_14b.yaml b/configs/reward_config_14b.yaml new file mode 100644 index 0000000..288be5c --- /dev/null +++ b/configs/reward_config_14b.yaml @@ -0,0 +1,8 @@ +reward: + - fn: multilevel_localization_f1_reward + weight: 1.0 + - fn: multiturn_reward + args: + maximal_turns: 4 + minimal_turns: 4 + weight: 1.0 \ No newline at end of file diff --git a/configs/reward_config_4b.yaml b/configs/reward_config_4b.yaml new file mode 100644 index 0000000..feced9a --- /dev/null +++ b/configs/reward_config_4b.yaml @@ -0,0 +1,3 @@ +reward: + - fn: multilevel_localization_f1_reward + weight: 1.0 \ No newline at end of file diff --git a/configs/rewards/cosine.yaml b/configs/rewards/cosine.yaml deleted file mode 100644 index a8fbae7..0000000 --- a/configs/rewards/cosine.yaml +++ /dev/null @@ -1,2 +0,0 @@ -reward: - - fn: cosine_reward \ No newline at end of file diff --git a/configs/rewards/cosine_file_only.yaml b/configs/rewards/cosine_file_only.yaml deleted file mode 100644 index 658062a..0000000 --- a/configs/rewards/cosine_file_only.yaml +++ /dev/null @@ -1,7 +0,0 @@ -reward: - - fn: cosine_reward - args: - loc_threshold: 0.5 - file_level_weight: 1.0 - module_level_weight: 0.0 - entity_level_weight: 0.0 \ No newline at end of file diff --git a/configs/rewards/file_loc.yaml b/configs/rewards/file_loc.yaml deleted file mode 100644 index dfc07a9..0000000 --- a/configs/rewards/file_loc.yaml +++ /dev/null @@ -1,3 +0,0 @@ -reward: - - fn: file_localization_f1_reward - - fn: 
tool_use_reward \ No newline at end of file diff --git a/configs/rewards/scaled_f1.yaml b/configs/rewards/scaled_f1.yaml deleted file mode 100644 index 1a8dd59..0000000 --- a/configs/rewards/scaled_f1.yaml +++ /dev/null @@ -1,2 +0,0 @@ -reward: - - fn: scaled_f1_reward diff --git a/configs/rewards/tool_use.yaml b/configs/rewards/tool_use.yaml deleted file mode 100644 index 0c97d9d..0000000 --- a/configs/rewards/tool_use.yaml +++ /dev/null @@ -1,4 +0,0 @@ -reward: - - fn: tool_use_reward - - fn: turn_efficiency - - fn: multilevel_localization_f1_reward \ No newline at end of file diff --git a/configs/rewards/tool_use_and_turn_cosine_file_only.yaml b/configs/rewards/tool_use_and_turn_cosine_file_only.yaml deleted file mode 100644 index e528ad8..0000000 --- a/configs/rewards/tool_use_and_turn_cosine_file_only.yaml +++ /dev/null @@ -1,8 +0,0 @@ -reward: - - fn: turn_cosine_reward - args: - loc_threshold: 0.5 - file_level_weight: 1.0 - module_level_weight: 0.0 - entity_level_weight: 0.0 - - fn: tool_use_reward \ No newline at end of file diff --git a/configs/rewards/weighted_f1.yaml b/configs/rewards/weighted_f1.yaml deleted file mode 100644 index eac2352..0000000 --- a/configs/rewards/weighted_f1.yaml +++ /dev/null @@ -1,6 +0,0 @@ -reward: - - fn: multilevel_localization_f1_reward - args: - file_level_weight: 4.0 - module_level_weight: 2.0 - entity_level_weight: 1.0 \ No newline at end of file diff --git a/configs/skyrl-experiments/README.md b/configs/skyrl-experiments/README.md deleted file mode 100644 index df63aec..0000000 --- a/configs/skyrl-experiments/README.md +++ /dev/null @@ -1,288 +0,0 @@ -# SkyRL Experiment Configuration Guide - -This directory contains experiment configuration files for training agents with SkyRL. Each YAML file defines an experiment setup with specific tools, rewards, and prompts. 
- -## Usage - -```bash -DATA_PATH= - -bash scripts/run_async_training.sh \ - -m Qwen/Qwen3-4B \ - -o "+generator.exp_config=configs/skyrl-experiments/read-only.yaml" \ - -d $DATA_PATH \ - 2>&1 | tee training.log -``` - -## Configuration File Structure - -Each experiment config file follows this structure: - -```yaml -name: "experiment_name" -description: "Brief description of the experiment" - -reward: - - fn: reward_function_1 - - fn: reward_function_2 - -tools: - - tool_name_1 - - tool_name_2 - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/user_prompt.j2" -``` - -### Fields - -#### `name` (optional) -- **Type**: String -- **Description**: A unique identifier for the experiment -- **Example**: `"read_only_tools"` - -#### `description` (optional) -- **Type**: String -- **Description**: A human-readable description of what the experiment tests -- **Example**: `"The agent only has access to read only tools"` - -#### `reward` (required) -- **Type**: List of reward function specifications -- **Description**: Defines the reward functions used to evaluate agent performance during training -- **Format**: Each item should have a `fn` key with the reward function name -- **Example**: - ```yaml - reward: - - fn: tool_use_reward - - fn: turn_efficiency - ``` - -#### `tools` (required) -- **Type**: List of tool names -- **Description**: Specifies which tools the agent has access to during the experiment -- **Options**: Can be default OpenHands tools, custom tools, or toolsets -- **Example**: - ```yaml - tools: - - terminal - - grep - - glob - ``` - -#### `prompts` (required) -- **Type**: Object with `system_prompt` and `user_prompt` keys -- **Description**: Specifies the Jinja2 template files for system and user prompts -- **Location**: Templates should be placed in `src/prompts/templates/` -- **Format**: Paths are relative to `src/prompts/` -- **Example**: - ```yaml - prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: 
"templates/file_localization.j2" - ``` - -## Default OpenHands Tools - -The following tools are built into OpenHands and can be used directly in your config: - -- `apply_patch` - Apply code patches to files -- `browser_use` - Interact with web browsers -- `delegate` - Delegate tasks to sub-agents -- `file_editor` - Edit files with various operations -- `glob` - Search for files by name patterns -- `grep` - Search file contents using regex -- `planning_file_editor` - File editor with planning capabilities -- `preset` - Use predefined tool presets -- `task_tracker` - Track and manage tasks -- `terminal` - Execute shell commands -- `tom_consult` - Consult theory of mind models - -## Registering Custom Tools - -To create and register a custom tool: - -### 1. Create a Tool File - -Create a new Python file in `src/tools/` (e.g., `src/tools/my_custom_tool.py`): - -```python -from src.tools import tool -from pydantic import Field -from collections.abc import Sequence -from openhands.sdk import ( - Action, - Observation, - TextContent, - ToolDefinition, -) -from openhands.sdk.tool import ToolExecutor - -# Define your Action class -class MyCustomAction(Action): - param1: str = Field(description="Description of parameter") - param2: int = Field(default=10, description="Optional parameter") - -# Define your Observation class -class MyCustomObservation(Observation): - result: str = "" - - @property - def to_llm_content(self) -> Sequence[TextContent]: - return [TextContent(text=self.result)] - -# Define your Executor -class MyCustomExecutor(ToolExecutor[MyCustomAction, MyCustomObservation]): - def __call__(self, action: MyCustomAction, conversation=None) -> MyCustomObservation: - # Implement your tool logic here - result = f"Processed {action.param1} with {action.param2}" - return MyCustomObservation(result=result) - -# Define your Tool -class MyCustomTool(ToolDefinition[MyCustomAction, MyCustomObservation]): - @classmethod - def create(cls, conv_state) -> 
Sequence[ToolDefinition]: - executor = MyCustomExecutor() - return [ - cls( - description="Description of what your tool does", - action_type=MyCustomAction, - observation_type=MyCustomObservation, - executor=executor, - ) - ] - -# Register the tool -@tool(name="my_custom_tool") -def _make_my_custom_tool(conv_state) -> list[ToolDefinition]: - return MyCustomTool.create(conv_state) -``` - -### 2. Use the Tool in Your Config - -Once registered, simply add the tool name to your experiment config: - -```yaml -tools: - - my_custom_tool - - terminal -``` - -### Creating Toolsets - -You can also create toolsets that bundle multiple tools together (see `bash_and_grep_toolset` in `src/tools/example_custom_tool.py`): - -```python -@tool(name="my_toolset") -def _make_my_toolset(conv_state) -> list[ToolDefinition]: - """Create multiple tools that share resources.""" - terminal_executor = TerminalExecutor(working_dir=conv_state.workspace.working_dir) - - tool1 = Tool1.create(conv_state, executor=terminal_executor)[0] - tool2 = Tool2.create(conv_state, executor=terminal_executor)[0] - - return [tool1, tool2] -``` - -## System and User Prompts - -Prompts are defined using Jinja2 templates and should be placed in `src/prompts/templates/`. - -### Available Template Files - -- `system_prompt.j2` - Default system prompt -- `file_localization.j2` - User prompt for file localization tasks -- `file_module.j2` - User prompt for file/module tasks -- `file_module_parallel_tools.j2` - User prompt with parallel tool usage -- `system_message_search.j2` - System prompt for search tasks -- `default.j2` - Default user prompt - -### Creating Custom Prompts - -1. Create a new Jinja2 template file in `src/prompts/templates/`: - -```jinja2 -{# templates/my_custom_prompt.j2 #} -You are an AI assistant specialized in {{ task_type }}. - -Your goal is to: {{ goal }} - -Available tools: -{% for tool in tools %} -- {{ tool }} -{% endfor %} - -Please proceed with the task. -``` - -2. 
Reference it in your experiment config: - -```yaml -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/my_custom_prompt.j2" -``` - -### Template Variables - -Templates have access to various context variables provided by the training system, including: -- `task_type` - The type of task being performed -- `goal` - The specific goal for the episode -- `tools` - List of available tools -- `workspace` - Workspace information -- And other context-specific variables - -## Example Configurations - -### Example 1: Read-Only Tools -```yaml -name: "read_only_tools" -description: "The agent only has access to read only tools" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" -``` - -### Example 2: Terminal Only -```yaml -name: "terminal_tool_only" -description: "The agent only has access to the terminal tool" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" -``` - -### Example 3: Custom Toolset -```yaml -name: "bash_and_grep" -description: "Agent uses bash and grep toolset with shared executor" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - bash_and_grep_toolset - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" -``` diff --git a/configs/skyrl-experiments/multilevel_f05_minimum.yaml b/configs/skyrl-experiments/multilevel_f05_minimum.yaml deleted file mode 100644 index 393b64b..0000000 --- a/configs/skyrl-experiments/multilevel_f05_minimum.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - weight: 1.0 - args: - beta: 0.5 - - fn: format_reward - weight: 1.0 - args: - 
penalize: false - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_module.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/no_think_multilevel_f1.yaml b/configs/skyrl-experiments/no_think_multilevel_f1.yaml deleted file mode 100644 index 21a6d05..0000000 --- a/configs/skyrl-experiments/no_think_multilevel_f1.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - - fn: format_reward - - fn: tool_use_reward - args: - penalize: true - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt_short.j2" - user_prompt: "templates/file_module_no_think.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/no_think_weighted_f1.yaml b/configs/skyrl-experiments/no_think_weighted_f1.yaml deleted file mode 100644 index 60a54a5..0000000 --- a/configs/skyrl-experiments/no_think_weighted_f1.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: "Weighted F1" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - args: - file_level_weight: 1.0 - module_level_weight: 0.5 - entity_level_weight: 0.25 - - fn: format_reward - - fn: tool_use_reward - args: - penalize: true - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt_short.j2" - user_prompt: "templates/file_module_no_think.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/read-only.yaml b/configs/skyrl-experiments/read-only.yaml deleted file mode 100644 index 4d5525a..0000000 --- a/configs/skyrl-experiments/read-only.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: "read_only_tools" -description: "The agent only has access to read only tools" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: 
"templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/terminal.yaml b/configs/skyrl-experiments/terminal.yaml deleted file mode 100644 index 6ad62c6..0000000 --- a/configs/skyrl-experiments/terminal.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: "terminal_tool_only" -description: "The agent only has access to the terminal tool" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/test.yaml b/configs/skyrl-experiments/test.yaml deleted file mode 100644 index ce042b5..0000000 --- a/configs/skyrl-experiments/test.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: "read_only_tools" -description: "The agent only has access to read only tools" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - bash_and_grep_toolset - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/think_multilevel_f1.yaml b/configs/skyrl-experiments/think_multilevel_f1.yaml deleted file mode 100644 index b558d80..0000000 --- a/configs/skyrl-experiments/think_multilevel_f1.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - - fn: format_reward - - fn: tool_use_reward - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt_short.j2" - user_prompt: "templates/file_module_think.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/weighted_multilevel_f05.yaml b/configs/skyrl-experiments/weighted_multilevel_f05.yaml deleted file mode 100644 index a0d7752..0000000 --- a/configs/skyrl-experiments/weighted_multilevel_f05.yaml 
+++ /dev/null @@ -1,28 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - weight: 1.0 - args: - beta: 0.5 - - fn: tool_use_reward - weight: 1.0 - args: - clamp: false - penalize: false - max_tool_use: 5 - - fn: multiturn_reward - weight: 4.0 - args: - minimal_turns: 4 - maximal_turns: 4 - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_module.j2" \ No newline at end of file diff --git a/configs/swe-grep-oss/rl/infer.toml b/configs/swe-grep-oss/rl/infer.toml deleted file mode 100644 index cebaf69..0000000 --- a/configs/swe-grep-oss/rl/infer.toml +++ /dev/null @@ -1,8 +0,0 @@ -gpu_memory_utilization = 0.7 - -[model] -name = "willcb/Qwen3-8B" -enforce_eager = true -enable_auto_tool_choice = true -tool_call_parser = "hermes" - diff --git a/configs/swe-grep-oss/rl/orch.toml b/configs/swe-grep-oss/rl/orch.toml deleted file mode 100644 index 45c2042..0000000 --- a/configs/swe-grep-oss/rl/orch.toml +++ /dev/null @@ -1,17 +0,0 @@ -batch_size = 64 -seq_len = 8000 -rollouts_per_example = 4 -max_steps = 150 -mask_truncated_completions = false - -[wandb] -project = "swe-grep-oss" - -[model] -name = "willcb/Qwen3-8B" - -[[env]] -id = "swe-grep-oss-env" - -[ckpt] -interval = 10 diff --git a/configs/swe-grep-oss/rl/train.toml b/configs/swe-grep-oss/rl/train.toml deleted file mode 100644 index ffc5977..0000000 --- a/configs/swe-grep-oss/rl/train.toml +++ /dev/null @@ -1,28 +0,0 @@ -max_steps = 150 - -[model] -name = "willcb/Qwen3-8B" - -[model.ac] -freq = 1 - -[model.experimental.lora] -rank = 64 -alpha = 512 -dropout = 0.0 -target_modules = [ - "q_proj", # Attention: Query projection - "k_proj", # Attention: Key projection - "v_proj", # Attention: Value projection - "o_proj", # Attention: Output projection - "gate_proj", # MLP: Gating projection - "up_proj", # MLP: Up projection - "down_proj" # MLP: Down projection 
-] -modules_to_save = [] - -[optim] -lr = 1e-5 - -[ckpt] -interval = 10 \ No newline at end of file diff --git a/data/adityasoni17__SWE-smith-py-code-search_train/train.parquet b/data/adityasoni17__SWE-smith-py-code-search_train/train.parquet new file mode 100644 index 0000000..49daa50 Binary files /dev/null and b/data/adityasoni17__SWE-smith-py-code-search_train/train.parquet differ diff --git a/data/adityasoni17__SWE-smith-py-code-search_train/validation.parquet b/data/adityasoni17__SWE-smith-py-code-search_train/validation.parquet new file mode 100644 index 0000000..c3d2394 Binary files /dev/null and b/data/adityasoni17__SWE-smith-py-code-search_train/validation.parquet differ diff --git a/data/swe_gym/train.parquet b/data/qwen3_1.7b_data/train.parquet similarity index 56% rename from data/swe_gym/train.parquet rename to data/qwen3_1.7b_data/train.parquet index 09576bd..d7e8ce7 100644 Binary files a/data/swe_gym/train.parquet and b/data/qwen3_1.7b_data/train.parquet differ diff --git a/data/qwen3_1.7b_data/validation.parquet b/data/qwen3_1.7b_data/validation.parquet new file mode 100644 index 0000000..f82fcbe Binary files /dev/null and b/data/qwen3_1.7b_data/validation.parquet differ diff --git a/data/swe_gym/validation.parquet b/data/swe_gym/validation.parquet deleted file mode 100644 index d085f63..0000000 Binary files a/data/swe_gym/validation.parquet and /dev/null differ diff --git a/prime-rl b/prime-rl deleted file mode 160000 index 6b01ba5..0000000 --- a/prime-rl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6b01ba5ae7f215aa1f869dfac30f5df5d587ee94 diff --git a/pyproject.toml b/pyproject.toml index cce1f28..c2f64b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,21 +78,13 @@ flash-attn = ["torch"] [tool.uv.sources] skyrl-train = { git = "https://github.com/adityasoni9998/SkyRL.git", rev = "81e5a97c7430503c0c4e6508497cc5aa01a0c624", subdirectory = "skyrl-train" } flash-attn = {url = 
"https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp313-cp313-linux_x86_64.whl"} -openhands-sdk = { workspace = true } -openhands-tools = { workspace = true } -openhands-workspace = { workspace = true } -openhands-agent-server = { workspace = true } +openhands-sdk = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-sdk" } +openhands-tools = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-tools" } +openhands-workspace = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-workspace" } +openhands-agent-server = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-agent-server" } torch = { index = "pytorch-cu128" } torchvision = { index = "pytorch-cu128" } flashinfer-jit-cache = { index = "flashinfer-cu128" } # flashinfer-python = [ # { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl" } # ] - -[tool.uv.workspace] -members = [ - "software-agent-sdk/openhands-sdk", - "software-agent-sdk/openhands-tools", - "software-agent-sdk/openhands-workspace", - "software-agent-sdk/openhands-agent-server", -] diff --git a/scripts/clone_repos.py b/scripts/clone_repos.py deleted file mode 100644 index bc70173..0000000 --- a/scripts/clone_repos.py +++ /dev/null @@ -1,199 +0,0 @@ -import argparse -import subprocess -from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path - -from datasets import load_dataset -from tqdm import tqdm - - -def clone_instance( - repo_name: str, commit_id: str, instance_id: str, output_dir: Path -) -> bool: - """ - Clone a repository at a 
specific commit into a separate directory. - - Args: - repo_name: Repository name in format 'owner/repo' - commit_id: Commit hash to checkout - instance_id: Instance ID for directory naming - output_dir: Base output directory - - Returns: - True if successful, False otherwise - """ - # Create instance directory name: repo_instance-id - # E.g., astropy_astropy-12907 - instance_dir_name = f"{repo_name.replace('/', '_')}_{instance_id}" - instance_path = output_dir / instance_dir_name - - # Skip if already exists - if instance_path.exists(): - return True - - try: - # Clone the repository - subprocess.run( - [ - "git", - "clone", - f"https://github.com/{repo_name}.git", - str(instance_path), - ], - check=True, - capture_output=True, - text=True, - ) - - # Checkout the specific commit - subprocess.run( - ["git", "-C", str(instance_path), "checkout", commit_id], - check=True, - capture_output=True, - text=True, - ) - - return True - except subprocess.CalledProcessError as e: - return False - - -def main(): - parser = argparse.ArgumentParser( - description="Clone repositories from SWE-bench dataset" - ) - parser.add_argument( - "--output-dir", - type=str, - default="./swebench_repos", - help="Directory to clone repositories into (default: ./swebench_repos)", - ) - parser.add_argument( - "--dataset", - type=str, - default="princeton-nlp/SWE-bench_Lite", - help="SWE-bench dataset to use (default: princeton-nlp/SWE-bench_Lite)", - ) - parser.add_argument( - "--max-instances", - type=int, - default=None, - help="Maximum number of instances to process (for testing)", - ) - parser.add_argument( - "--max-repos", - type=int, - default=None, - help="Maximum number of repositories to clone (for testing)", - ) - parser.add_argument( - "--show-fields", - action="store_true", - help="Show available fields in the dataset and exit", - ) - parser.add_argument( - "--max-workers", - type=int, - default=4, - help="Maximum number of concurrent clone operations (default: 4)", - ) - - args = 
parser.parse_args() - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - print(f"Loading SWE-bench dataset: {args.dataset}") - dataset = load_dataset(args.dataset, split="test") - print(f"✓ Loaded {len(dataset)} instances") - - # Show available fields if requested - if args.show_fields: - print("\n" + "=" * 80) - print("Available fields in dataset:") - print("=" * 80) - if len(dataset) > 0: - first_instance = dataset[0] - for key in sorted(first_instance.keys()): - value = first_instance[key] - # Truncate long values - value_str = str(value) - if len(value_str) > 100: - value_str = value_str[:100] + "..." - print(f"{key:25s}: {value_str}") - print("=" * 80) - return - - # Collect all instances to process - instances_to_process = [] - for instance in dataset: - instances_to_process.append( - { - "repo": instance["repo"], - "instance_id": instance["instance_id"], - "base_commit": instance["base_commit"], - } - ) - - # Apply max-repos filter - if args.max_repos: - # Group by repo and take first N repos - repos_seen = set() - filtered_instances = [] - for instance in instances_to_process: - if instance["repo"] not in repos_seen: - if len(repos_seen) >= args.max_repos: - continue - repos_seen.add(instance["repo"]) - if instance["repo"] in repos_seen: - filtered_instances.append(instance) - instances_to_process = filtered_instances - print(f"\n(Limited to {args.max_repos} repositories)") - - # Apply max-instances filter - if args.max_instances: - instances_to_process = instances_to_process[: args.max_instances] - print(f"(Limited to {args.max_instances} instances)") - - print(f"\nProcessing {len(instances_to_process)} instances") - print(f"Using {args.max_workers} concurrent workers") - print("=" * 80) - - # Clone each instance concurrently - successful = 0 - with ThreadPoolExecutor(max_workers=args.max_workers) as executor: - # Submit all tasks - future_to_instance = { - executor.submit( - clone_instance, - instance["repo"], - 
instance["base_commit"], - instance["instance_id"], - output_dir, - ): instance - for instance in instances_to_process - } - - # Process completed tasks with progress bar - for future in tqdm( - as_completed(future_to_instance), - total=len(instances_to_process), - desc="Cloning instances", - ): - if future.result(): - successful += 1 - - print("\n" + "=" * 80) - print("Summary:") - print("=" * 80) - print(f"Output directory: {output_dir.absolute()}") - total = len(instances_to_process) - print(f"Successfully cloned: {successful}/{total} instances") - print( - "Note: Each instance is in its own directory named _" - ) - print("\nDone! 🎉") - - -if __name__ == "__main__": - main() diff --git a/scripts/run_async_training_1.7b.sh b/scripts/run_async_training_1.7b.sh new file mode 100644 index 0000000..ee30bac --- /dev/null +++ b/scripts/run_async_training_1.7b.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +while getopts ":m:n:d:s:l:o:i:t:b:c:r:w:" opt; do + case ${opt} in + m ) MODEL=$OPTARG;; + n ) N_ROLLOUTS=$OPTARG;; + d ) DATA_PATH=$OPTARG;; + s ) CKPT_PATH=$OPTARG;; + l ) LCAL_PATH=$OPTARG;; + o ) OTHER_OPTION=$OPTARG;; + i ) NUM_INFERENCE_ENGINES=$OPTARG;; + t ) NUM_TRAINING_ENGINES=$OPTARG;; + b ) BATCH_SIZE=$OPTARG;; + c ) MICRO_BATCH_SIZE=$OPTARG;; + r ) RUN_NAME=$OPTARG;; + w ) STEP_WISE=$OPTARG;; + # \? 
) echo "Usage: cmd [-u] [-p]";; + esac +done + +MODEL_ALIAS=$(echo $MODEL | sed 's/\//-/g') +# Get number of GPUs available +NUM_GPUS=$(nvidia-smi -L | wc -l) +N_ROLLOUTS="${N_ROLLOUTS:-8}" +BATCH_SIZE="${BATCH_SIZE:-8}" +MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:-1}" +MAX_LENGTH=8192 +RUN_NAME="${RUN_NAME:-${MODEL_ALIAS}-${BATCH_SIZE}x${N_ROLLOUTS}}" +set -x + +DATA_PATH="${DATA_PATH:-data/swe_smith}" +CKPT_PATH="${CKPT_PATH:-$(pwd)/ckpts/${MODEL_ALIAS}}" +# If LCAL_PATH is not set, use CKPT_PATH +LCAL_PATH="${LCAL_PATH:-$CKPT_PATH}" +mkdir -p $CKPT_PATH + +HALF_NUM_GPUS=$((NUM_GPUS / 2)) +NUM_INFERENCE_ENGINES="${NUM_INFERENCE_ENGINES:-$HALF_NUM_GPUS}" +NUM_TRAINING_ENGINES="${NUM_TRAINING_ENGINES:-$HALF_NUM_GPUS}" +STEP_WISE="${STEP_WISE:-false}" + +export VLLM_FLASH_ATTN_VERSION=2 +export CUDA_LAUNCH_BLOCKING=1 +export TORCH_USE_CUDA_DSA=1 +export RAY_worker_register_timeout_seconds=600 +uv run --isolated -m src.train \ + +run_async_trainer=true \ + data.train_data="['$DATA_PATH/train.parquet']" \ + data.val_data="['$DATA_PATH/validation.parquet']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.grpo_norm_by_std=false \ + trainer.policy.model.path=${MODEL} \ + trainer.placement.colocate_all=false \ + trainer.placement.colocate_policy_ref=true \ + trainer.strategy=fsdp2 \ + trainer.policy.fsdp_config.cpu_offload=true \ + trainer.policy.fsdp_config.reshard_after_forward=true \ + trainer.policy.fsdp_config.fsdp_size=-1 \ + trainer.fully_async.num_parallel_generation_workers=${BATCH_SIZE} \ + trainer.placement.policy_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.ref_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.policy_num_nodes=1 \ + trainer.placement.ref_num_nodes=1 \ + trainer.policy.sequence_parallel_size=1 \ + generator.num_inference_engines=${NUM_INFERENCE_ENGINES} \ + generator.inference_engine_tensor_parallel_size=1 \ + +generator.traj_dir=${CKPT_PATH}trajectories/ \ + 
+generator.engine_init_kwargs.enable_auto_tool_choice=true \ + +generator.engine_init_kwargs.tool_call_parser="hermes" \ + +generator.engine_init_kwargs.max_model_len=32768 \ + +generator.prompts.system_prompt="templates/system_prompt_custom_finish.j2" \ + +generator.prompts.user_prompt="templates/file_module_custom_finish.j2" \ + +generator.engine_init_kwargs.disable_cascade_attn=true \ + trainer.epochs=1 \ + trainer.eval_batch_size=100 \ + trainer.eval_before_train=false \ + trainer.eval_interval=-1 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=${BATCH_SIZE} \ + trainer.policy_mini_batch_size=${BATCH_SIZE} \ + trainer.micro_forward_batch_size_per_gpu=1 \ + trainer.micro_train_batch_size_per_gpu=${MICRO_BATCH_SIZE} \ + trainer.dump_data_batch=true \ + trainer.export_path="${CKPT_PATH}exported_model/" \ + trainer.hf_save_interval=50 \ + trainer.ckpt_interval=10 \ + trainer.use_sample_packing=false \ + trainer.max_prompt_length=32768 \ + trainer.algorithm.policy_loss_type="gspo" \ + trainer.algorithm.eps_clip_low=0.0003 \ + trainer.algorithm.eps_clip_high=0.0004 \ + trainer.algorithm.loss_reduction="sequence_mean" \ + generator.sampling_params.max_generate_length=${MAX_LENGTH} \ + generator.sampling_params.temperature=1.0 \ + generator.max_input_length=32768 \ + generator.max_num_batched_tokens=131072 \ + generator.max_turns=6 \ + trainer.policy.optimizer_config.lr=1.0e-6 \ + trainer.algorithm.use_kl_loss=False \ + trainer.algorithm.use_kl_in_reward=False \ + generator.backend=vllm \ + generator.run_engines_locally=True \ + generator.enable_http_endpoint=True \ + generator.http_endpoint_host='0.0.0.0' \ + generator.http_endpoint_port=8080 \ + generator.weight_sync_backend=nccl \ + generator.async_engine=true \ + generator.batched=false \ + generator.n_samples_per_prompt=${N_ROLLOUTS} \ + generator.gpu_memory_utilization=0.8 \ + generator.enforce_eager=false \ + trainer.step_wise_training=${STEP_WISE} \ + trainer.logger="wandb" \ + 
trainer.project_name="code_search" \ + trainer.run_name=${RUN_NAME} \ + trainer.resume_mode=latest \ + trainer.ckpt_path="$LCAL_PATH" \ + trainer.max_ckpts_to_keep=5 \ + $OTHER_OPTION \ No newline at end of file diff --git a/scripts/run_async_training.sh b/scripts/run_async_training_14B.sh similarity index 98% rename from scripts/run_async_training.sh rename to scripts/run_async_training_14B.sh index 3c5d625..2c11225 100644 --- a/scripts/run_async_training.sh +++ b/scripts/run_async_training_14B.sh @@ -64,6 +64,7 @@ STEP_WISE="${STEP_WISE:-false}" export VLLM_FLASH_ATTN_VERSION=2 export CUDA_LAUNCH_BLOCKING=1 export TORCH_USE_CUDA_DSA=1 +export RAY_worker_register_timeout_seconds=600 uv run --isolated -m src.train \ +run_async_trainer=true \ @@ -97,7 +98,7 @@ uv run --isolated -m src.train \ generator.eval_n_samples_per_prompt=1 \ trainer.epochs=10 \ trainer.eval_batch_size=32 \ - trainer.eval_before_train=true \ + trainer.eval_before_train=false \ trainer.eval_interval=-1 \ trainer.update_epochs_per_batch=1 \ trainer.train_batch_size=${BATCH_SIZE} \ diff --git a/scripts/run_async_training_4b.sh b/scripts/run_async_training_4b.sh new file mode 100644 index 0000000..452faeb --- /dev/null +++ b/scripts/run_async_training_4b.sh @@ -0,0 +1,129 @@ +#!/bin/bash + + +# export REWARD=file_loc +# sbatch scripts/run_async_training.sh \ +# -m Qwen/Qwen3-8B -n 8 -b 1 -i 4 -t 4 \ +# -d data/swe_gym \ +# -s /project/flame/lsutawik/cso/ckpts/qwen3-8b-8x8-${REWARD}/ \ +# -o "+generator.reward=configs/rewards/${REWARD}.yaml" + +# . .env + +while getopts ":m:n:d:s:l:o:i:t:b:c:r:w:" opt; do + case ${opt} in + m ) MODEL=$OPTARG;; + n ) N_ROLLOUTS=$OPTARG;; + d ) DATA_PATH=$OPTARG;; + s ) CKPT_PATH=$OPTARG;; + l ) LCAL_PATH=$OPTARG;; + o ) OTHER_OPTION=$OPTARG;; + i ) NUM_INFERENCE_ENGINES=$OPTARG;; + t ) NUM_TRAINING_ENGINES=$OPTARG;; + b ) BATCH_SIZE=$OPTARG;; + c ) MICRO_BATCH_SIZE=$OPTARG;; + r ) RUN_NAME=$OPTARG;; + w ) STEP_WISE=$OPTARG;; + # \? 
) echo "Usage: cmd [-u] [-p]";; + esac +done + +MODEL_ALIAS=$(echo $MODEL | sed 's/\//-/g') +# Get number of GPUs available +NUM_GPUS=$(nvidia-smi -L | wc -l) +N_ROLLOUTS="${N_ROLLOUTS:-8}" +BATCH_SIZE="${BATCH_SIZE:-8}" +MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:-1}" +MAX_LENGTH=8192 +RUN_NAME="${RUN_NAME:-${MODEL_ALIAS}-${BATCH_SIZE}x${N_ROLLOUTS}}" +set -x + +DATA_PATH="${DATA_PATH:-data/swe_smith}" +CKPT_PATH="${CKPT_PATH:-$(pwd)/ckpts/${MODEL_ALIAS}}" +# If LCAL_PATH is not set, use CKPT_PATH +LCAL_PATH="${LCAL_PATH:-$CKPT_PATH}" +mkdir -p $CKPT_PATH + +HALF_NUM_GPUS=$((NUM_GPUS / 2)) +NUM_INFERENCE_ENGINES="${NUM_INFERENCE_ENGINES:-$HALF_NUM_GPUS}" +NUM_TRAINING_ENGINES="${NUM_TRAINING_ENGINES:-$HALF_NUM_GPUS}" +STEP_WISE="${STEP_WISE:-false}" + +export VLLM_FLASH_ATTN_VERSION=2 +export CUDA_LAUNCH_BLOCKING=1 +export TORCH_USE_CUDA_DSA=1 +export RAY_worker_register_timeout_seconds=600 +uv run --isolated --active -m src.train \ + +run_async_trainer=true \ + data.train_data="['$DATA_PATH/train.parquet']" \ + data.val_data="['$DATA_PATH/validation.parquet']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.grpo_norm_by_std=false \ + trainer.policy.model.path=${MODEL} \ + trainer.placement.colocate_all=false \ + trainer.placement.colocate_policy_ref=true \ + trainer.strategy=fsdp2 \ + trainer.policy.fsdp_config.cpu_offload=true \ + trainer.policy.fsdp_config.reshard_after_forward=true \ + trainer.policy.fsdp_config.fsdp_size=-1 \ + trainer.fully_async.num_parallel_generation_workers=${BATCH_SIZE} \ + trainer.placement.policy_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.ref_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.policy_num_nodes=1 \ + trainer.placement.ref_num_nodes=1 \ + trainer.policy.sequence_parallel_size=1 \ + generator.num_inference_engines=${NUM_INFERENCE_ENGINES} \ + generator.inference_engine_tensor_parallel_size=1 \ + +generator.traj_dir=${CKPT_PATH}trajectories/ \ + 
+generator.engine_init_kwargs.enable_auto_tool_choice=true \ + +generator.engine_init_kwargs.tool_call_parser="hermes" \ + +generator.engine_init_kwargs.max_model_len=40960 \ + +generator.prompts.system_prompt="templates/system_prompt_custom_finish.j2" \ + +generator.prompts.user_prompt="templates/file_module_custom_finish.j2" \ + +generator.engine_init_kwargs.disable_cascade_attn=true \ + trainer.epochs=1 \ + trainer.eval_batch_size=100 \ + trainer.eval_before_train=false \ + trainer.eval_interval=-1 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=${BATCH_SIZE} \ + trainer.policy_mini_batch_size=${BATCH_SIZE} \ + trainer.micro_forward_batch_size_per_gpu=1 \ + trainer.micro_train_batch_size_per_gpu=${MICRO_BATCH_SIZE} \ + trainer.dump_data_batch=true \ + trainer.export_path="${CKPT_PATH}exported_model/" \ + trainer.hf_save_interval=50 \ + trainer.ckpt_interval=10 \ + trainer.use_sample_packing=false \ + trainer.max_prompt_length=40960 \ + trainer.algorithm.policy_loss_type="gspo" \ + trainer.algorithm.eps_clip_low=0.0003 \ + trainer.algorithm.eps_clip_high=0.0004 \ + trainer.algorithm.loss_reduction="sequence_mean" \ + generator.sampling_params.max_generate_length=${MAX_LENGTH} \ + generator.sampling_params.temperature=1.0 \ + generator.max_input_length=40960 \ + generator.max_num_batched_tokens=131072 \ + generator.max_turns=10 \ + trainer.policy.optimizer_config.lr=1.0e-6 \ + trainer.algorithm.use_kl_loss=False \ + trainer.algorithm.use_kl_in_reward=False \ + generator.backend=vllm \ + generator.run_engines_locally=True \ + generator.enable_http_endpoint=True \ + generator.http_endpoint_host='0.0.0.0' \ + generator.http_endpoint_port=8080 \ + generator.weight_sync_backend=nccl \ + generator.async_engine=true \ + generator.batched=false \ + generator.n_samples_per_prompt=${N_ROLLOUTS} \ + generator.gpu_memory_utilization=0.8 \ + generator.enforce_eager=false \ + trainer.step_wise_training=${STEP_WISE} \ + trainer.logger="wandb" \ + 
trainer.project_name="code_search" \ + trainer.run_name=${RUN_NAME} \ + trainer.resume_mode=latest \ + trainer.ckpt_path="$LCAL_PATH" \ + trainer.max_ckpts_to_keep=5 \ + $OTHER_OPTION \ No newline at end of file diff --git a/scripts/run_prime_rl.sh b/scripts/run_prime_rl.sh deleted file mode 100755 index 8c394e7..0000000 --- a/scripts/run_prime_rl.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# Install ripgrep -sudo apt-get install ripgrep -y - -# Set PyTorch CUDA allocator config to reduce fragmentation -export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True - -# Navigate to prime-rl directory -cd $HOME/agentic-code-search-oss/prime-rl - -# Install the verifiers environment -uv pip install -e .. - -# Run RL training -uv run rl \ - --trainer @ ../configs/swe-grep-oss/rl/train.toml \ - --orchestrator @ ../configs/swe-grep-oss/rl/orch.toml \ - --inference @ ../configs/swe-grep-oss/rl/infer.toml - diff --git a/scripts/run_training.sh b/scripts/run_training.sh deleted file mode 100644 index cbc2561..0000000 --- a/scripts/run_training.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=cso -#SBATCH --output=../logs/%j.out -#SBATCH --error=../logs/%j.out -#SBATCH --partition=general -#SBATCH --gres=gpu:A100:2 -#SBATCH --nodes=1 -#SBATCH --time=2-00:00:00 -#SBATCH --mem=512G -#SBATCH --cpus-per-task=32 -#SBATCH --ntasks-per-node=1 -#SBATCH --exclude=babel-q5-28,babel-o5-20 - -. .env - -while getopts ":m:n:d:s:" opt; do - case ${opt} in - m ) MODEL=$OPTARG;; - n ) N_ROLLOUTS=$OPTARG;; - d ) DATA_PATH=$OPTARG;; - s ) CKPT_PATH=$OPTARG;; - # \? 
) echo "Usage: cmd [-u] [-p]";; - esac -done - -MODEL_ALIAS=$(echo $MODEL | sed 's/\//-/g') -# Get number of GPUs available -NUM_GPUS=$(nvidia-smi -L | wc -l) -N_ROLLOUTS="${N_ROLLOUTS:-4}" -MAX_LENGTH=2048 -RUN_NAME="code_search_${MODEL_ALIAS}" -set -x - -DATA_PATH="${DATA_PATH:-data/swe_smith}" -CKPT_PATH="${CKPT_PATH:-ckpts/${MODEL_ALIAS}}" -mkdir -p $CKPT_PATH - -NNODES=1 -NUM_INFERENCE_ENGINES=2 -TP_SIZE=1 -LOGGER=wandb - -# We use a small batch size here for demonstration -# NOTE (sumanthrh): The `generator.max_turns` here is actually unused, and we use the `step_limit` from the `swebench.yaml` file. -CUDA_LAUNCH_BLOCKING=1 uv run --isolated -m src.train \ - data.train_data="['$DATA_PATH/train.parquet']" \ - data.val_data="['$DATA_PATH/validation.parquet']" \ - trainer.algorithm.advantage_estimator="grpo" \ - trainer.policy.model.path=${MODEL} \ - trainer.placement.colocate_all=true \ - trainer.strategy=fsdp2 \ - trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ - trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \ - trainer.placement.policy_num_nodes=$NNODES \ - trainer.placement.ref_num_nodes=$NNODES \ - trainer.policy.sequence_parallel_size=$NUM_GPUS \ - generator.num_inference_engines=$NUM_INFERENCE_ENGINES \ - generator.inference_engine_tensor_parallel_size=$TP_SIZE \ - +generator.traj_dir=$CKPT_PATH/trajectories/ \ - +generator.engine_init_kwargs="{enable_auto_tool_choice:true,tool_call_parser:hermes}" \ - trainer.epochs=20 \ - trainer.eval_batch_size=100 \ - trainer.eval_before_train=false \ - trainer.eval_interval=100 \ - trainer.update_epochs_per_batch=1 \ - trainer.train_batch_size=4 \ - trainer.policy_mini_batch_size=4 \ - trainer.micro_forward_batch_size_per_gpu=2 \ - trainer.micro_train_batch_size_per_gpu=2 \ - trainer.dump_data_batch=true \ - trainer.ckpt_interval=10 \ - trainer.max_prompt_length=4096 \ - generator.sampling_params.max_generate_length=${MAX_LENGTH} \ - generator.max_input_length=24000 \ - 
generator.max_num_batched_tokens=48000 \ - generator.max_turns=20 \ - trainer.policy.optimizer_config.lr=1.0e-6 \ - trainer.algorithm.use_kl_loss=False \ - generator.backend=vllm \ - generator.run_engines_locally=True \ - generator.enable_http_endpoint=True \ - generator.http_endpoint_host='0.0.0.0' \ - generator.http_endpoint_port=8080 \ - generator.weight_sync_backend=nccl \ - generator.async_engine=true \ - generator.batched=true \ - generator.n_samples_per_prompt=${N_ROLLOUTS} \ - generator.gpu_memory_utilization=0.6 \ - trainer.logger="$LOGGER" \ - trainer.project_name="code_search" \ - trainer.run_name=${RUN_NAME} \ - trainer.resume_mode=null \ - trainer.ckpt_path="$CKPT_PATH" diff --git a/scripts/train_async.sh b/scripts/train_async.sh deleted file mode 100644 index faf6192..0000000 --- a/scripts/train_async.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Loop over 10 -for i in $(seq 1 10) -do - echo "Run number: $i" - # Kill any process using port 8080 after 4 hours - ( sleep 14400 && fuser -k 8080/tcp ) & \ - bash scripts/run_async_training.sh "$@" -done diff --git a/software-agent-sdk b/software-agent-sdk deleted file mode 160000 index 85ecfd9..0000000 --- a/software-agent-sdk +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 85ecfd9333d2d2cc4404dd460fd38868d9b978e2 diff --git a/src/async_trainer.py b/src/async_trainer.py index 7815122..840c80e 100644 --- a/src/async_trainer.py +++ b/src/async_trainer.py @@ -59,7 +59,6 @@ def patched_concatenate_generator_outputs(generator_outputs: List[GeneratorOutpu logger.info(f"Attempting to concatenate values for additional keys {additional_keys}") for key in additional_keys: try: - # result[key] = sum([generator_output[key] for generator_output in generator_outputs], []) additional_result[key] = np.mean([generator_output[key] for generator_output in generator_outputs]).item() except Exception as e: logger.error(f"Error in aggregating key {key}: {e}", exc_info=True) @@ -72,10 +71,6 @@ def 
patched_concatenate_generator_outputs(generator_outputs: List[GeneratorOutpu # Import here to avoid circular dependency. from skyrl_train.utils.trainer_utils import validate_generator_output - # print("trajectory_ids", result["trajectory_ids"]) - # print("rewards", result["rewards"]) - # print("is_last_step", result["is_last_step"]) - num_prompts = len(result["prompt_token_ids"]) validate_generator_output(num_prompts, result) @@ -131,19 +126,6 @@ def convert_generation_group_mini_batch_to_training_input( self.cfg.trainer.step_wise_training = False generator_output = self.postprocess_generator_output(generator_output, uids) - # # Truncate prompt_token_ids to avoid OOM - # max_prompt_len = self.cfg.trainer.max_prompt_length - # if max_prompt_len == -1: - # pass - # else: - # truncated_prompt_token_ids = [] - # for prompt_ids in generator_output["prompt_token_ids"]: - # if len(prompt_ids) > max_prompt_len: - # truncated_prompt_token_ids.append(prompt_ids[-max_prompt_len:]) - # else: - # truncated_prompt_token_ids.append(prompt_ids) - # generator_output["prompt_token_ids"] = truncated_prompt_token_ids - # print example just for debugging vis = self.tokenizer.decode(generator_output["response_ids"][0]) logger.info(f"Example generated: {vis}") diff --git a/src/build_dataset.py b/src/build_dataset.py index b28ae50..365f44f 100644 --- a/src/build_dataset.py +++ b/src/build_dataset.py @@ -3,15 +3,12 @@ from datasets import load_dataset -# from src.utils.dataset import extract_functions_from_patch - - def main(): parser = argparse.ArgumentParser(description="Build dataset from patches") parser.add_argument("--dataset", default="adityasoni17/SWE-smith-py-code-search", help="Input dataset path") parser.add_argument("--split", default="train", help="Dataset split to use") parser.add_argument("--output", required=True, help="Output file path for processed dataset") - parser.add_argument("--use_patch", action="store_true", help="Whether to use patches to extract target 
functions") + parser.add_argument("--use_patch", action="store_true", help="Whether to apply patch after pulling the repo (only set to true for SWE-Smith whose patch actually introduces the bug)") args = parser.parse_args() # Load and process dataset @@ -43,7 +40,6 @@ def main(): # shuffle dataset dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True) - # train_size = int(0.975 * len(dataset)) train_dataset = dataset.iloc[:-100] validation_dataset = dataset.iloc[-100:] @@ -56,6 +52,7 @@ def main(): output_path = os.path.join(output_dir, "validation.parquet") validation_dataset.to_parquet(output_path) + print(len(train_dataset), len(validation_dataset)) if __name__ == "__main__": diff --git a/src/constants.py b/src/constants.py deleted file mode 100644 index c71f050..0000000 --- a/src/constants.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Constants used across the SWE-Grep OSS project. -""" - -# Maximum number of tool calls allowed per task -DEFAULT_MAX_TOOL_CALLS = 5 - -# Maximum number of tokens allowed per task -DEFAULT_MAX_TOKENS = 40960 - diff --git a/src/generator/code_search_generator.py b/src/generator/code_search_generator.py index 9431c9d..8e104be 100644 --- a/src/generator/code_search_generator.py +++ b/src/generator/code_search_generator.py @@ -64,7 +64,6 @@ from src.agent.agent import CustomAgent from src.rewards import get_reward_function -# from src.tools import TOOL_REGISTRY from src.metrics.efficiency_metrics import compute_all_efficiency_metrics from src.metrics.trajectory_metrics import compute_trajectory_metrics @@ -73,7 +72,6 @@ import signal logger = get_logger(__name__) -# logger.setLevel(logging.WARNING) logger.setLevel(logging.ERROR) file_path = os.path.dirname(__file__) @@ -142,20 +140,8 @@ def init_and_run( structured_locations = None messages = [] - # for tool_name in generator_cfg.tools: - # if tool_name in TOOL_REGISTRY: - # register_tool(tool_name, TOOL_REGISTRY[tool_name]) - # else: - # raise ValueError(f"Tool {tool_name} 
does not exist in the registry") - - # tools = [ - # Tool(name=tool_name) for tool_name in generator_cfg.tools - # ] - register_tool(LocalizationFinishTool.name, LocalizationFinishTool) tools = [ - # Tool(name=GlobTool.name), - # Tool(name=GrepTool.name), Tool(name=TerminalTool.name), Tool(name="localization_finish"), ] @@ -185,7 +171,6 @@ def init_and_run( } ), tools=tools, - # security_analyzer=None, system_prompt_filename=system_prompt_path ) @@ -268,17 +253,11 @@ def __init__( self.generator_cfg = generator_cfg self.tokenizer = tokenizer self.model_name = model_name - # self.litellm_model_name = "openai/" + self.model_name self.litellm_model_name = "openai/" + self.model_name - # if self.generator_cfg.chat_template.name_or_path is not None: - # raise NotImplementedError( - # "OpenhandsGenerator doesn't support custom chat template" - # ) - self.step_wise = step_wise self.max_train_length = generator_cfg.get( - "max_train_length", 32768 + "max_train_length", 100000 ) def sanity_check_last_step(self, token_messages): @@ -312,7 +291,6 @@ async def code_search_loop( trajectory_id: TrajectoryID, batch_metadata: BatchMetadata, ) -> Tuple[List[int], float, str, List[int], List[int], Optional[List[int]], Optional[Dict[str, Any]]]: - # sweagent_config = yaml.safe_load(get_config_path(self.generator_cfg.miniswe_config_path).read_text()) # NOTE (sumanthrh): Input `prompt` is not used here because mini-swe-agent uses a similar entry from the `instance` obj instance = env_extras error = None @@ -341,12 +319,6 @@ async def code_search_loop( "end_timestamp": None } - # print("=" * 100) - # print("Conversation finished. 
Got the following LLM messages:") - # for i, message in enumerate(messages): - # print(f"Message {i}: {str(message)[:100]}") - # print("Final message:", final_message) - # Run sanity check before computing the reward so that the logged metrics reflect the actual reward received in training token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] trajectory_exhausted_steps = structured_locations is None and len(token_messages) >= self.generator_cfg.max_turns @@ -415,7 +387,6 @@ async def code_search_loop( token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] rollout_list = [] - num_steps = len(token_messages) if len(token_messages) > 0: if self.step_wise: for idx, message in enumerate(token_messages): @@ -447,6 +418,8 @@ async def code_search_loop( max_response_len = max_train_len - len(current_prompt_ids) buffer_succeed = 5 # buffer tokens after assistant tag + if "Qwen3-4B-Instruct-2507" in self.model_name: + buffer_succeed = 1 #NOTE: 4B-Instruct doesn't have tokens so only the subsequent \n needs masking. 
 buffer_precede = 1 # buffer tokens before im_start tag + # make mask of 0 for everything inside <|im_start|> + # and assistant and 1 elsewhere @@ -455,6 +428,7 @@ mask = [] inside = False buffer = 0 + found_role_switch = False for token_id in current_response_ids: if token_id == start_token_id: inside = True @@ -462,7 +436,7 @@ mask.pop() mask.extend([0] * buffer_precede) mask.append(0) - elif token_id == end_token_id: + elif token_id == end_token_id and found_role_switch: inside = False mask.append(0) buffer = buffer_succeed @@ -474,12 +448,18 @@ buffer -= 1 else: mask.append(1) + + # mark role switch if <|im_start|> is found + if token_id == start_token_id: + found_role_switch = True + else: + found_role_switch = False # mask zero out everything beyond max_response_len # Don't truncate the response, just mask out the loss - if len(current_response_ids) > max_response_len: - for i in range(max_response_len, len(current_response_ids)): - mask[i] = 0 + # if len(current_response_ids) > max_response_len: + # for i in range(max_response_len, len(current_response_ids)): + # mask[i] = 0 # mask loss completely from trajectories that exhausted all steps without calling the custom finish tool if trajectory_exhausted_steps: diff --git a/src/metrics/efficiency_metrics.py b/src/metrics/efficiency_metrics.py index c0eda37..3074eb2 100644 --- a/src/metrics/efficiency_metrics.py +++ b/src/metrics/efficiency_metrics.py @@ -28,8 +28,11 @@ def compute_token_metrics(messages: List[Dict[str, Any]]) -> Dict[str, float]: "avg_prompt_tokens_per_step": 0.0, "avg_response_tokens_per_step": 0.0, } - - total_prompt_tokens = sum(len(msg.get("prompt_token_ids", [])) for msg in token_messages) + if len(token_messages) > 0: + total_prompt_tokens = len(token_messages[-1].get("prompt_token_ids", [])) + else: + total_prompt_tokens = 0 + # total_prompt_tokens = sum(len(msg.get("prompt_token_ids", [])) for msg in 
token_messages) total_response_tokens = sum(len(msg.get("response_token_ids", [])) for msg in token_messages) num_steps = len(token_messages) diff --git a/src/prompts/system_prompt.py b/src/prompts/system_prompt.py deleted file mode 100644 index 1aae5ad..0000000 --- a/src/prompts/system_prompt.py +++ /dev/null @@ -1,87 +0,0 @@ -SYSTEM_PROMPT = """ -You are a specialized code localization agent. Your sole objective is to identify and return the files in the codebase that are relevant to the user's query. -You are given access to the codebase in a linux file system. - -## PRIMARY DIRECTIVE -- Find relevant files, do NOT answer the user's query directly -- Return ONLY file paths in XML tags -- Prioritize precision: every file you return should be relevant -- You have up to 8 turns to explore and return your answer - -## TOOL USAGE REQUIREMENTS - -### bash tool (REQUIRED for search) -- You MUST use the bash tool to search and explore the codebase -- Execute bash commands like: rg, grep, find, ls, cat, head, tail, sed -- Use parallel tool calls: invoke bash tool up to 5 times concurrently in a single turn -- NEVER exceed 5 parallel tool calls per turn -- Common patterns: - * `rg "pattern" -t py` - search for code patterns - * `rg --files | grep "keyword"` - find files by name - * `cat path/to/file.py` - read file contents - * `find . 
-name "*.py" -type f` - locate files by extension - * `wc -l path/to/file.py` - count lines in a file - * `sed -n '1,100p' path/to/file.py` - read lines 1-100 of a file - * `head -n 100 path/to/file.py` - read first 100 lines - * `tail -n 100 path/to/file.py` - read last 100 lines - -### Reading Files (CRITICAL for context management) -- NEVER read entire large files with `cat` - this will blow up your context window -- ALWAYS check file size first: `wc -l path/to/file.py` -- For files > 100 lines, read in chunks: - * Use `sed -n '1,100p' file.py` to read lines 1-100 - * Use `sed -n '101,200p' file.py` to read lines 101-200 - * Continue with subsequent ranges as needed (201-300, 301-400, etc.) -- Strategic reading approach: - * Read the first 50-100 lines to see imports and initial structure - * Use `rg` to find specific patterns and their line numbers - * Read targeted line ranges around matches using `sed -n 'START,ENDp'` - * Only read additional chunks if the initial sections are relevant - -### Final Answer Format (REQUIRED) -- You MUST return your final answer in XML tags -- Format: path/to/file1.py\npath/to/file2.py\npath/to/file3.py -- List one file path per line inside the tags -- Use relative paths as they appear in the repository -- DO NOT include any other text inside the tags - -## SEARCH STRATEGY - -1. **Initial Exploration**: Cast a wide net - - Search for keywords, function names, class names - - Check file names and directory structure - - Use up to 3 parallel bash calls to explore multiple angles - - Check file sizes with `wc -l` before reading - - Read promising files in chunks (lines 1-100) to verify relevance - -2. **Deep Dive**: Follow the most promising leads - - Use up to 3 parallel bash calls to investigate further - - Read files in chunks to confirm they address the query - - Use `rg` with line numbers to locate specific code, then read those ranges - - Start eliminating false positives - -3. 
**Final Verification**: Confirm your file list - - Verify each candidate file is truly relevant - - Ensure you haven't missed related files - - Return your answer in tags - -## CRITICAL RULES -- NEVER exceed 5 parallel bash tool calls in a single turn -- NEVER respond without wrapping your file list in tags -- ALWAYS use bash tool to search (do not guess file locations) -- NEVER read entire large files - always read in chunks (100-line ranges) -- Check file size with `wc -l` before reading -- Read file contents in chunks to verify relevance before including them -- Return file paths as they appear in the repository. Do not begin the path with "./" -- Aim for high precision (all files relevant) and high recall (no relevant files missed) - -## EXAMPLE OUTPUT - -After exploring the codebase, return your answer like this: - - -src/main.py -src/utils/helper.py -tests/test_main.py - -""" diff --git a/src/rewards/cosine_rewards.py b/src/rewards/cosine_rewards.py deleted file mode 100644 index 17b2771..0000000 --- a/src/rewards/cosine_rewards.py +++ /dev/null @@ -1,111 +0,0 @@ -import math -from src.rewards import reward - -from src.rewards.file_localization.file_localization import ( - multilevel_localization_f1_reward, - file_localization_f1_reward - ) - -@reward("cosine_reward") -def cosine_reward( - final_message, - instance, - messages, - loc_threshold=1.5, - use_tool_reward=True, - use_turn_reward=True, - use_length_reward=False, - max_turns=8, - max_avg_tool_calls=10, - ideal_avg_tool_calls=5, - max_length=16384, - multilevel=True, - max_reward=5.0, - min_reward=-5.0, - **kwargs - ): - - try: - if multilevel: - loc_reward, reward_dict = multilevel_localization_f1_reward(final_message, instance, **kwargs) - else: - loc_reward, reward_dict = file_localization_f1_reward(final_message, instance, **kwargs) - - except Exception as e: - print(f"Error computing localization reward: {e}") - loc_reward = 0.0 - reward_dict = { - "multilevel_localization_f1_reward": 0.0, - 
"file_reward": 0.0, - "module_reward": 0.0, - "entity_reward": 0.0, - } - - def _cos_fn(t, T, mu_min, mu_max): - cos_inner = (math.pi * t) / T - cos_out = math.cos(cos_inner) + 1 - return mu_min + 0.5 * (mu_max - mu_min) * cos_out - - token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - # Don't count the last turn which is the - # final answer generation which can involve 1 tool or none - num_turns = max(1, len(token_messages) - 1) - num_tool_calls = len(tool_messages) - avg_tool_calls_per_turn = num_tool_calls / num_turns if num_turns > 0 else 0 - - reward = 0.0 - - # Number of turns - if use_turn_reward: - if num_turns > max_turns: - cosine_turn_reward = 0 - elif loc_reward >= loc_threshold: - cosine_turn_reward = _cos_fn(num_turns, max_turns, 0.0, max_reward) - else: - cosine_turn_reward = _cos_fn(num_turns, max_turns, 0.0, min_reward) - reward_dict["turn_cosine_reward"] = cosine_turn_reward - - reward += cosine_turn_reward - - # Length of response - if use_length_reward: - current_prompt_ids = token_messages[0]["prompt_token_ids"] - ending_prompt_ids = token_messages[-1]["prompt_token_ids"] - ending_response_ids = token_messages[-1]["response_token_ids"] - current_response_ids = ending_prompt_ids + ending_response_ids - current_response_ids = current_response_ids[len(current_prompt_ids):] - - current_length = len(current_prompt_ids) + len(current_response_ids) - - if current_length > max_length: - cosine_length_reward = 0 - elif loc_reward >= loc_threshold: - cosine_length_reward = _cos_fn(current_length, max_length, 0.0, max_reward) - else: - cosine_length_reward = _cos_fn(current_length, max_length, 0.0, min_reward) - reward_dict["length_cosine_reward"] = cosine_length_reward - - reward += cosine_length_reward - - # Number of tool calls - if use_tool_reward: - if avg_tool_calls_per_turn > max_avg_tool_calls: - cosine_tool_reward = 0 - elif loc_reward >= 
loc_threshold: - # Using 5 as the ideal average number of tool calls per turn - # Anything more or less than the max score - if avg_tool_calls_per_turn >= ideal_avg_tool_calls: - avg_tool_calls_per_turn -= ideal_avg_tool_calls - cosine_tool_reward = _cos_fn(avg_tool_calls_per_turn, ideal_avg_tool_calls, 1.0, max_reward) - else: - cosine_tool_reward = _cos_fn(avg_tool_calls_per_turn, ideal_avg_tool_calls, max_reward, 1.0) - else: - # If wrong, encourage to do more calls - cosine_tool_reward = _cos_fn(avg_tool_calls_per_turn, max_avg_tool_calls, 0.0, min_reward) - reward_dict["tool_cosine_reward"] = cosine_tool_reward - - reward += cosine_tool_reward - - return reward, reward_dict diff --git a/src/rewards/format_reward.py b/src/rewards/format_reward.py deleted file mode 100644 index 07db1c2..0000000 --- a/src/rewards/format_reward.py +++ /dev/null @@ -1,20 +0,0 @@ -import re -from src.rewards import reward - -@reward("format_reward") -def format_reward( - final_message: str, - START_STRING: str = "```", - END_STRING: str = "```", - penalize: bool = True, - **kwargs - ): - - final_message = final_message.strip() - if final_message.startswith(START_STRING) and END_STRING in final_message: - return 1.0, {"format_reward": 1.0} - else: - if penalize: - return -5.0, {"format_reward": -5.0} - else: - return 0.0, {"format_reward": 0.0} diff --git a/src/rewards/result_tool_check.py b/src/rewards/result_tool_check.py deleted file mode 100644 index e050414..0000000 --- a/src/rewards/result_tool_check.py +++ /dev/null @@ -1,14 +0,0 @@ -import verifiers as vf - -from src.utils.get_result_tool_call import get_result_tool_call - - -def result_tool_check( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Check if the result tool call is successful. 
- """ - - _, success = get_result_tool_call(completion) - return 1.0 if success else 0.0 diff --git a/src/rewards/result_tool_f1.py b/src/rewards/result_tool_f1.py deleted file mode 100644 index db88261..0000000 --- a/src/rewards/result_tool_f1.py +++ /dev/null @@ -1,32 +0,0 @@ -import verifiers as vf - -from src.utils.result_tool_metrics import ( - calculate_f1, - calculate_precision, - calculate_recall, - get_file_sets, -) - - -def result_tool_f1( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Calculate file-level F1 score. - - F1 = 2 * (precision * recall) / (precision + recall) - - Measures: Harmonic mean of precision and recall. - - Args: - answer: Should contain the patch string - """ - result_files, patch_files = get_file_sets(completion, answer) - - if result_files is None or patch_files is None: - return 0.0 - - precision = calculate_precision(result_files, patch_files) - recall = calculate_recall(result_files, patch_files) - - return calculate_f1(precision, recall) diff --git a/src/rewards/result_tool_precision.py b/src/rewards/result_tool_precision.py deleted file mode 100644 index 66f420d..0000000 --- a/src/rewards/result_tool_precision.py +++ /dev/null @@ -1,24 +0,0 @@ -import verifiers as vf - -from src.utils.result_tool_metrics import calculate_precision, get_file_sets - - -def result_tool_precision( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Calculate file-level precision. - - Precision = |result_files ∩ patch_files| / |result_files| - - Measures: Of the files the agent identified, what percentage are correct? 
- - Args: - answer: Should contain the patch string - """ - result_files, patch_files = get_file_sets(completion, answer) - - if result_files is None or patch_files is None: - return 0.0 - - return calculate_precision(result_files, patch_files) diff --git a/src/rewards/result_tool_recall.py b/src/rewards/result_tool_recall.py deleted file mode 100644 index 8723b3a..0000000 --- a/src/rewards/result_tool_recall.py +++ /dev/null @@ -1,25 +0,0 @@ -import verifiers as vf - -from src.utils.result_tool_metrics import calculate_recall, get_file_sets - - -def result_tool_recall( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Calculate file-level recall. - - Recall = |result_files ∩ patch_files| / |patch_files| - - Measures: Of all the files in the patch, what percentage did the - agent identify? - - Args: - answer: Should contain the patch string - """ - result_files, patch_files = get_file_sets(completion, answer) - - if result_files is None or patch_files is None: - return 0.0 - - return calculate_recall(result_files, patch_files) diff --git a/src/rewards/scaled_f1.py b/src/rewards/scaled_f1.py deleted file mode 100644 index 80cfb45..0000000 --- a/src/rewards/scaled_f1.py +++ /dev/null @@ -1,57 +0,0 @@ -from src.rewards import reward - -from src.rewards.file_localization.file_localization import ( - multilevel_localization_f1_reward, - file_localization_f1_reward - ) - -@reward("scaled_f1_reward") -def scaled_f1_reward( - final_message, - messages, - instance, - multilevel=False, - **kwargs - ): - - try: - if multilevel: - loc_reward, reward_dict = multilevel_localization_f1_reward(final_message, instance, **kwargs) - else: - loc_reward, reward_dict = file_localization_f1_reward(final_message, instance, **kwargs) - - except Exception as e: - print(f"Error computing localization reward: {e}") - loc_reward = 0.0 - reward_dict = { - "multilevel_localization_f1_reward": 0.0, - "file_reward": 0.0, - "module_reward": 0.0, - 
"entity_reward": 0.0, - } - - token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - num_turns = len(token_messages) - 1 - if num_turns == 0: - num_turns = 1 # to avoid division by zero - - num_tool_calls = len(tool_messages) - - avg_tool_calls_per_turn = num_tool_calls / num_turns if num_turns > 0 else 0 - if avg_tool_calls_per_turn > 5: - avg_tool_calls_per_turn = 5 # cap at ideal avg tool calls - - avg_tool_calls_per_turn = avg_tool_calls_per_turn / 5 # normalize by ideal avg tool calls - - reward_dict["tool_use_reward"] = avg_tool_calls_per_turn - - # Penalize if no tool calls were made - if avg_tool_calls_per_turn <= 0: - reward = -5 - return reward, reward_dict - - reward = loc_reward * avg_tool_calls_per_turn - - return reward, reward_dict \ No newline at end of file diff --git a/src/rewards/tool_use.py b/src/rewards/tool_use.py deleted file mode 100644 index 588c282..0000000 --- a/src/rewards/tool_use.py +++ /dev/null @@ -1,49 +0,0 @@ -from src.rewards import reward - -@reward("tool_use_reward") -def tool_use_reward(messages, max_tool_use=5, penalize=False, clamp=False, reduction="mean", **kwargs) -> float: - token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - num_turns = max(1, len(token_messages) - 1) - num_tool_calls = len(tool_messages) - - if num_tool_calls == 0: - if penalize: - return -5.0 - else: - return 0.0 - - if reduction == "mean": - average_tool_use = num_tool_calls / num_turns - reward = min(average_tool_use, max_tool_use) / max_tool_use - else: - reward = min(num_tool_calls, max_tool_use) / max_tool_use - - if clamp: - if reward > 0: - return 1.0 - else: - return 0.0 - else: - return reward - -@reward("turn_efficiency") -def turn_efficiency(messages, max_turns=5, **kwargs) -> float: - token_messages = [msg for msg in messages if msg["kind"] == 
"TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - num_turns = len(token_messages) - num_tool_calls = len(tool_messages) - - if num_turns <= 1: - return 0.0 - - if (num_tool_calls > 1): - # Decay the reward if more than max_turns are used - if num_turns <= max_turns: - return 1.0 - else: - return max(0.0, 1.0 - (num_turns - max_turns) * 0.1) - - return 0.0 \ No newline at end of file diff --git a/src/tools/__init__.py b/src/tools/__init__.py index d4580b1..e69de29 100644 --- a/src/tools/__init__.py +++ b/src/tools/__init__.py @@ -1,64 +0,0 @@ -import importlib -import pkgutil -from pathlib import Path - -TOOL_REGISTRY = {} - -DEFAULT_OPENHANDS_TOOLS = [ - "apply_patch", - "browser_use", - "delegate", - "file_editor", - "glob", - "grep", - "planning_file_editor", - "preset", - "task_tracker", - "terminal", - "tom_consult" -] - -def tool_exists(tool_name: str): - """Check if a tool exists in the registry.""" - return tool_name in DEFAULT_OPENHANDS_TOOLS or tool_name in TOOL_REGISTRY - -def tool(name: str): - """Decorator to register a new tool function.""" - def decorator(func): - if name in DEFAULT_OPENHANDS_TOOLS: - raise ValueError(f"Tool name '{name}' is an in-built openhands tool and cannot be overridden.") - - # Track the tool in local registry for run-time validation - TOOL_REGISTRY[name] = func - return func - return decorator - -def _auto_load_tools(): - """Automatically discover and import all tool modules to register functions.""" - current_dir = Path(__file__).parent - - # Recursively import all Python modules - def _import_submodules(path, package_name): - # Import all Python modules in this directory - for importer, modname, ispkg in pkgutil.iter_modules([str(path)]): - if modname != '__init__': - try: - importlib.import_module(f'.{modname}', package=package_name) - except ImportError: - pass - - # Recursively process subdirectories - for item in path.iterdir(): - if item.is_dir() and not 
item.name.startswith('_'): - try: - # Import the package (runs __init__.py if it exists) - importlib.import_module(f'.{item.name}', package=package_name) - except ImportError: - pass - # Recursively import modules from subdirectories - _import_submodules(item, f'{package_name}.{item.name}') - - _import_submodules(current_dir, __name__) - -# Auto-load all tool functions on import -_auto_load_tools() diff --git a/src/tools/localization_finish.py b/src/tools/localization_finish.py index b914a0a..9514143 100644 --- a/src/tools/localization_finish.py +++ b/src/tools/localization_finish.py @@ -20,7 +20,6 @@ ) from openhands.sdk.tool import ToolExecutor, ToolAnnotations from openhands.sdk.conversation.state import ConversationExecutionStatus -from src.tools import tool if TYPE_CHECKING: from openhands.sdk.conversation.base import BaseConversation @@ -43,8 +42,6 @@ class LocalizationFinishAction(Action): """ ) - # message: str = Field(description="Code localization submission sent to the user.") - @property def visualize(self) -> Text: """Return Rich Text representation of this action.""" @@ -163,13 +160,4 @@ def create( ), ) ] - -@tool(name="localization_finish") -def _make_localization_finish_tool() -> list[ToolDefinition]: - """Create localization finish tool. - - This is a localization-specific finish tool that accepts structured locations - and validates the output format. 
- """ - return LocalizationFinishTool.create() \ No newline at end of file diff --git a/src/train.py b/src/train.py index d60dc4d..3cd9210 100644 --- a/src/train.py +++ b/src/train.py @@ -6,7 +6,6 @@ import asyncio -# from src.tools import tool_exists from src.generator.code_search_generator import CodeSearchGenerator from src.async_trainer import CustomFullyAsyncRayPPOTrainer as FullyAsyncRayPPOTrainer # from skyrl_train.fully_async_trainer import FullyAsyncRayPPOTrainer @@ -70,10 +69,6 @@ def main(cfg: DictConfig) -> None: # validate the arguments validate_cfg(cfg) - # cfg.trainer.policy.deepspeed_config.zero_optimization.offload_param.device = "cpu" - # cfg.trainer.policy.deepspeed_config.zero_optimization.offload_optimizer.device = "cpu" - # cfg.trainer.policy.deepspeed_config.zero_optimization.zero_hpz_partition_size = 8 - print("cfg.trainer.policy.deepspeed_config") print(cfg.trainer.policy.deepspeed_config) @@ -97,11 +92,6 @@ def main(cfg: DictConfig) -> None: cfg.generator.tools = [ "terminal", ] - - # # Check if the tool exists in the registry - # for tool in cfg.generator.tools: - # if not tool_exists(tool): - # raise ValueError(f"Tool {tool} does not exist in the registry") # Set default prompts if not specified if not hasattr(cfg.generator, "prompts"): diff --git a/src/utils/dataset.py b/src/utils/dataset.py deleted file mode 100644 index 47a21ab..0000000 --- a/src/utils/dataset.py +++ /dev/null @@ -1,67 +0,0 @@ -import re - - -def extract_functions_from_patch(input_diff: str): - """ - Parse a unified diff and extract, per file, the starting line of each hunk and the old line count. - - Returns: List[Tuple[str, List[int, int]]] - Example: [("path/to/file.py", [start_line, old_count]), ...] - """ - - results: dict[str, list[list[int]]] = {} - current_file: str | None = None - in_hunk = False - hunk_old_start = None - hunk_old_count = None - - # Regex for hunk header: @@ -old_start,old_count +new_start,new_count @@ ... 
- hunk_re = re.compile(r"^@@ -(?P\d+)(?:,(?P\d+))? \+(?P\d+)(?:,(?P\d+))? @@") - - def flush_hunk(): - nonlocal hunk_old_start, hunk_old_count, in_hunk - if current_file is None or hunk_old_start is None: - return - count = hunk_old_count if hunk_old_count is not None else 1 - results.setdefault(current_file, []).append([hunk_old_start, count]) - # Reset hunk state - in_hunk = False - hunk_old_start = None - hunk_old_count = None - - for raw_line in input_diff.strip().splitlines(): - line = raw_line.rstrip("\n") - - # Track current file being processed via the new file path header - if line.startswith("+++ b/"): - # Starting a new file ends any current hunk - if in_hunk: - flush_hunk() - current_file = line[6:] - continue - - # A new hunk header starts - m = hunk_re.match(line) - if m and current_file: - # Flush any previous hunk before starting a new one - if in_hunk: - flush_hunk() - in_hunk = True - hunk_old_start = int(m.group("old_start")) - old_count_str = m.group("old_count") - hunk_old_count = int(old_count_str) if old_count_str is not None else 1 - continue - - # Flush any unterminated hunk at EOF - if in_hunk: - flush_hunk() - - targets = [] - for file, hunks in results.items(): - for hunk in hunks: - targets.append( - (file, hunk) - ) - return targets - # return results - diff --git a/src/utils/get_instance.py b/src/utils/get_instance.py deleted file mode 100644 index bb6d52a..0000000 --- a/src/utils/get_instance.py +++ /dev/null @@ -1,160 +0,0 @@ -import argparse -import os -from pathlib import Path - -from datasets import load_dataset - - -def _default_repos_dir() -> Path: - """Resolve the default path for the cloned SWE-bench repositories. 
- - Priority order: - 1) Environment variable SWEBENCH_REPOS_DIR if set - 2) Project root joined with "swebench_repos" (project root - inferred from this file) - 3) Current working directory joined with "swebench_repos" as a final - fallback - """ - env_override = os.getenv("SWEBENCH_REPOS_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - - # This file lives at /src/utils/get_instance.py - project_root = Path(__file__).resolve().parents[2] - candidate = project_root / "swebench_repos" - if candidate.exists() or candidate.parent.exists(): - return candidate - - # Fallback to CWD - return Path.cwd() / "swebench_repos" - - -def get_instance_path(instance: dict, output_dir: Path | None = None) -> Path: - """ - Get the filesystem path for a SWE-bench instance. - - Args: - instance: Dictionary with 'repo' and 'instance_id' keys - output_dir: Base directory where instances are cloned - - Returns: - Path to the instance directory - """ - if output_dir is None: - output_dir = _default_repos_dir() - - repo_name = instance["repo"] - instance_id = instance["instance_id"] - dir_name = f"{repo_name.replace('/', '_')}_{instance_id}" - return output_dir / dir_name - - -def main(): - parser = argparse.ArgumentParser( - description="Get filesystem path for SWE-bench instance" - ) - parser.add_argument( - "--instance-id", - type=str, - help="Instance ID to look up (e.g., astropy__astropy-12907)", - ) - parser.add_argument( - "--index", - type=int, - help="Dataset index to look up (e.g., 0 for first instance)", - ) - parser.add_argument( - "--output-dir", - type=str, - default=str(_default_repos_dir()), - help=( - "Base directory where instances are cloned. 
Defaults to " - "SWEBENCH_REPOS_DIR if set, else /swebench_repos" - ), - ) - parser.add_argument( - "--dataset", - type=str, - default="princeton-nlp/SWE-bench_Lite", - help="SWE-bench dataset to use", - ) - parser.add_argument( - "--check", - action="store_true", - help="Check if the path exists and show info", - ) - - args = parser.parse_args() - - if not args.instance_id and args.index is None: - parser.error("Either --instance-id or --index must be provided") - - # Load dataset - print(f"Loading dataset: {args.dataset}") - dataset = load_dataset(args.dataset, split="test") - print(f"✓ Loaded {len(dataset)} instances\n") - - # Find the instance - if args.instance_id: - # Find by instance_id - instance = None - for inst in dataset: - if inst["instance_id"] == args.instance_id: - instance = inst - break - if not instance: - print(f"✗ Instance ID '{args.instance_id}' not found in dataset") - return - else: - # Get by index - if args.index < 0 or args.index >= len(dataset): - print(f"✗ Index {args.index} out of range [0, {len(dataset)-1}]") - return - instance = dataset[args.index] - - # Get the path - output_dir = Path(args.output_dir) - instance_path = get_instance_path(instance, output_dir) - - # Display info - print("=" * 80) - print("Instance Information:") - print("=" * 80) - print(f"Instance ID: {instance['instance_id']}") - print(f"Repository: {instance['repo']}") - print(f"Base Commit: {instance['base_commit']}") - print("\nFilesystem Path:") - print(f" {instance_path.absolute()}") - - # Check if exists - if args.check: - print("\n" + "=" * 80) - print("Path Check:") - print("=" * 80) - if instance_path.exists(): - print("✓ Path exists") - print("\nDirectory contents (first 10 items):") - items = sorted(instance_path.iterdir())[:10] - for item in items: - item_type = "📁" if item.is_dir() else "📄" - print(f" {item_type} {item.name}") - all_items = list(instance_path.iterdir()) - if len(all_items) > 10: - extra_count = len(all_items) - 10 - print(f" ... 
and {extra_count} more") - - # Count Python files - py_files = list(instance_path.rglob("*.py")) - print(f"\nTotal Python files: {len(py_files)}") - else: - print("✗ Path does not exist") - print("\nTo clone this instance, run:") - cmd = ( - f" python scripts/clone_repos.py --max-instances 1 " - f'--dataset "{args.dataset}"' - ) - print(cmd) - - -if __name__ == "__main__": - main() diff --git a/src/utils/instance_old.py b/src/utils/instance_old.py deleted file mode 100644 index d76d41c..0000000 --- a/src/utils/instance_old.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -import subprocess -from pathlib import Path - -from datasets import load_dataset -from tqdm import tqdm - - -def clone_instance( - repo_name: str, commit_id: str, instance_id: str, output_dir: Path -) -> bool: - """ - Clone a repository at a specific commit into a separate directory. - - Args: - repo_name: Repository name in format 'owner/repo' - commit_id: Commit hash to checkout - instance_id: Instance ID for directory naming - output_dir: Base output directory - - Returns: - True if successful, False otherwise - """ - # Create instance directory name: repo_instance-id - # E.g., astropy_astropy-12907 - instance_dir_name = f"{repo_name.replace('/', '_')}_{instance_id}" - instance_path = output_dir / instance_dir_name - - # Skip if already exists - if instance_path.exists(): - print(f" ✓ Instance {instance_id} already exists") - return True, instance_path - - try: - # Clone the repository - subprocess.run( - [ - "git", - "clone", - f"https://github.com/{repo_name}.git", - str(instance_path), - ], - check=True, - capture_output=True, - text=True, - ) - - # Checkout the specific commit - subprocess.run( - ["git", "-C", str(instance_path), "checkout", commit_id], - check=True, - capture_output=True, - text=True, - ) - - print(f" ✓ Cloned {instance_id} at commit {commit_id[:8]}") - return True, instance_path - except subprocess.CalledProcessError as e: - print(f" ✗ Error cloning {instance_id}: 
{e.stderr}") - return False, None diff --git a/src/utils/parse_patch.py b/src/utils/parse_patch.py deleted file mode 100644 index 241b2a4..0000000 --- a/src/utils/parse_patch.py +++ /dev/null @@ -1,130 +0,0 @@ -import re - - -def parse_patch(patch: str) -> dict: - """ - Parse a git diff patch and extract file paths with their line ranges. - - Args: - patch: Git diff patch string - - Returns: - Dictionary mapping file paths to their modified line ranges: - { - "file_path": { - "old_start": int, - "old_count": int, - "new_start": int, - "new_count": int, - "hunks": [ - { - "old_start": int, - "old_count": int, - "new_start": int, - "new_count": int - }, - ... - ] - }, - ... - } - """ - result = {} - - # Split patch into individual file diffs - file_diffs = re.split(r"^diff --git ", patch, flags=re.MULTILINE) - - for file_diff in file_diffs: - if not file_diff.strip(): - continue - - # Extract file path from the diff header - # Format: a/path/to/file b/path/to/file - file_match = re.search( - r"a/(.*?) b/(.*?)$", file_diff, flags=re.MULTILINE - ) - if not file_match: - continue - - file_path = file_match.group(2) # Use the 'b/' path (new file) - - # Find all hunks in this file diff - # Hunk header format: @@ -old_start,old_count +new_start,new_count @@ - hunk_pattern = r"@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? 
@@" - hunks = [] - - for match in re.finditer(hunk_pattern, file_diff): - old_start = int(match.group(1)) - old_count = int(match.group(2)) if match.group(2) else 1 - new_start = int(match.group(3)) - new_count = int(match.group(4)) if match.group(4) else 1 - - hunks.append( - { - "old_start": old_start, - "old_count": old_count, - "new_start": new_start, - "new_count": new_count, - } - ) - - if hunks: - # Calculate overall ranges - old_start = min(h["old_start"] for h in hunks) - old_end = max(h["old_start"] + h["old_count"] - 1 for h in hunks) - new_start = min(h["new_start"] for h in hunks) - new_end = max(h["new_start"] + h["new_count"] - 1 for h in hunks) - - result[file_path] = { - "old_start": old_start, - "old_count": old_end - old_start + 1, - "new_start": new_start, - "new_count": new_end - new_start + 1, - "hunks": hunks, - } - - return result - - -def add_patch_info(example): - """ - Dataset transformation function to add parsed patch info. - - Args: - example: Dataset example with 'patch' field - - Returns: - Example with added 'patch_info' field - """ - example["patch_info"] = parse_patch(example["patch"]) - return example - - -if __name__ == "__main__": - from datasets import load_dataset - - # Load dataset - print("Loading dataset...") - dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test") - - # Get first instance - instance = dataset[0] - - print("\n" + "=" * 80) - print("Instance ID:", instance["instance_id"]) - print("=" * 80) - - # Show original patch - print("\nOriginal patch:") - print("-" * 80) - print(instance["patch"]) - - # Parse the patch - patch_info = parse_patch(instance["patch"]) - - print("\n" + "=" * 80) - print("Full patch_info dict:") - print("=" * 80) - import json - - print(json.dumps(patch_info, indent=2)) diff --git a/swe_grep_oss_env.py b/swe_grep_oss_env.py deleted file mode 100644 index 197d866..0000000 --- a/swe_grep_oss_env.py +++ /dev/null @@ -1,332 +0,0 @@ -import json -import logging -import traceback 
-from typing import Literal - -import verifiers as vf -from datasets import load_dataset -from openai import AsyncOpenAI - -import src.tools as tools -from src.constants import DEFAULT_MAX_TOKENS, DEFAULT_MAX_TOOL_CALLS -from src.prompts.system_prompt import SYSTEM_PROMPT -from src.utils.get_instance import get_instance_path -from src.utils.parse_patch import parse_patch - -logger = logging.getLogger("swe-grep-oss") - - -class SWEGrepEnv(vf.StatefulToolEnv): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - # Only add bash tool - no result tool needed with XML output - self.add_tool(tools.bash, args_to_skip=["cwd"]) - - async def is_completed( - self, messages: vf.types.Messages, state: vf.types.State, **kwargs - ) -> bool: - max_turns_reached = await self.max_turns_reached(state) - prompt_too_long = await self.prompt_too_long(state) - - # Check if the last message contains XML tags - has_files_tag = False - if messages and len(messages) > 0: - last_message = messages[-1] - if last_message.get("role") == "assistant": - content = last_message.get("content", "") - if isinstance(content, str) and "" in content and "" in content: - has_files_tag = True - - if has_files_tag or max_turns_reached or prompt_too_long: - return True - - return False - - async def env_response( - self, messages: vf.types.Messages, state: vf.types.State, **kwargs - ) -> tuple[vf.types.Messages, vf.types.State]: - assert isinstance(messages, list) - - tool_messages = [] - tool_calls = messages[-1].get("tool_calls", []) - for tool_call in tool_calls: - tool_name: str = tool_call.get("function", {}).get("name", "") - tool_call_id: str = tool_call.get("id", "") - - arguments_str = tool_call.get("function", {}).get("arguments", "") - - try: - tool_args = json.loads(arguments_str) - - # Handle double-encoded JSON (when json.loads returns a string instead of dict) - if isinstance(tool_args, str): - self.logger.warning( - f"Double-encoded JSON detected, attempting to parse again: 
{tool_args[:100]}" - ) - tool_args = json.loads(tool_args) - - # Final check: must be a dict - if not isinstance(tool_args, dict): - raise TypeError(f"Expected dict, got {type(tool_args).__name__}") - - except (json.JSONDecodeError, TypeError) as e: - self.logger.error(f"Failed to parse tool arguments: {e}") - self.logger.error(f"Raw arguments: {repr(arguments_str)}") - tool_messages.append( - { - "role": "tool", - "content": f"Error: Invalid tool arguments - {str(e)}", - "tool_call_id": tool_call_id, - } - ) - continue - - tool_args = self.update_tool_args(tool_name, tool_args, messages, state, **kwargs) - tool_message: vf.types.Message = await self.call_tool( - tool_name, tool_args, tool_call_id - ) - tool_messages.append(tool_message) - return tool_messages, state - - async def rollout( - self, - client: AsyncOpenAI, - model: str, - prompt: vf.types.Messages, - completion: vf.types.Messages | None = None, - answer: str = "", - state: vf.types.State = {}, - task: str = "default", - info: vf.types.Info | None = None, - example_id: int = 0, - sampling_args: vf.types.SamplingArgs | None = None, - **kwargs, - ) -> tuple[vf.types.Messages, vf.types.State]: - try: - return await super().rollout( - client, - model, - prompt, - completion, - answer, - state, - task, - info, - example_id, - sampling_args, - **kwargs, - ) - except Exception as e: - import traceback - - self.logger.error(f"Error in rollout: {e}") - self.logger.error(f"Traceback: {traceback.format_exc()}") - raise # Re-raise to see the actual error - - def update_tool_args( - self, - tool_name: str, - tool_args: dict, - messages: vf.types.Messages, - state: vf.types.State, - **kwargs, - ) -> dict: - try: - if tool_name == "bash": - repo_path = get_instance_path( - { - "repo": state["info"]["repo"], - "instance_id": state["info"]["instance_id"], - } - ) - updated_tool_args = dict(tool_args) - updated_tool_args["cwd"] = repo_path - return updated_tool_args - except Exception: - # Add detailed logging - 
self.logger.error(f"update_tool_args called with tool_name={tool_name}") - self.logger.error(f"tool_args type: {type(tool_args)}") - self.logger.error(f"tool_args value: {repr(tool_args)}") - self.logger.error(f"messages: {messages}") - self.logger.error(f"Traceback: {traceback.format_exc()}") - raise # Re-raise to see the actual error - - return tool_args - - -def load_environment( - max_tokens: int = DEFAULT_MAX_TOKENS, - max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS, - mode: Literal["train", "test", "full", "rl"] = "rl", - **kwargs, -): - """ - Load and configure the SWE-Grep environment. - - Args: - max_tokens: Maximum tokens for model responses - max_tool_calls: Maximum number of tool calls allowed - mode: Dataset mode - "train" (80%), "test" (20%), "full" (100%), or "rl" (train+eval split) - **kwargs: Additional arguments passed to SWEGrepEnv - - Returns: - SWEGrepEnv instance configured with the specified dataset - """ - - # Load and prepare dataset - full_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test") - full_dataset = full_dataset.shuffle(seed=42) - - # Transform dataset with metadata and prompts - def transform_row(row): - return { - "info": { - "repo": row["repo"], - "instance_id": row["instance_id"], - "max_tokens": max_tokens, - "max_tool_calls": max_tool_calls, - }, - "prompt": [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": row["problem_statement"]}, - ], - "answer": json.dumps(parse_patch(row["patch"])), - } - - full_dataset = full_dataset.map(transform_row) - - # Split dataset for train/eval modes - split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42) - train_dataset = split_dataset["train"] - eval_dataset = split_dataset["test"] - - # XML parser to extract files from tags - parser = vf.XMLParser(["files"], answer_field="files") - - # Reward function: F1 score between predicted and actual files - def file_localization_reward(completion: str, answer: str, **kwargs) -> float: - """ 
- Calculate F1 score between predicted files and actual files from patch. - """ - - # Helper function to normalize file paths - def normalize_path(path: str) -> str: - """Remove leading './' from file paths for consistent comparison.""" - path = path.strip() - if path.startswith("./"): - path = path[2:] - return path - - # Parse the model's response - predicted_files_str = parser.parse_answer(completion) - if predicted_files_str is None: - return 0.0 - - try: - # Try to parse as JSON array - if predicted_files_str.strip().startswith("["): - predicted_files = json.loads(predicted_files_str) - else: - # Split by newlines and filter empty - predicted_files = [f.strip() for f in predicted_files_str.split("\n") if f.strip()] - except: - predicted_files = [] - - # Parse the ground truth answer - try: - actual_files = json.loads(answer) if isinstance(answer, str) else answer - except: - actual_files = [] - - # Normalize paths and convert to sets for comparison - predicted_set = set(normalize_path(f) for f in predicted_files) - actual_set = set(normalize_path(f) for f in actual_files) - - # Calculate F1 score - if len(predicted_set) == 0 and len(actual_set) == 0: - return 1.0 - if len(predicted_set) == 0 or len(actual_set) == 0: - return 0.0 - - true_positives = len(predicted_set & actual_set) - precision = true_positives / len(predicted_set) if len(predicted_set) > 0 else 0.0 - recall = true_positives / len(actual_set) if len(actual_set) > 0 else 0.0 - - if precision + recall == 0: - return 0.0 - - f1 = 2 * (precision * recall) / (precision + recall) - return f1 - - # Reward that countr total no of turns with atleast 1 tool call - def turns_with_tool_calls(completion: vf.types.Messages) -> float: - """ - Count the total number of turns with atleast 1 tool call. 
- """ - if not isinstance(completion, list): - return 0.0 - - count = 0 - for message in completion: - if isinstance(message, dict) and "tool_calls" in message: - tool_calls = message.get("tool_calls", []) - if len(tool_calls) > 0: - count += 1 - - if count == 0: - return 0 - - return 1 - - # Reward to increase number of tool calls per turn - def tool_call_count_per_turn(completion: vf.types.Messages) -> float: - """ - Count the number of tool calls per turn. - """ - if not isinstance(completion, list): - return 0.0 - - counts = [] - for message in completion: - if isinstance(message, dict) and "tool_calls" in message: - tool_calls = message.get("tool_calls", []) - if tool_calls: - counts.append(len(tool_calls)) - - if len(counts) == 0: - return 0.0 - - avg_count = sum(counts) / len(counts) - - # clip it at 5 tool calls per turn - avg_count = min(5.0, avg_count) / 5.0 - - return avg_count - - # Define rubric with single F1 reward - rubric = vf.Rubric( - funcs=[file_localization_reward, turns_with_tool_calls, tool_call_count_per_turn], - weights=[1.0, 1.0, 1.0], - ) - - # Common environment configuration - env_config = { - "parser": parser, - "rubric": rubric, - "max_turns": 8, - **kwargs, - } - - # Select dataset(s) based on mode - if mode == "full": - env_config["dataset"] = full_dataset - elif mode == "train": - env_config["dataset"] = train_dataset - elif mode == "test": - env_config["dataset"] = eval_dataset - else: # mode == "rl" (default) - env_config["dataset"] = train_dataset - env_config["eval_dataset"] = eval_dataset - - return SWEGrepEnv(**env_config) diff --git a/tests/test_data_extract_functions_from_patch.py b/tests/test_data_extract_functions_from_patch.py deleted file mode 100644 index e48ebe6..0000000 --- a/tests/test_data_extract_functions_from_patch.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest - -from src.utils.dataset import extract_functions_from_patch - - -def test_extract_functions_from_patch(): - diff = """diff --git 
a/moto/rds/exceptions.py b/moto/rds/exceptions.py ---- a/moto/rds/exceptions.py -+++ b/moto/rds/exceptions.py -@@ -82,6 +82,14 @@ def __init__(self, database_identifier: str): -) - - -+class DBClusterToBeDeletedHasActiveMembers(RDSClientError): -+ def __init__(self) -> None: -+ super().__init__( -+ "InvalidDBClusterStateFault", -+ "Cluster cannot be deleted, it still contains DB instances in non-deleting state.", -+ ) -+ -+ -class InvalidDBInstanceStateError(RDSClientError): -def __init__(self, database_identifier: str, istate: str): -estate = ( -diff --git a/moto/rds/models.py b/moto/rds/models.py ---- a/moto/rds/models.py -+++ b/moto/rds/models.py -@@ -19,6 +19,7 @@ -DBClusterNotFoundError, -DBClusterSnapshotAlreadyExistsError, -DBClusterSnapshotNotFoundError, -+ DBClusterToBeDeletedHasActiveMembers, -DBInstanceNotFoundError, -DBSnapshotNotFoundError, -DBSecurityGroupNotFoundError, -@@ -2339,7 +2340,8 @@ def delete_db_cluster( -raise InvalidParameterValue( -"Can't delete Cluster with protection enabled" -) -- -+ if cluster.cluster_members: -+ raise DBClusterToBeDeletedHasActiveMembers() -global_id = cluster.global_cluster_identifier or "" -if global_id in self.global_clusters: -self.remove_from_global_cluster(global_id, cluster_identifier)""" - - result = extract_functions_from_patch(diff) - - # Note: In unified diff headers, paths are prefixed with a/ and b/. - # After stripping the leading 'b/', the actual path here is 'b/b.py'. - assert result == [ - ("moto/rds/exceptions.py", [82, 6]), - ("moto/rds/models.py", [19, 6]), - ("moto/rds/models.py", [2339, 7]) - ] \ No newline at end of file