diff --git a/.gitignore b/.gitignore index 2b914ad..7d3938d 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,4 @@ pyrightconfig.json # End of https://www.toptal.com/developers/gitignore/api/python swebench_repos/ +setup.txt \ No newline at end of file diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index ad55c20..0000000 --- a/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "software-agent-sdk"] - path = software-agent-sdk - url = https://github.com/OpenHands/software-agent-sdk.git -[submodule "prime-rl"] - path = prime-rl - url = https://github.com/PrimeIntellect-ai/prime-rl diff --git a/README_verifiers.md b/README_verifiers.md deleted file mode 100644 index b7c9771..0000000 --- a/README_verifiers.md +++ /dev/null @@ -1,30 +0,0 @@ -# Instructions for using the verifiers environment - -1. Install dependencies - -```bash -curl -LsSf https://astral.sh/uv/install.sh | sh -uv sync -``` - -2. Clone some repos from the SWE-bench dataset - -```bash -uv run scripts/clone_repos.py --output-dir ./swebench_repos --dataset princeton-nlp/SWE-bench_Lite --max-workers 10 -``` - -3. Run `vllm` and serve `Qwen3-8B` -```bash -vllm serve Qwen/Qwen3-8B --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1 -``` - -4. Install [ripgrep](https://github.com/BurntSushi/ripgrep?tab=readme-ov-file#installation) -```bash -sudo apt-get install ripgrep -y -``` - -5. 
Run the verifiers eval with your model of choice - -```bash -uv run vf-eval swe-grep-oss-env --api-base-url http://localhost:8000/v1 --model "Qwen/Qwen3-8B" --num-examples 1 --rollouts-per-example 1 -``` diff --git a/configs/reward_config_1.7b.yaml b/configs/reward_config_1.7b.yaml new file mode 100644 index 0000000..feced9a --- /dev/null +++ b/configs/reward_config_1.7b.yaml @@ -0,0 +1,3 @@ +reward: + - fn: multilevel_localization_f1_reward + weight: 1.0 \ No newline at end of file diff --git a/configs/reward_config_14b.yaml b/configs/reward_config_14b.yaml new file mode 100644 index 0000000..288be5c --- /dev/null +++ b/configs/reward_config_14b.yaml @@ -0,0 +1,8 @@ +reward: + - fn: multilevel_localization_f1_reward + weight: 1.0 + - fn: multiturn_reward + args: + maximal_turns: 4 + minimal_turns: 4 + weight: 1.0 \ No newline at end of file diff --git a/configs/reward_config_4b.yaml b/configs/reward_config_4b.yaml new file mode 100644 index 0000000..feced9a --- /dev/null +++ b/configs/reward_config_4b.yaml @@ -0,0 +1,3 @@ +reward: + - fn: multilevel_localization_f1_reward + weight: 1.0 \ No newline at end of file diff --git a/configs/rewards/cosine.yaml b/configs/rewards/cosine.yaml deleted file mode 100644 index a8fbae7..0000000 --- a/configs/rewards/cosine.yaml +++ /dev/null @@ -1,2 +0,0 @@ -reward: - - fn: cosine_reward \ No newline at end of file diff --git a/configs/rewards/cosine_file_only.yaml b/configs/rewards/cosine_file_only.yaml deleted file mode 100644 index 658062a..0000000 --- a/configs/rewards/cosine_file_only.yaml +++ /dev/null @@ -1,7 +0,0 @@ -reward: - - fn: cosine_reward - args: - loc_threshold: 0.5 - file_level_weight: 1.0 - module_level_weight: 0.0 - entity_level_weight: 0.0 \ No newline at end of file diff --git a/configs/rewards/file_loc.yaml b/configs/rewards/file_loc.yaml deleted file mode 100644 index dfc07a9..0000000 --- a/configs/rewards/file_loc.yaml +++ /dev/null @@ -1,3 +0,0 @@ -reward: - - fn: file_localization_f1_reward - - fn: 
tool_use_reward \ No newline at end of file diff --git a/configs/rewards/scaled_f1.yaml b/configs/rewards/scaled_f1.yaml deleted file mode 100644 index 1a8dd59..0000000 --- a/configs/rewards/scaled_f1.yaml +++ /dev/null @@ -1,2 +0,0 @@ -reward: - - fn: scaled_f1_reward diff --git a/configs/rewards/tool_use.yaml b/configs/rewards/tool_use.yaml deleted file mode 100644 index 0c97d9d..0000000 --- a/configs/rewards/tool_use.yaml +++ /dev/null @@ -1,4 +0,0 @@ -reward: - - fn: tool_use_reward - - fn: turn_efficiency - - fn: multilevel_localization_f1_reward \ No newline at end of file diff --git a/configs/rewards/tool_use_and_turn_cosine_file_only.yaml b/configs/rewards/tool_use_and_turn_cosine_file_only.yaml deleted file mode 100644 index e528ad8..0000000 --- a/configs/rewards/tool_use_and_turn_cosine_file_only.yaml +++ /dev/null @@ -1,8 +0,0 @@ -reward: - - fn: turn_cosine_reward - args: - loc_threshold: 0.5 - file_level_weight: 1.0 - module_level_weight: 0.0 - entity_level_weight: 0.0 - - fn: tool_use_reward \ No newline at end of file diff --git a/configs/rewards/weighted_f1.yaml b/configs/rewards/weighted_f1.yaml deleted file mode 100644 index eac2352..0000000 --- a/configs/rewards/weighted_f1.yaml +++ /dev/null @@ -1,6 +0,0 @@ -reward: - - fn: multilevel_localization_f1_reward - args: - file_level_weight: 4.0 - module_level_weight: 2.0 - entity_level_weight: 1.0 \ No newline at end of file diff --git a/configs/skyrl-experiments/README.md b/configs/skyrl-experiments/README.md deleted file mode 100644 index df63aec..0000000 --- a/configs/skyrl-experiments/README.md +++ /dev/null @@ -1,288 +0,0 @@ -# SkyRL Experiment Configuration Guide - -This directory contains experiment configuration files for training agents with SkyRL. Each YAML file defines an experiment setup with specific tools, rewards, and prompts. 
- -## Usage - -```bash -DATA_PATH= - -bash scripts/run_async_training.sh \ - -m Qwen/Qwen3-4B \ - -o "+generator.exp_config=configs/skyrl-experiments/read-only.yaml" \ - -d $DATA_PATH \ - 2>&1 | tee training.log -``` - -## Configuration File Structure - -Each experiment config file follows this structure: - -```yaml -name: "experiment_name" -description: "Brief description of the experiment" - -reward: - - fn: reward_function_1 - - fn: reward_function_2 - -tools: - - tool_name_1 - - tool_name_2 - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/user_prompt.j2" -``` - -### Fields - -#### `name` (optional) -- **Type**: String -- **Description**: A unique identifier for the experiment -- **Example**: `"read_only_tools"` - -#### `description` (optional) -- **Type**: String -- **Description**: A human-readable description of what the experiment tests -- **Example**: `"The agent only has access to read only tools"` - -#### `reward` (required) -- **Type**: List of reward function specifications -- **Description**: Defines the reward functions used to evaluate agent performance during training -- **Format**: Each item should have a `fn` key with the reward function name -- **Example**: - ```yaml - reward: - - fn: tool_use_reward - - fn: turn_efficiency - ``` - -#### `tools` (required) -- **Type**: List of tool names -- **Description**: Specifies which tools the agent has access to during the experiment -- **Options**: Can be default OpenHands tools, custom tools, or toolsets -- **Example**: - ```yaml - tools: - - terminal - - grep - - glob - ``` - -#### `prompts` (required) -- **Type**: Object with `system_prompt` and `user_prompt` keys -- **Description**: Specifies the Jinja2 template files for system and user prompts -- **Location**: Templates should be placed in `src/prompts/templates/` -- **Format**: Paths are relative to `src/prompts/` -- **Example**: - ```yaml - prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: 
"templates/file_localization.j2" - ``` - -## Default OpenHands Tools - -The following tools are built into OpenHands and can be used directly in your config: - -- `apply_patch` - Apply code patches to files -- `browser_use` - Interact with web browsers -- `delegate` - Delegate tasks to sub-agents -- `file_editor` - Edit files with various operations -- `glob` - Search for files by name patterns -- `grep` - Search file contents using regex -- `planning_file_editor` - File editor with planning capabilities -- `preset` - Use predefined tool presets -- `task_tracker` - Track and manage tasks -- `terminal` - Execute shell commands -- `tom_consult` - Consult theory of mind models - -## Registering Custom Tools - -To create and register a custom tool: - -### 1. Create a Tool File - -Create a new Python file in `src/tools/` (e.g., `src/tools/my_custom_tool.py`): - -```python -from src.tools import tool -from pydantic import Field -from collections.abc import Sequence -from openhands.sdk import ( - Action, - Observation, - TextContent, - ToolDefinition, -) -from openhands.sdk.tool import ToolExecutor - -# Define your Action class -class MyCustomAction(Action): - param1: str = Field(description="Description of parameter") - param2: int = Field(default=10, description="Optional parameter") - -# Define your Observation class -class MyCustomObservation(Observation): - result: str = "" - - @property - def to_llm_content(self) -> Sequence[TextContent]: - return [TextContent(text=self.result)] - -# Define your Executor -class MyCustomExecutor(ToolExecutor[MyCustomAction, MyCustomObservation]): - def __call__(self, action: MyCustomAction, conversation=None) -> MyCustomObservation: - # Implement your tool logic here - result = f"Processed {action.param1} with {action.param2}" - return MyCustomObservation(result=result) - -# Define your Tool -class MyCustomTool(ToolDefinition[MyCustomAction, MyCustomObservation]): - @classmethod - def create(cls, conv_state) -> 
Sequence[ToolDefinition]: - executor = MyCustomExecutor() - return [ - cls( - description="Description of what your tool does", - action_type=MyCustomAction, - observation_type=MyCustomObservation, - executor=executor, - ) - ] - -# Register the tool -@tool(name="my_custom_tool") -def _make_my_custom_tool(conv_state) -> list[ToolDefinition]: - return MyCustomTool.create(conv_state) -``` - -### 2. Use the Tool in Your Config - -Once registered, simply add the tool name to your experiment config: - -```yaml -tools: - - my_custom_tool - - terminal -``` - -### Creating Toolsets - -You can also create toolsets that bundle multiple tools together (see `bash_and_grep_toolset` in `src/tools/example_custom_tool.py`): - -```python -@tool(name="my_toolset") -def _make_my_toolset(conv_state) -> list[ToolDefinition]: - """Create multiple tools that share resources.""" - terminal_executor = TerminalExecutor(working_dir=conv_state.workspace.working_dir) - - tool1 = Tool1.create(conv_state, executor=terminal_executor)[0] - tool2 = Tool2.create(conv_state, executor=terminal_executor)[0] - - return [tool1, tool2] -``` - -## System and User Prompts - -Prompts are defined using Jinja2 templates and should be placed in `src/prompts/templates/`. - -### Available Template Files - -- `system_prompt.j2` - Default system prompt -- `file_localization.j2` - User prompt for file localization tasks -- `file_module.j2` - User prompt for file/module tasks -- `file_module_parallel_tools.j2` - User prompt with parallel tool usage -- `system_message_search.j2` - System prompt for search tasks -- `default.j2` - Default user prompt - -### Creating Custom Prompts - -1. Create a new Jinja2 template file in `src/prompts/templates/`: - -```jinja2 -{# templates/my_custom_prompt.j2 #} -You are an AI assistant specialized in {{ task_type }}. - -Your goal is to: {{ goal }} - -Available tools: -{% for tool in tools %} -- {{ tool }} -{% endfor %} - -Please proceed with the task. -``` - -2. 
Reference it in your experiment config: - -```yaml -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/my_custom_prompt.j2" -``` - -### Template Variables - -Templates have access to various context variables provided by the training system, including: -- `task_type` - The type of task being performed -- `goal` - The specific goal for the episode -- `tools` - List of available tools -- `workspace` - Workspace information -- And other context-specific variables - -## Example Configurations - -### Example 1: Read-Only Tools -```yaml -name: "read_only_tools" -description: "The agent only has access to read only tools" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" -``` - -### Example 2: Terminal Only -```yaml -name: "terminal_tool_only" -description: "The agent only has access to the terminal tool" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" -``` - -### Example 3: Custom Toolset -```yaml -name: "bash_and_grep" -description: "Agent uses bash and grep toolset with shared executor" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - bash_and_grep_toolset - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" -``` diff --git a/configs/skyrl-experiments/multilevel_f05_minimum.yaml b/configs/skyrl-experiments/multilevel_f05_minimum.yaml deleted file mode 100644 index 393b64b..0000000 --- a/configs/skyrl-experiments/multilevel_f05_minimum.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - weight: 1.0 - args: - beta: 0.5 - - fn: format_reward - weight: 1.0 - args: - 
penalize: false - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_module.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/no_think_multilevel_f1.yaml b/configs/skyrl-experiments/no_think_multilevel_f1.yaml deleted file mode 100644 index 21a6d05..0000000 --- a/configs/skyrl-experiments/no_think_multilevel_f1.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - - fn: format_reward - - fn: tool_use_reward - args: - penalize: true - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt_short.j2" - user_prompt: "templates/file_module_no_think.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/no_think_weighted_f1.yaml b/configs/skyrl-experiments/no_think_weighted_f1.yaml deleted file mode 100644 index 60a54a5..0000000 --- a/configs/skyrl-experiments/no_think_weighted_f1.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: "Weighted F1" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - args: - file_level_weight: 1.0 - module_level_weight: 0.5 - entity_level_weight: 0.25 - - fn: format_reward - - fn: tool_use_reward - args: - penalize: true - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt_short.j2" - user_prompt: "templates/file_module_no_think.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/read-only.yaml b/configs/skyrl-experiments/read-only.yaml deleted file mode 100644 index 4d5525a..0000000 --- a/configs/skyrl-experiments/read-only.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: "read_only_tools" -description: "The agent only has access to read only tools" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: 
"templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/terminal.yaml b/configs/skyrl-experiments/terminal.yaml deleted file mode 100644 index 6ad62c6..0000000 --- a/configs/skyrl-experiments/terminal.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: "terminal_tool_only" -description: "The agent only has access to the terminal tool" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/test.yaml b/configs/skyrl-experiments/test.yaml deleted file mode 100644 index ce042b5..0000000 --- a/configs/skyrl-experiments/test.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: "read_only_tools" -description: "The agent only has access to read only tools" - -reward: - - fn: tool_use_reward - - fn: turn_efficiency - -tools: - - bash_and_grep_toolset - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_localization.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/think_multilevel_f1.yaml b/configs/skyrl-experiments/think_multilevel_f1.yaml deleted file mode 100644 index b558d80..0000000 --- a/configs/skyrl-experiments/think_multilevel_f1.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - - fn: format_reward - - fn: tool_use_reward - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt_short.j2" - user_prompt: "templates/file_module_think.j2" \ No newline at end of file diff --git a/configs/skyrl-experiments/weighted_multilevel_f05.yaml b/configs/skyrl-experiments/weighted_multilevel_f05.yaml deleted file mode 100644 index a0d7752..0000000 --- a/configs/skyrl-experiments/weighted_multilevel_f05.yaml 
+++ /dev/null @@ -1,28 +0,0 @@ -name: "Multilevel F1 with No Thinking" -description: "Don't use thinking" - -reward: - - fn: multilevel_localization_f1_reward - weight: 1.0 - args: - beta: 0.5 - - fn: tool_use_reward - weight: 1.0 - args: - clamp: false - penalize: false - max_tool_use: 5 - - fn: multiturn_reward - weight: 4.0 - args: - minimal_turns: 4 - maximal_turns: 4 - -tools: - - glob - - grep - - terminal - -prompts: - system_prompt: "templates/system_prompt.j2" - user_prompt: "templates/file_module.j2" \ No newline at end of file diff --git a/configs/swe-grep-oss/rl/infer.toml b/configs/swe-grep-oss/rl/infer.toml deleted file mode 100644 index cebaf69..0000000 --- a/configs/swe-grep-oss/rl/infer.toml +++ /dev/null @@ -1,8 +0,0 @@ -gpu_memory_utilization = 0.7 - -[model] -name = "willcb/Qwen3-8B" -enforce_eager = true -enable_auto_tool_choice = true -tool_call_parser = "hermes" - diff --git a/configs/swe-grep-oss/rl/orch.toml b/configs/swe-grep-oss/rl/orch.toml deleted file mode 100644 index 45c2042..0000000 --- a/configs/swe-grep-oss/rl/orch.toml +++ /dev/null @@ -1,17 +0,0 @@ -batch_size = 64 -seq_len = 8000 -rollouts_per_example = 4 -max_steps = 150 -mask_truncated_completions = false - -[wandb] -project = "swe-grep-oss" - -[model] -name = "willcb/Qwen3-8B" - -[[env]] -id = "swe-grep-oss-env" - -[ckpt] -interval = 10 diff --git a/configs/swe-grep-oss/rl/train.toml b/configs/swe-grep-oss/rl/train.toml deleted file mode 100644 index ffc5977..0000000 --- a/configs/swe-grep-oss/rl/train.toml +++ /dev/null @@ -1,28 +0,0 @@ -max_steps = 150 - -[model] -name = "willcb/Qwen3-8B" - -[model.ac] -freq = 1 - -[model.experimental.lora] -rank = 64 -alpha = 512 -dropout = 0.0 -target_modules = [ - "q_proj", # Attention: Query projection - "k_proj", # Attention: Key projection - "v_proj", # Attention: Value projection - "o_proj", # Attention: Output projection - "gate_proj", # MLP: Gating projection - "up_proj", # MLP: Up projection - "down_proj" # MLP: Down projection 
-] -modules_to_save = [] - -[optim] -lr = 1e-5 - -[ckpt] -interval = 10 \ No newline at end of file diff --git a/data/adityasoni17__SWE-smith-py-code-search_train/train.parquet b/data/adityasoni17__SWE-smith-py-code-search_train/train.parquet new file mode 100644 index 0000000..49daa50 Binary files /dev/null and b/data/adityasoni17__SWE-smith-py-code-search_train/train.parquet differ diff --git a/data/adityasoni17__SWE-smith-py-code-search_train/validation.parquet b/data/adityasoni17__SWE-smith-py-code-search_train/validation.parquet new file mode 100644 index 0000000..c3d2394 Binary files /dev/null and b/data/adityasoni17__SWE-smith-py-code-search_train/validation.parquet differ diff --git a/data/swe_gym/train.parquet b/data/qwen3_1.7b_data/train.parquet similarity index 56% rename from data/swe_gym/train.parquet rename to data/qwen3_1.7b_data/train.parquet index 09576bd..d7e8ce7 100644 Binary files a/data/swe_gym/train.parquet and b/data/qwen3_1.7b_data/train.parquet differ diff --git a/data/qwen3_1.7b_data/validation.parquet b/data/qwen3_1.7b_data/validation.parquet new file mode 100644 index 0000000..f82fcbe Binary files /dev/null and b/data/qwen3_1.7b_data/validation.parquet differ diff --git a/data/swe_gym/validation.parquet b/data/swe_gym/validation.parquet deleted file mode 100644 index d085f63..0000000 Binary files a/data/swe_gym/validation.parquet and /dev/null differ diff --git a/prime-rl b/prime-rl deleted file mode 160000 index 6b01ba5..0000000 --- a/prime-rl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6b01ba5ae7f215aa1f869dfac30f5df5d587ee94 diff --git a/pyproject.toml b/pyproject.toml index cce1f28..c2f64b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,21 +78,13 @@ flash-attn = ["torch"] [tool.uv.sources] skyrl-train = { git = "https://github.com/adityasoni9998/SkyRL.git", rev = "81e5a97c7430503c0c4e6508497cc5aa01a0c624", subdirectory = "skyrl-train" } flash-attn = {url = 
"https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp313-cp313-linux_x86_64.whl"} -openhands-sdk = { workspace = true } -openhands-tools = { workspace = true } -openhands-workspace = { workspace = true } -openhands-agent-server = { workspace = true } +openhands-sdk = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-sdk" } +openhands-tools = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-tools" } +openhands-workspace = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-workspace" } +openhands-agent-server = { git = "https://github.com/OpenHands/software-agent-sdk.git", rev = "85ecfd9333d2d2cc4404dd460fd38868d9b978e2", subdirectory = "openhands-agent-server" } torch = { index = "pytorch-cu128" } torchvision = { index = "pytorch-cu128" } flashinfer-jit-cache = { index = "flashinfer-cu128" } # flashinfer-python = [ # { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl" } # ] - -[tool.uv.workspace] -members = [ - "software-agent-sdk/openhands-sdk", - "software-agent-sdk/openhands-tools", - "software-agent-sdk/openhands-workspace", - "software-agent-sdk/openhands-agent-server", -] diff --git a/scripts/clone_repos.py b/scripts/clone_repos.py deleted file mode 100644 index bc70173..0000000 --- a/scripts/clone_repos.py +++ /dev/null @@ -1,199 +0,0 @@ -import argparse -import subprocess -from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path - -from datasets import load_dataset -from tqdm import tqdm - - -def clone_instance( - repo_name: str, commit_id: str, instance_id: str, output_dir: Path -) -> bool: - """ - Clone a repository at a 
specific commit into a separate directory. - - Args: - repo_name: Repository name in format 'owner/repo' - commit_id: Commit hash to checkout - instance_id: Instance ID for directory naming - output_dir: Base output directory - - Returns: - True if successful, False otherwise - """ - # Create instance directory name: repo_instance-id - # E.g., astropy_astropy-12907 - instance_dir_name = f"{repo_name.replace('/', '_')}_{instance_id}" - instance_path = output_dir / instance_dir_name - - # Skip if already exists - if instance_path.exists(): - return True - - try: - # Clone the repository - subprocess.run( - [ - "git", - "clone", - f"https://github.com/{repo_name}.git", - str(instance_path), - ], - check=True, - capture_output=True, - text=True, - ) - - # Checkout the specific commit - subprocess.run( - ["git", "-C", str(instance_path), "checkout", commit_id], - check=True, - capture_output=True, - text=True, - ) - - return True - except subprocess.CalledProcessError as e: - return False - - -def main(): - parser = argparse.ArgumentParser( - description="Clone repositories from SWE-bench dataset" - ) - parser.add_argument( - "--output-dir", - type=str, - default="./swebench_repos", - help="Directory to clone repositories into (default: ./swebench_repos)", - ) - parser.add_argument( - "--dataset", - type=str, - default="princeton-nlp/SWE-bench_Lite", - help="SWE-bench dataset to use (default: princeton-nlp/SWE-bench_Lite)", - ) - parser.add_argument( - "--max-instances", - type=int, - default=None, - help="Maximum number of instances to process (for testing)", - ) - parser.add_argument( - "--max-repos", - type=int, - default=None, - help="Maximum number of repositories to clone (for testing)", - ) - parser.add_argument( - "--show-fields", - action="store_true", - help="Show available fields in the dataset and exit", - ) - parser.add_argument( - "--max-workers", - type=int, - default=4, - help="Maximum number of concurrent clone operations (default: 4)", - ) - - args = 
parser.parse_args() - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - print(f"Loading SWE-bench dataset: {args.dataset}") - dataset = load_dataset(args.dataset, split="test") - print(f"✓ Loaded {len(dataset)} instances") - - # Show available fields if requested - if args.show_fields: - print("\n" + "=" * 80) - print("Available fields in dataset:") - print("=" * 80) - if len(dataset) > 0: - first_instance = dataset[0] - for key in sorted(first_instance.keys()): - value = first_instance[key] - # Truncate long values - value_str = str(value) - if len(value_str) > 100: - value_str = value_str[:100] + "..." - print(f"{key:25s}: {value_str}") - print("=" * 80) - return - - # Collect all instances to process - instances_to_process = [] - for instance in dataset: - instances_to_process.append( - { - "repo": instance["repo"], - "instance_id": instance["instance_id"], - "base_commit": instance["base_commit"], - } - ) - - # Apply max-repos filter - if args.max_repos: - # Group by repo and take first N repos - repos_seen = set() - filtered_instances = [] - for instance in instances_to_process: - if instance["repo"] not in repos_seen: - if len(repos_seen) >= args.max_repos: - continue - repos_seen.add(instance["repo"]) - if instance["repo"] in repos_seen: - filtered_instances.append(instance) - instances_to_process = filtered_instances - print(f"\n(Limited to {args.max_repos} repositories)") - - # Apply max-instances filter - if args.max_instances: - instances_to_process = instances_to_process[: args.max_instances] - print(f"(Limited to {args.max_instances} instances)") - - print(f"\nProcessing {len(instances_to_process)} instances") - print(f"Using {args.max_workers} concurrent workers") - print("=" * 80) - - # Clone each instance concurrently - successful = 0 - with ThreadPoolExecutor(max_workers=args.max_workers) as executor: - # Submit all tasks - future_to_instance = { - executor.submit( - clone_instance, - instance["repo"], - 
instance["base_commit"], - instance["instance_id"], - output_dir, - ): instance - for instance in instances_to_process - } - - # Process completed tasks with progress bar - for future in tqdm( - as_completed(future_to_instance), - total=len(instances_to_process), - desc="Cloning instances", - ): - if future.result(): - successful += 1 - - print("\n" + "=" * 80) - print("Summary:") - print("=" * 80) - print(f"Output directory: {output_dir.absolute()}") - total = len(instances_to_process) - print(f"Successfully cloned: {successful}/{total} instances") - print( - "Note: Each instance is in its own directory named _" - ) - print("\nDone! 🎉") - - -if __name__ == "__main__": - main() diff --git a/scripts/run_async_training_1.7b.sh b/scripts/run_async_training_1.7b.sh new file mode 100644 index 0000000..ee30bac --- /dev/null +++ b/scripts/run_async_training_1.7b.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +while getopts ":m:n:d:s:l:o:i:t:b:c:r:w:" opt; do + case ${opt} in + m ) MODEL=$OPTARG;; + n ) N_ROLLOUTS=$OPTARG;; + d ) DATA_PATH=$OPTARG;; + s ) CKPT_PATH=$OPTARG;; + l ) LCAL_PATH=$OPTARG;; + o ) OTHER_OPTION=$OPTARG;; + i ) NUM_INFERENCE_ENGINES=$OPTARG;; + t ) NUM_TRAINING_ENGINES=$OPTARG;; + b ) BATCH_SIZE=$OPTARG;; + c ) MICRO_BATCH_SIZE=$OPTARG;; + r ) RUN_NAME=$OPTARG;; + w ) STEP_WISE=$OPTARG;; + # \? 
) echo "Usage: cmd [-u] [-p]";; + esac +done + +MODEL_ALIAS=$(echo $MODEL | sed 's/\//-/g') +# Get number of GPUs available +NUM_GPUS=$(nvidia-smi -L | wc -l) +N_ROLLOUTS="${N_ROLLOUTS:-8}" +BATCH_SIZE="${BATCH_SIZE:-8}" +MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:-1}" +MAX_LENGTH=8192 +RUN_NAME="${RUN_NAME:-${MODEL_ALIAS}-${BATCH_SIZE}x${N_ROLLOUTS}}" +set -x + +DATA_PATH="${DATA_PATH:-data/swe_smith}" +CKPT_PATH="${CKPT_PATH:-$(pwd)/ckpts/${MODEL_ALIAS}}" +# If LCAL_PATH is not set, use CKPT_PATH +LCAL_PATH="${LCAL_PATH:-$CKPT_PATH}" +mkdir -p $CKPT_PATH + +HALF_NUM_GPUS=$((NUM_GPUS / 2)) +NUM_INFERENCE_ENGINES="${NUM_INFERENCE_ENGINES:-$HALF_NUM_GPUS}" +NUM_TRAINING_ENGINES="${NUM_TRAINING_ENGINES:-$HALF_NUM_GPUS}" +STEP_WISE="${STEP_WISE:-false}" + +export VLLM_FLASH_ATTN_VERSION=2 +export CUDA_LAUNCH_BLOCKING=1 +export TORCH_USE_CUDA_DSA=1 +export RAY_worker_register_timeout_seconds=600 +uv run --isolated -m src.train \ + +run_async_trainer=true \ + data.train_data="['$DATA_PATH/train.parquet']" \ + data.val_data="['$DATA_PATH/validation.parquet']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.grpo_norm_by_std=false \ + trainer.policy.model.path=${MODEL} \ + trainer.placement.colocate_all=false \ + trainer.placement.colocate_policy_ref=true \ + trainer.strategy=fsdp2 \ + trainer.policy.fsdp_config.cpu_offload=true \ + trainer.policy.fsdp_config.reshard_after_forward=true \ + trainer.policy.fsdp_config.fsdp_size=-1 \ + trainer.fully_async.num_parallel_generation_workers=${BATCH_SIZE} \ + trainer.placement.policy_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.ref_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.policy_num_nodes=1 \ + trainer.placement.ref_num_nodes=1 \ + trainer.policy.sequence_parallel_size=1 \ + generator.num_inference_engines=${NUM_INFERENCE_ENGINES} \ + generator.inference_engine_tensor_parallel_size=1 \ + +generator.traj_dir=${CKPT_PATH}trajectories/ \ + 
+generator.engine_init_kwargs.enable_auto_tool_choice=true \ + +generator.engine_init_kwargs.tool_call_parser="hermes" \ + +generator.engine_init_kwargs.max_model_len=32768 \ + +generator.prompts.system_prompt="templates/system_prompt_custom_finish.j2" \ + +generator.prompts.user_prompt="templates/file_module_custom_finish.j2" \ + +generator.engine_init_kwargs.disable_cascade_attn=true \ + trainer.epochs=1 \ + trainer.eval_batch_size=100 \ + trainer.eval_before_train=false \ + trainer.eval_interval=-1 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=${BATCH_SIZE} \ + trainer.policy_mini_batch_size=${BATCH_SIZE} \ + trainer.micro_forward_batch_size_per_gpu=1 \ + trainer.micro_train_batch_size_per_gpu=${MICRO_BATCH_SIZE} \ + trainer.dump_data_batch=true \ + trainer.export_path="${CKPT_PATH}exported_model/" \ + trainer.hf_save_interval=50 \ + trainer.ckpt_interval=10 \ + trainer.use_sample_packing=false \ + trainer.max_prompt_length=32768 \ + trainer.algorithm.policy_loss_type="gspo" \ + trainer.algorithm.eps_clip_low=0.0003 \ + trainer.algorithm.eps_clip_high=0.0004 \ + trainer.algorithm.loss_reduction="sequence_mean" \ + generator.sampling_params.max_generate_length=${MAX_LENGTH} \ + generator.sampling_params.temperature=1.0 \ + generator.max_input_length=32768 \ + generator.max_num_batched_tokens=131072 \ + generator.max_turns=6 \ + trainer.policy.optimizer_config.lr=1.0e-6 \ + trainer.algorithm.use_kl_loss=False \ + trainer.algorithm.use_kl_in_reward=False \ + generator.backend=vllm \ + generator.run_engines_locally=True \ + generator.enable_http_endpoint=True \ + generator.http_endpoint_host='0.0.0.0' \ + generator.http_endpoint_port=8080 \ + generator.weight_sync_backend=nccl \ + generator.async_engine=true \ + generator.batched=false \ + generator.n_samples_per_prompt=${N_ROLLOUTS} \ + generator.gpu_memory_utilization=0.8 \ + generator.enforce_eager=false \ + trainer.step_wise_training=${STEP_WISE} \ + trainer.logger="wandb" \ + 
trainer.project_name="code_search" \ + trainer.run_name=${RUN_NAME} \ + trainer.resume_mode=latest \ + trainer.ckpt_path="$LCAL_PATH" \ + trainer.max_ckpts_to_keep=5 \ + $OTHER_OPTION \ No newline at end of file diff --git a/scripts/run_async_training.sh b/scripts/run_async_training_14B.sh similarity index 98% rename from scripts/run_async_training.sh rename to scripts/run_async_training_14B.sh index 3c5d625..2c11225 100644 --- a/scripts/run_async_training.sh +++ b/scripts/run_async_training_14B.sh @@ -64,6 +64,7 @@ STEP_WISE="${STEP_WISE:-false}" export VLLM_FLASH_ATTN_VERSION=2 export CUDA_LAUNCH_BLOCKING=1 export TORCH_USE_CUDA_DSA=1 +export RAY_worker_register_timeout_seconds=600 uv run --isolated -m src.train \ +run_async_trainer=true \ @@ -97,7 +98,7 @@ uv run --isolated -m src.train \ generator.eval_n_samples_per_prompt=1 \ trainer.epochs=10 \ trainer.eval_batch_size=32 \ - trainer.eval_before_train=true \ + trainer.eval_before_train=false \ trainer.eval_interval=-1 \ trainer.update_epochs_per_batch=1 \ trainer.train_batch_size=${BATCH_SIZE} \ diff --git a/scripts/run_async_training_4b.sh b/scripts/run_async_training_4b.sh new file mode 100644 index 0000000..452faeb --- /dev/null +++ b/scripts/run_async_training_4b.sh @@ -0,0 +1,129 @@ +#!/bin/bash + + +# export REWARD=file_loc +# sbatch scripts/run_async_training.sh \ +# -m Qwen/Qwen3-8B -n 8 -b 1 -i 4 -t 4 \ +# -d data/swe_gym \ +# -s /project/flame/lsutawik/cso/ckpts/qwen3-8b-8x8-${REWARD}/ \ +# -o "+generator.reward=configs/rewards/${REWARD}.yaml" + +# . .env + +while getopts ":m:n:d:s:l:o:i:t:b:c:r:w:" opt; do + case ${opt} in + m ) MODEL=$OPTARG;; + n ) N_ROLLOUTS=$OPTARG;; + d ) DATA_PATH=$OPTARG;; + s ) CKPT_PATH=$OPTARG;; + l ) LCAL_PATH=$OPTARG;; + o ) OTHER_OPTION=$OPTARG;; + i ) NUM_INFERENCE_ENGINES=$OPTARG;; + t ) NUM_TRAINING_ENGINES=$OPTARG;; + b ) BATCH_SIZE=$OPTARG;; + c ) MICRO_BATCH_SIZE=$OPTARG;; + r ) RUN_NAME=$OPTARG;; + w ) STEP_WISE=$OPTARG;; + # \? 
) echo "Usage: cmd [-u] [-p]";; + esac +done + +MODEL_ALIAS=$(echo $MODEL | sed 's/\//-/g') +# Get number of GPUs available +NUM_GPUS=$(nvidia-smi -L | wc -l) +N_ROLLOUTS="${N_ROLLOUTS:-8}" +BATCH_SIZE="${BATCH_SIZE:-8}" +MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:-1}" +MAX_LENGTH=8192 +RUN_NAME="${RUN_NAME:-${MODEL_ALIAS}-${BATCH_SIZE}x${N_ROLLOUTS}}" +set -x + +DATA_PATH="${DATA_PATH:-data/swe_smith}" +CKPT_PATH="${CKPT_PATH:-$(pwd)/ckpts/${MODEL_ALIAS}}" +# If LCAL_PATH is not set, use CKPT_PATH +LCAL_PATH="${LCAL_PATH:-$CKPT_PATH}" +mkdir -p $CKPT_PATH + +HALF_NUM_GPUS=$((NUM_GPUS / 2)) +NUM_INFERENCE_ENGINES="${NUM_INFERENCE_ENGINES:-$HALF_NUM_GPUS}" +NUM_TRAINING_ENGINES="${NUM_TRAINING_ENGINES:-$HALF_NUM_GPUS}" +STEP_WISE="${STEP_WISE:-false}" + +export VLLM_FLASH_ATTN_VERSION=2 +export CUDA_LAUNCH_BLOCKING=1 +export TORCH_USE_CUDA_DSA=1 +export RAY_worker_register_timeout_seconds=600 +uv run --isolated --active -m src.train \ + +run_async_trainer=true \ + data.train_data="['$DATA_PATH/train.parquet']" \ + data.val_data="['$DATA_PATH/validation.parquet']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.grpo_norm_by_std=false \ + trainer.policy.model.path=${MODEL} \ + trainer.placement.colocate_all=false \ + trainer.placement.colocate_policy_ref=true \ + trainer.strategy=fsdp2 \ + trainer.policy.fsdp_config.cpu_offload=true \ + trainer.policy.fsdp_config.reshard_after_forward=true \ + trainer.policy.fsdp_config.fsdp_size=-1 \ + trainer.fully_async.num_parallel_generation_workers=${BATCH_SIZE} \ + trainer.placement.policy_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.ref_num_gpus_per_node=${NUM_TRAINING_ENGINES} \ + trainer.placement.policy_num_nodes=1 \ + trainer.placement.ref_num_nodes=1 \ + trainer.policy.sequence_parallel_size=1 \ + generator.num_inference_engines=${NUM_INFERENCE_ENGINES} \ + generator.inference_engine_tensor_parallel_size=1 \ + +generator.traj_dir=${CKPT_PATH}trajectories/ \ + 
+generator.engine_init_kwargs.enable_auto_tool_choice=true \ + +generator.engine_init_kwargs.tool_call_parser="hermes" \ + +generator.engine_init_kwargs.max_model_len=40960 \ + +generator.prompts.system_prompt="templates/system_prompt_custom_finish.j2" \ + +generator.prompts.user_prompt="templates/file_module_custom_finish.j2" \ + +generator.engine_init_kwargs.disable_cascade_attn=true \ + trainer.epochs=1 \ + trainer.eval_batch_size=100 \ + trainer.eval_before_train=false \ + trainer.eval_interval=-1 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=${BATCH_SIZE} \ + trainer.policy_mini_batch_size=${BATCH_SIZE} \ + trainer.micro_forward_batch_size_per_gpu=1 \ + trainer.micro_train_batch_size_per_gpu=${MICRO_BATCH_SIZE} \ + trainer.dump_data_batch=true \ + trainer.export_path="${CKPT_PATH}exported_model/" \ + trainer.hf_save_interval=50 \ + trainer.ckpt_interval=10 \ + trainer.use_sample_packing=false \ + trainer.max_prompt_length=40960 \ + trainer.algorithm.policy_loss_type="gspo" \ + trainer.algorithm.eps_clip_low=0.0003 \ + trainer.algorithm.eps_clip_high=0.0004 \ + trainer.algorithm.loss_reduction="sequence_mean" \ + generator.sampling_params.max_generate_length=${MAX_LENGTH} \ + generator.sampling_params.temperature=1.0 \ + generator.max_input_length=40960 \ + generator.max_num_batched_tokens=131072 \ + generator.max_turns=10 \ + trainer.policy.optimizer_config.lr=1.0e-6 \ + trainer.algorithm.use_kl_loss=False \ + trainer.algorithm.use_kl_in_reward=False \ + generator.backend=vllm \ + generator.run_engines_locally=True \ + generator.enable_http_endpoint=True \ + generator.http_endpoint_host='0.0.0.0' \ + generator.http_endpoint_port=8080 \ + generator.weight_sync_backend=nccl \ + generator.async_engine=true \ + generator.batched=false \ + generator.n_samples_per_prompt=${N_ROLLOUTS} \ + generator.gpu_memory_utilization=0.8 \ + generator.enforce_eager=false \ + trainer.step_wise_training=${STEP_WISE} \ + trainer.logger="wandb" \ + 
trainer.project_name="code_search" \ + trainer.run_name=${RUN_NAME} \ + trainer.resume_mode=latest \ + trainer.ckpt_path="$LCAL_PATH" \ + trainer.max_ckpts_to_keep=5 \ + $OTHER_OPTION \ No newline at end of file diff --git a/scripts/run_prime_rl.sh b/scripts/run_prime_rl.sh deleted file mode 100755 index 8c394e7..0000000 --- a/scripts/run_prime_rl.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# Install ripgrep -sudo apt-get install ripgrep -y - -# Set PyTorch CUDA allocator config to reduce fragmentation -export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True - -# Navigate to prime-rl directory -cd $HOME/agentic-code-search-oss/prime-rl - -# Install the verifiers environment -uv pip install -e .. - -# Run RL training -uv run rl \ - --trainer @ ../configs/swe-grep-oss/rl/train.toml \ - --orchestrator @ ../configs/swe-grep-oss/rl/orch.toml \ - --inference @ ../configs/swe-grep-oss/rl/infer.toml - diff --git a/scripts/run_training.sh b/scripts/run_training.sh deleted file mode 100644 index cbc2561..0000000 --- a/scripts/run_training.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=cso -#SBATCH --output=../logs/%j.out -#SBATCH --error=../logs/%j.out -#SBATCH --partition=general -#SBATCH --gres=gpu:A100:2 -#SBATCH --nodes=1 -#SBATCH --time=2-00:00:00 -#SBATCH --mem=512G -#SBATCH --cpus-per-task=32 -#SBATCH --ntasks-per-node=1 -#SBATCH --exclude=babel-q5-28,babel-o5-20 - -. .env - -while getopts ":m:n:d:s:" opt; do - case ${opt} in - m ) MODEL=$OPTARG;; - n ) N_ROLLOUTS=$OPTARG;; - d ) DATA_PATH=$OPTARG;; - s ) CKPT_PATH=$OPTARG;; - # \? 
) echo "Usage: cmd [-u] [-p]";; - esac -done - -MODEL_ALIAS=$(echo $MODEL | sed 's/\//-/g') -# Get number of GPUs available -NUM_GPUS=$(nvidia-smi -L | wc -l) -N_ROLLOUTS="${N_ROLLOUTS:-4}" -MAX_LENGTH=2048 -RUN_NAME="code_search_${MODEL_ALIAS}" -set -x - -DATA_PATH="${DATA_PATH:-data/swe_smith}" -CKPT_PATH="${CKPT_PATH:-ckpts/${MODEL_ALIAS}}" -mkdir -p $CKPT_PATH - -NNODES=1 -NUM_INFERENCE_ENGINES=2 -TP_SIZE=1 -LOGGER=wandb - -# We use a small batch size here for demonstration -# NOTE (sumanthrh): The `generator.max_turns` here is actually unused, and we use the `step_limit` from the `swebench.yaml` file. -CUDA_LAUNCH_BLOCKING=1 uv run --isolated -m src.train \ - data.train_data="['$DATA_PATH/train.parquet']" \ - data.val_data="['$DATA_PATH/validation.parquet']" \ - trainer.algorithm.advantage_estimator="grpo" \ - trainer.policy.model.path=${MODEL} \ - trainer.placement.colocate_all=true \ - trainer.strategy=fsdp2 \ - trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ - trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \ - trainer.placement.policy_num_nodes=$NNODES \ - trainer.placement.ref_num_nodes=$NNODES \ - trainer.policy.sequence_parallel_size=$NUM_GPUS \ - generator.num_inference_engines=$NUM_INFERENCE_ENGINES \ - generator.inference_engine_tensor_parallel_size=$TP_SIZE \ - +generator.traj_dir=$CKPT_PATH/trajectories/ \ - +generator.engine_init_kwargs="{enable_auto_tool_choice:true,tool_call_parser:hermes}" \ - trainer.epochs=20 \ - trainer.eval_batch_size=100 \ - trainer.eval_before_train=false \ - trainer.eval_interval=100 \ - trainer.update_epochs_per_batch=1 \ - trainer.train_batch_size=4 \ - trainer.policy_mini_batch_size=4 \ - trainer.micro_forward_batch_size_per_gpu=2 \ - trainer.micro_train_batch_size_per_gpu=2 \ - trainer.dump_data_batch=true \ - trainer.ckpt_interval=10 \ - trainer.max_prompt_length=4096 \ - generator.sampling_params.max_generate_length=${MAX_LENGTH} \ - generator.max_input_length=24000 \ - 
generator.max_num_batched_tokens=48000 \ - generator.max_turns=20 \ - trainer.policy.optimizer_config.lr=1.0e-6 \ - trainer.algorithm.use_kl_loss=False \ - generator.backend=vllm \ - generator.run_engines_locally=True \ - generator.enable_http_endpoint=True \ - generator.http_endpoint_host='0.0.0.0' \ - generator.http_endpoint_port=8080 \ - generator.weight_sync_backend=nccl \ - generator.async_engine=true \ - generator.batched=true \ - generator.n_samples_per_prompt=${N_ROLLOUTS} \ - generator.gpu_memory_utilization=0.6 \ - trainer.logger="$LOGGER" \ - trainer.project_name="code_search" \ - trainer.run_name=${RUN_NAME} \ - trainer.resume_mode=null \ - trainer.ckpt_path="$CKPT_PATH" diff --git a/scripts/train_async.sh b/scripts/train_async.sh deleted file mode 100644 index faf6192..0000000 --- a/scripts/train_async.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Loop over 10 -for i in $(seq 1 10) -do - echo "Run number: $i" - # Kill any process using port 8080 after 4 hours - ( sleep 14400 && fuser -k 8080/tcp ) & \ - bash scripts/run_async_training.sh "$@" -done diff --git a/software-agent-sdk b/software-agent-sdk deleted file mode 160000 index 85ecfd9..0000000 --- a/software-agent-sdk +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 85ecfd9333d2d2cc4404dd460fd38868d9b978e2 diff --git a/src/async_trainer.py b/src/async_trainer.py index 7815122..840c80e 100644 --- a/src/async_trainer.py +++ b/src/async_trainer.py @@ -59,7 +59,6 @@ def patched_concatenate_generator_outputs(generator_outputs: List[GeneratorOutpu logger.info(f"Attempting to concatenate values for additional keys {additional_keys}") for key in additional_keys: try: - # result[key] = sum([generator_output[key] for generator_output in generator_outputs], []) additional_result[key] = np.mean([generator_output[key] for generator_output in generator_outputs]).item() except Exception as e: logger.error(f"Error in aggregating key {key}: {e}", exc_info=True) @@ -72,10 +71,6 @@ def 
patched_concatenate_generator_outputs(generator_outputs: List[GeneratorOutpu # Import here to avoid circular dependency. from skyrl_train.utils.trainer_utils import validate_generator_output - # print("trajectory_ids", result["trajectory_ids"]) - # print("rewards", result["rewards"]) - # print("is_last_step", result["is_last_step"]) - num_prompts = len(result["prompt_token_ids"]) validate_generator_output(num_prompts, result) @@ -131,19 +126,6 @@ def convert_generation_group_mini_batch_to_training_input( self.cfg.trainer.step_wise_training = False generator_output = self.postprocess_generator_output(generator_output, uids) - # # Truncate prompt_token_ids to avoid OOM - # max_prompt_len = self.cfg.trainer.max_prompt_length - # if max_prompt_len == -1: - # pass - # else: - # truncated_prompt_token_ids = [] - # for prompt_ids in generator_output["prompt_token_ids"]: - # if len(prompt_ids) > max_prompt_len: - # truncated_prompt_token_ids.append(prompt_ids[-max_prompt_len:]) - # else: - # truncated_prompt_token_ids.append(prompt_ids) - # generator_output["prompt_token_ids"] = truncated_prompt_token_ids - # print example just for debugging vis = self.tokenizer.decode(generator_output["response_ids"][0]) logger.info(f"Example generated: {vis}") diff --git a/src/build_dataset.py b/src/build_dataset.py index b28ae50..365f44f 100644 --- a/src/build_dataset.py +++ b/src/build_dataset.py @@ -3,15 +3,12 @@ from datasets import load_dataset -# from src.utils.dataset import extract_functions_from_patch - - def main(): parser = argparse.ArgumentParser(description="Build dataset from patches") parser.add_argument("--dataset", default="adityasoni17/SWE-smith-py-code-search", help="Input dataset path") parser.add_argument("--split", default="train", help="Dataset split to use") parser.add_argument("--output", required=True, help="Output file path for processed dataset") - parser.add_argument("--use_patch", action="store_true", help="Whether to use patches to extract target 
functions") + parser.add_argument("--use_patch", action="store_true", help="Whether to apply patch after pulling the repo (only set to true for SWE-Smith whose patch actually introduces the bug)") args = parser.parse_args() # Load and process dataset @@ -43,7 +40,6 @@ def main(): # shuffle dataset dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True) - # train_size = int(0.975 * len(dataset)) train_dataset = dataset.iloc[:-100] validation_dataset = dataset.iloc[-100:] @@ -56,6 +52,7 @@ def main(): output_path = os.path.join(output_dir, "validation.parquet") validation_dataset.to_parquet(output_path) + print(len(train_dataset), len(validation_dataset)) if __name__ == "__main__": diff --git a/src/constants.py b/src/constants.py deleted file mode 100644 index c71f050..0000000 --- a/src/constants.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -Constants used across the SWE-Grep OSS project. -""" - -# Maximum number of tool calls allowed per task -DEFAULT_MAX_TOOL_CALLS = 5 - -# Maximum number of tokens allowed per task -DEFAULT_MAX_TOKENS = 40960 - diff --git a/src/generator/code_search_generator.py b/src/generator/code_search_generator.py index 9431c9d..8e104be 100644 --- a/src/generator/code_search_generator.py +++ b/src/generator/code_search_generator.py @@ -64,7 +64,6 @@ from src.agent.agent import CustomAgent from src.rewards import get_reward_function -# from src.tools import TOOL_REGISTRY from src.metrics.efficiency_metrics import compute_all_efficiency_metrics from src.metrics.trajectory_metrics import compute_trajectory_metrics @@ -73,7 +72,6 @@ import signal logger = get_logger(__name__) -# logger.setLevel(logging.WARNING) logger.setLevel(logging.ERROR) file_path = os.path.dirname(__file__) @@ -142,20 +140,8 @@ def init_and_run( structured_locations = None messages = [] - # for tool_name in generator_cfg.tools: - # if tool_name in TOOL_REGISTRY: - # register_tool(tool_name, TOOL_REGISTRY[tool_name]) - # else: - # raise ValueError(f"Tool {tool_name} 
does not exist in the registry") - - # tools = [ - # Tool(name=tool_name) for tool_name in generator_cfg.tools - # ] - register_tool(LocalizationFinishTool.name, LocalizationFinishTool) tools = [ - # Tool(name=GlobTool.name), - # Tool(name=GrepTool.name), Tool(name=TerminalTool.name), Tool(name="localization_finish"), ] @@ -185,7 +171,6 @@ def init_and_run( } ), tools=tools, - # security_analyzer=None, system_prompt_filename=system_prompt_path ) @@ -268,17 +253,11 @@ def __init__( self.generator_cfg = generator_cfg self.tokenizer = tokenizer self.model_name = model_name - # self.litellm_model_name = "openai/" + self.model_name self.litellm_model_name = "openai/" + self.model_name - # if self.generator_cfg.chat_template.name_or_path is not None: - # raise NotImplementedError( - # "OpenhandsGenerator doesn't support custom chat template" - # ) - self.step_wise = step_wise self.max_train_length = generator_cfg.get( - "max_train_length", 32768 + "max_train_length", 100000 ) def sanity_check_last_step(self, token_messages): @@ -312,7 +291,6 @@ async def code_search_loop( trajectory_id: TrajectoryID, batch_metadata: BatchMetadata, ) -> Tuple[List[int], float, str, List[int], List[int], Optional[List[int]], Optional[Dict[str, Any]]]: - # sweagent_config = yaml.safe_load(get_config_path(self.generator_cfg.miniswe_config_path).read_text()) # NOTE (sumanthrh): Input `prompt` is not used here because mini-swe-agent uses a similar entry from the `instance` obj instance = env_extras error = None @@ -341,12 +319,6 @@ async def code_search_loop( "end_timestamp": None } - # print("=" * 100) - # print("Conversation finished. 
Got the following LLM messages:") - # for i, message in enumerate(messages): - # print(f"Message {i}: {str(message)[:100]}") - # print("Final message:", final_message) - # Run sanity check before computing the reward so that the logged metrics reflect the actual reward received in training token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] trajectory_exhausted_steps = structured_locations is None and len(token_messages) >= self.generator_cfg.max_turns @@ -415,7 +387,6 @@ async def code_search_loop( token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] rollout_list = [] - num_steps = len(token_messages) if len(token_messages) > 0: if self.step_wise: for idx, message in enumerate(token_messages): @@ -447,6 +418,8 @@ async def code_search_loop( max_response_len = max_train_len - len(current_prompt_ids) buffer_succeed = 5 # buffer tokens after assistant tag + if "Qwen3-4B-Instruct-2507" in self.model_name: + buffer_succeed = 1 #NOTE: 4B-Instruct doesn't have tokens so only the subsequent \n needs masking. 
 buffer_precede = 1 # buffer tokens before im_start tag + # make mask of 0 for everything inside <|im_start|> + # and assistant and 1 elsewhere @@ -455,6 +428,7 @@ mask = [] inside = False buffer = 0 + found_role_switch = False for token_id in current_response_ids: if token_id == start_token_id: inside = True @@ -462,7 +436,7 @@ mask.pop() mask.extend([0] * buffer_precede) mask.append(0) - elif token_id == end_token_id: + elif token_id == end_token_id and found_role_switch: inside = False mask.append(0) buffer = buffer_succeed @@ -474,12 +448,18 @@ buffer -= 1 else: mask.append(1) + + # mark role switch if <|im_start|> is found + if token_id == start_token_id: + found_role_switch = True + else: + found_role_switch = False # mask zero out everything beyond max_response_len # Don't truncate the response, just mask out the loss - if len(current_response_ids) > max_response_len: - for i in range(max_response_len, len(current_response_ids)): - mask[i] = 0 + # if len(current_response_ids) > max_response_len: + # for i in range(max_response_len, len(current_response_ids)): + # mask[i] = 0 # mask loss completely from trajectories that exhausted all steps without calling the custom finish tool if trajectory_exhausted_steps: diff --git a/src/metrics/efficiency_metrics.py b/src/metrics/efficiency_metrics.py index c0eda37..3074eb2 100644 --- a/src/metrics/efficiency_metrics.py +++ b/src/metrics/efficiency_metrics.py @@ -28,8 +28,11 @@ def compute_token_metrics(messages: List[Dict[str, Any]]) -> Dict[str, float]: "avg_prompt_tokens_per_step": 0.0, "avg_response_tokens_per_step": 0.0, } - - total_prompt_tokens = sum(len(msg.get("prompt_token_ids", [])) for msg in token_messages) + if len(token_messages) > 0: + total_prompt_tokens = len(token_messages[-1].get("prompt_token_ids", [])) + else: + total_prompt_tokens = 0 + # total_prompt_tokens = sum(len(msg.get("prompt_token_ids", [])) for msg in 
token_messages) total_response_tokens = sum(len(msg.get("response_token_ids", [])) for msg in token_messages) num_steps = len(token_messages) diff --git a/src/prompts/system_prompt.py b/src/prompts/system_prompt.py deleted file mode 100644 index 1aae5ad..0000000 --- a/src/prompts/system_prompt.py +++ /dev/null @@ -1,87 +0,0 @@ -SYSTEM_PROMPT = """ -You are a specialized code localization agent. Your sole objective is to identify and return the files in the codebase that are relevant to the user's query. -You are given access to the codebase in a linux file system. - -## PRIMARY DIRECTIVE -- Find relevant files, do NOT answer the user's query directly -- Return ONLY file paths in XML tags -- Prioritize precision: every file you return should be relevant -- You have up to 8 turns to explore and return your answer - -## TOOL USAGE REQUIREMENTS - -### bash tool (REQUIRED for search) -- You MUST use the bash tool to search and explore the codebase -- Execute bash commands like: rg, grep, find, ls, cat, head, tail, sed -- Use parallel tool calls: invoke bash tool up to 5 times concurrently in a single turn -- NEVER exceed 5 parallel tool calls per turn -- Common patterns: - * `rg "pattern" -t py` - search for code patterns - * `rg --files | grep "keyword"` - find files by name - * `cat path/to/file.py` - read file contents - * `find . 
-name "*.py" -type f` - locate files by extension - * `wc -l path/to/file.py` - count lines in a file - * `sed -n '1,100p' path/to/file.py` - read lines 1-100 of a file - * `head -n 100 path/to/file.py` - read first 100 lines - * `tail -n 100 path/to/file.py` - read last 100 lines - -### Reading Files (CRITICAL for context management) -- NEVER read entire large files with `cat` - this will blow up your context window -- ALWAYS check file size first: `wc -l path/to/file.py` -- For files > 100 lines, read in chunks: - * Use `sed -n '1,100p' file.py` to read lines 1-100 - * Use `sed -n '101,200p' file.py` to read lines 101-200 - * Continue with subsequent ranges as needed (201-300, 301-400, etc.) -- Strategic reading approach: - * Read the first 50-100 lines to see imports and initial structure - * Use `rg` to find specific patterns and their line numbers - * Read targeted line ranges around matches using `sed -n 'START,ENDp'` - * Only read additional chunks if the initial sections are relevant - -### Final Answer Format (REQUIRED) -- You MUST return your final answer in XML tags -- Format: path/to/file1.py\npath/to/file2.py\npath/to/file3.py -- List one file path per line inside the tags -- Use relative paths as they appear in the repository -- DO NOT include any other text inside the tags - -## SEARCH STRATEGY - -1. **Initial Exploration**: Cast a wide net - - Search for keywords, function names, class names - - Check file names and directory structure - - Use up to 3 parallel bash calls to explore multiple angles - - Check file sizes with `wc -l` before reading - - Read promising files in chunks (lines 1-100) to verify relevance - -2. **Deep Dive**: Follow the most promising leads - - Use up to 3 parallel bash calls to investigate further - - Read files in chunks to confirm they address the query - - Use `rg` with line numbers to locate specific code, then read those ranges - - Start eliminating false positives - -3. 
**Final Verification**: Confirm your file list - - Verify each candidate file is truly relevant - - Ensure you haven't missed related files - - Return your answer in tags - -## CRITICAL RULES -- NEVER exceed 5 parallel bash tool calls in a single turn -- NEVER respond without wrapping your file list in tags -- ALWAYS use bash tool to search (do not guess file locations) -- NEVER read entire large files - always read in chunks (100-line ranges) -- Check file size with `wc -l` before reading -- Read file contents in chunks to verify relevance before including them -- Return file paths as they appear in the repository. Do not begin the path with "./" -- Aim for high precision (all files relevant) and high recall (no relevant files missed) - -## EXAMPLE OUTPUT - -After exploring the codebase, return your answer like this: - - -src/main.py -src/utils/helper.py -tests/test_main.py - -""" diff --git a/src/rewards/cosine_rewards.py b/src/rewards/cosine_rewards.py deleted file mode 100644 index 17b2771..0000000 --- a/src/rewards/cosine_rewards.py +++ /dev/null @@ -1,111 +0,0 @@ -import math -from src.rewards import reward - -from src.rewards.file_localization.file_localization import ( - multilevel_localization_f1_reward, - file_localization_f1_reward - ) - -@reward("cosine_reward") -def cosine_reward( - final_message, - instance, - messages, - loc_threshold=1.5, - use_tool_reward=True, - use_turn_reward=True, - use_length_reward=False, - max_turns=8, - max_avg_tool_calls=10, - ideal_avg_tool_calls=5, - max_length=16384, - multilevel=True, - max_reward=5.0, - min_reward=-5.0, - **kwargs - ): - - try: - if multilevel: - loc_reward, reward_dict = multilevel_localization_f1_reward(final_message, instance, **kwargs) - else: - loc_reward, reward_dict = file_localization_f1_reward(final_message, instance, **kwargs) - - except Exception as e: - print(f"Error computing localization reward: {e}") - loc_reward = 0.0 - reward_dict = { - "multilevel_localization_f1_reward": 0.0, - 
"file_reward": 0.0, - "module_reward": 0.0, - "entity_reward": 0.0, - } - - def _cos_fn(t, T, mu_min, mu_max): - cos_inner = (math.pi * t) / T - cos_out = math.cos(cos_inner) + 1 - return mu_min + 0.5 * (mu_max - mu_min) * cos_out - - token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - # Don't count the last turn which is the - # final answer generation which can involve 1 tool or none - num_turns = max(1, len(token_messages) - 1) - num_tool_calls = len(tool_messages) - avg_tool_calls_per_turn = num_tool_calls / num_turns if num_turns > 0 else 0 - - reward = 0.0 - - # Number of turns - if use_turn_reward: - if num_turns > max_turns: - cosine_turn_reward = 0 - elif loc_reward >= loc_threshold: - cosine_turn_reward = _cos_fn(num_turns, max_turns, 0.0, max_reward) - else: - cosine_turn_reward = _cos_fn(num_turns, max_turns, 0.0, min_reward) - reward_dict["turn_cosine_reward"] = cosine_turn_reward - - reward += cosine_turn_reward - - # Length of response - if use_length_reward: - current_prompt_ids = token_messages[0]["prompt_token_ids"] - ending_prompt_ids = token_messages[-1]["prompt_token_ids"] - ending_response_ids = token_messages[-1]["response_token_ids"] - current_response_ids = ending_prompt_ids + ending_response_ids - current_response_ids = current_response_ids[len(current_prompt_ids):] - - current_length = len(current_prompt_ids) + len(current_response_ids) - - if current_length > max_length: - cosine_length_reward = 0 - elif loc_reward >= loc_threshold: - cosine_length_reward = _cos_fn(current_length, max_length, 0.0, max_reward) - else: - cosine_length_reward = _cos_fn(current_length, max_length, 0.0, min_reward) - reward_dict["length_cosine_reward"] = cosine_length_reward - - reward += cosine_length_reward - - # Number of tool calls - if use_tool_reward: - if avg_tool_calls_per_turn > max_avg_tool_calls: - cosine_tool_reward = 0 - elif loc_reward >= 
loc_threshold: - # Using 5 as the ideal average number of tool calls per turn - # Anything more or less than the max score - if avg_tool_calls_per_turn >= ideal_avg_tool_calls: - avg_tool_calls_per_turn -= ideal_avg_tool_calls - cosine_tool_reward = _cos_fn(avg_tool_calls_per_turn, ideal_avg_tool_calls, 1.0, max_reward) - else: - cosine_tool_reward = _cos_fn(avg_tool_calls_per_turn, ideal_avg_tool_calls, max_reward, 1.0) - else: - # If wrong, encourage to do more calls - cosine_tool_reward = _cos_fn(avg_tool_calls_per_turn, max_avg_tool_calls, 0.0, min_reward) - reward_dict["tool_cosine_reward"] = cosine_tool_reward - - reward += cosine_tool_reward - - return reward, reward_dict diff --git a/src/rewards/format_reward.py b/src/rewards/format_reward.py deleted file mode 100644 index 07db1c2..0000000 --- a/src/rewards/format_reward.py +++ /dev/null @@ -1,20 +0,0 @@ -import re -from src.rewards import reward - -@reward("format_reward") -def format_reward( - final_message: str, - START_STRING: str = "```", - END_STRING: str = "```", - penalize: bool = True, - **kwargs - ): - - final_message = final_message.strip() - if final_message.startswith(START_STRING) and END_STRING in final_message: - return 1.0, {"format_reward": 1.0} - else: - if penalize: - return -5.0, {"format_reward": -5.0} - else: - return 0.0, {"format_reward": 0.0} diff --git a/src/rewards/result_tool_check.py b/src/rewards/result_tool_check.py deleted file mode 100644 index e050414..0000000 --- a/src/rewards/result_tool_check.py +++ /dev/null @@ -1,14 +0,0 @@ -import verifiers as vf - -from src.utils.get_result_tool_call import get_result_tool_call - - -def result_tool_check( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Check if the result tool call is successful. 
- """ - - _, success = get_result_tool_call(completion) - return 1.0 if success else 0.0 diff --git a/src/rewards/result_tool_f1.py b/src/rewards/result_tool_f1.py deleted file mode 100644 index db88261..0000000 --- a/src/rewards/result_tool_f1.py +++ /dev/null @@ -1,32 +0,0 @@ -import verifiers as vf - -from src.utils.result_tool_metrics import ( - calculate_f1, - calculate_precision, - calculate_recall, - get_file_sets, -) - - -def result_tool_f1( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Calculate file-level F1 score. - - F1 = 2 * (precision * recall) / (precision + recall) - - Measures: Harmonic mean of precision and recall. - - Args: - answer: Should contain the patch string - """ - result_files, patch_files = get_file_sets(completion, answer) - - if result_files is None or patch_files is None: - return 0.0 - - precision = calculate_precision(result_files, patch_files) - recall = calculate_recall(result_files, patch_files) - - return calculate_f1(precision, recall) diff --git a/src/rewards/result_tool_precision.py b/src/rewards/result_tool_precision.py deleted file mode 100644 index 66f420d..0000000 --- a/src/rewards/result_tool_precision.py +++ /dev/null @@ -1,24 +0,0 @@ -import verifiers as vf - -from src.utils.result_tool_metrics import calculate_precision, get_file_sets - - -def result_tool_precision( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Calculate file-level precision. - - Precision = |result_files ∩ patch_files| / |result_files| - - Measures: Of the files the agent identified, what percentage are correct? 
- - Args: - answer: Should contain the patch string - """ - result_files, patch_files = get_file_sets(completion, answer) - - if result_files is None or patch_files is None: - return 0.0 - - return calculate_precision(result_files, patch_files) diff --git a/src/rewards/result_tool_recall.py b/src/rewards/result_tool_recall.py deleted file mode 100644 index 8723b3a..0000000 --- a/src/rewards/result_tool_recall.py +++ /dev/null @@ -1,25 +0,0 @@ -import verifiers as vf - -from src.utils.result_tool_metrics import calculate_recall, get_file_sets - - -def result_tool_recall( - prompt, completion: vf.types.Messages, answer, state, task, info -) -> float: - """ - Calculate file-level recall. - - Recall = |result_files ∩ patch_files| / |patch_files| - - Measures: Of all the files in the patch, what percentage did the - agent identify? - - Args: - answer: Should contain the patch string - """ - result_files, patch_files = get_file_sets(completion, answer) - - if result_files is None or patch_files is None: - return 0.0 - - return calculate_recall(result_files, patch_files) diff --git a/src/rewards/scaled_f1.py b/src/rewards/scaled_f1.py deleted file mode 100644 index 80cfb45..0000000 --- a/src/rewards/scaled_f1.py +++ /dev/null @@ -1,57 +0,0 @@ -from src.rewards import reward - -from src.rewards.file_localization.file_localization import ( - multilevel_localization_f1_reward, - file_localization_f1_reward - ) - -@reward("scaled_f1_reward") -def scaled_f1_reward( - final_message, - messages, - instance, - multilevel=False, - **kwargs - ): - - try: - if multilevel: - loc_reward, reward_dict = multilevel_localization_f1_reward(final_message, instance, **kwargs) - else: - loc_reward, reward_dict = file_localization_f1_reward(final_message, instance, **kwargs) - - except Exception as e: - print(f"Error computing localization reward: {e}") - loc_reward = 0.0 - reward_dict = { - "multilevel_localization_f1_reward": 0.0, - "file_reward": 0.0, - "module_reward": 0.0, - 
"entity_reward": 0.0, - } - - token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - num_turns = len(token_messages) - 1 - if num_turns == 0: - num_turns = 1 # to avoid division by zero - - num_tool_calls = len(tool_messages) - - avg_tool_calls_per_turn = num_tool_calls / num_turns if num_turns > 0 else 0 - if avg_tool_calls_per_turn > 5: - avg_tool_calls_per_turn = 5 # cap at ideal avg tool calls - - avg_tool_calls_per_turn = avg_tool_calls_per_turn / 5 # normalize by ideal avg tool calls - - reward_dict["tool_use_reward"] = avg_tool_calls_per_turn - - # Penalize if no tool calls were made - if avg_tool_calls_per_turn <= 0: - reward = -5 - return reward, reward_dict - - reward = loc_reward * avg_tool_calls_per_turn - - return reward, reward_dict \ No newline at end of file diff --git a/src/rewards/tool_use.py b/src/rewards/tool_use.py deleted file mode 100644 index 588c282..0000000 --- a/src/rewards/tool_use.py +++ /dev/null @@ -1,49 +0,0 @@ -from src.rewards import reward - -@reward("tool_use_reward") -def tool_use_reward(messages, max_tool_use=5, penalize=False, clamp=False, reduction="mean", **kwargs) -> float: - token_messages = [msg for msg in messages if msg["kind"] == "TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - num_turns = max(1, len(token_messages) - 1) - num_tool_calls = len(tool_messages) - - if num_tool_calls == 0: - if penalize: - return -5.0 - else: - return 0.0 - - if reduction == "mean": - average_tool_use = num_tool_calls / num_turns - reward = min(average_tool_use, max_tool_use) / max_tool_use - else: - reward = min(num_tool_calls, max_tool_use) / max_tool_use - - if clamp: - if reward > 0: - return 1.0 - else: - return 0.0 - else: - return reward - -@reward("turn_efficiency") -def turn_efficiency(messages, max_turns=5, **kwargs) -> float: - token_messages = [msg for msg in messages if msg["kind"] == 
"TokenEvent"] - tool_messages = [msg for msg in messages if msg["kind"] == "ActionEvent"] - - num_turns = len(token_messages) - num_tool_calls = len(tool_messages) - - if num_turns <= 1: - return 0.0 - - if (num_tool_calls > 1): - # Decay the reward if more than max_turns are used - if num_turns <= max_turns: - return 1.0 - else: - return max(0.0, 1.0 - (num_turns - max_turns) * 0.1) - - return 0.0 \ No newline at end of file diff --git a/src/tools/__init__.py b/src/tools/__init__.py index d4580b1..e69de29 100644 --- a/src/tools/__init__.py +++ b/src/tools/__init__.py @@ -1,64 +0,0 @@ -import importlib -import pkgutil -from pathlib import Path - -TOOL_REGISTRY = {} - -DEFAULT_OPENHANDS_TOOLS = [ - "apply_patch", - "browser_use", - "delegate", - "file_editor", - "glob", - "grep", - "planning_file_editor", - "preset", - "task_tracker", - "terminal", - "tom_consult" -] - -def tool_exists(tool_name: str): - """Check if a tool exists in the registry.""" - return tool_name in DEFAULT_OPENHANDS_TOOLS or tool_name in TOOL_REGISTRY - -def tool(name: str): - """Decorator to register a new tool function.""" - def decorator(func): - if name in DEFAULT_OPENHANDS_TOOLS: - raise ValueError(f"Tool name '{name}' is an in-built openhands tool and cannot be overridden.") - - # Track the tool in local registry for run-time validation - TOOL_REGISTRY[name] = func - return func - return decorator - -def _auto_load_tools(): - """Automatically discover and import all tool modules to register functions.""" - current_dir = Path(__file__).parent - - # Recursively import all Python modules - def _import_submodules(path, package_name): - # Import all Python modules in this directory - for importer, modname, ispkg in pkgutil.iter_modules([str(path)]): - if modname != '__init__': - try: - importlib.import_module(f'.{modname}', package=package_name) - except ImportError: - pass - - # Recursively process subdirectories - for item in path.iterdir(): - if item.is_dir() and not 
item.name.startswith('_'): - try: - # Import the package (runs __init__.py if it exists) - importlib.import_module(f'.{item.name}', package=package_name) - except ImportError: - pass - # Recursively import modules from subdirectories - _import_submodules(item, f'{package_name}.{item.name}') - - _import_submodules(current_dir, __name__) - -# Auto-load all tool functions on import -_auto_load_tools() diff --git a/src/tools/localization_finish.py b/src/tools/localization_finish.py index b914a0a..9514143 100644 --- a/src/tools/localization_finish.py +++ b/src/tools/localization_finish.py @@ -20,7 +20,6 @@ ) from openhands.sdk.tool import ToolExecutor, ToolAnnotations from openhands.sdk.conversation.state import ConversationExecutionStatus -from src.tools import tool if TYPE_CHECKING: from openhands.sdk.conversation.base import BaseConversation @@ -43,8 +42,6 @@ class LocalizationFinishAction(Action): """ ) - # message: str = Field(description="Code localization submission sent to the user.") - @property def visualize(self) -> Text: """Return Rich Text representation of this action.""" @@ -163,13 +160,4 @@ def create( ), ) ] - -@tool(name="localization_finish") -def _make_localization_finish_tool() -> list[ToolDefinition]: - """Create localization finish tool. - - This is a localization-specific finish tool that accepts structured locations - and validates the output format. 
- """ - return LocalizationFinishTool.create() \ No newline at end of file diff --git a/src/train.py b/src/train.py index d60dc4d..3cd9210 100644 --- a/src/train.py +++ b/src/train.py @@ -6,7 +6,6 @@ import asyncio -# from src.tools import tool_exists from src.generator.code_search_generator import CodeSearchGenerator from src.async_trainer import CustomFullyAsyncRayPPOTrainer as FullyAsyncRayPPOTrainer # from skyrl_train.fully_async_trainer import FullyAsyncRayPPOTrainer @@ -70,10 +69,6 @@ def main(cfg: DictConfig) -> None: # validate the arguments validate_cfg(cfg) - # cfg.trainer.policy.deepspeed_config.zero_optimization.offload_param.device = "cpu" - # cfg.trainer.policy.deepspeed_config.zero_optimization.offload_optimizer.device = "cpu" - # cfg.trainer.policy.deepspeed_config.zero_optimization.zero_hpz_partition_size = 8 - print("cfg.trainer.policy.deepspeed_config") print(cfg.trainer.policy.deepspeed_config) @@ -97,11 +92,6 @@ def main(cfg: DictConfig) -> None: cfg.generator.tools = [ "terminal", ] - - # # Check if the tool exists in the registry - # for tool in cfg.generator.tools: - # if not tool_exists(tool): - # raise ValueError(f"Tool {tool} does not exist in the registry") # Set default prompts if not specified if not hasattr(cfg.generator, "prompts"): diff --git a/src/utils/dataset.py b/src/utils/dataset.py deleted file mode 100644 index 47a21ab..0000000 --- a/src/utils/dataset.py +++ /dev/null @@ -1,67 +0,0 @@ -import re - - -def extract_functions_from_patch(input_diff: str): - """ - Parse a unified diff and extract, per file, the starting line of each hunk and the old line count. - - Returns: List[Tuple[str, List[int, int]]] - Example: [("path/to/file.py", [start_line, old_count]), ...] - """ - - results: dict[str, list[list[int]]] = {} - current_file: str | None = None - in_hunk = False - hunk_old_start = None - hunk_old_count = None - - # Regex for hunk header: @@ -old_start,old_count +new_start,new_count @@ ... 
- hunk_re = re.compile(r"^@@ -(?P\d+)(?:,(?P\d+))? \+(?P\d+)(?:,(?P\d+))? @@") - - def flush_hunk(): - nonlocal hunk_old_start, hunk_old_count, in_hunk - if current_file is None or hunk_old_start is None: - return - count = hunk_old_count if hunk_old_count is not None else 1 - results.setdefault(current_file, []).append([hunk_old_start, count]) - # Reset hunk state - in_hunk = False - hunk_old_start = None - hunk_old_count = None - - for raw_line in input_diff.strip().splitlines(): - line = raw_line.rstrip("\n") - - # Track current file being processed via the new file path header - if line.startswith("+++ b/"): - # Starting a new file ends any current hunk - if in_hunk: - flush_hunk() - current_file = line[6:] - continue - - # A new hunk header starts - m = hunk_re.match(line) - if m and current_file: - # Flush any previous hunk before starting a new one - if in_hunk: - flush_hunk() - in_hunk = True - hunk_old_start = int(m.group("old_start")) - old_count_str = m.group("old_count") - hunk_old_count = int(old_count_str) if old_count_str is not None else 1 - continue - - # Flush any unterminated hunk at EOF - if in_hunk: - flush_hunk() - - targets = [] - for file, hunks in results.items(): - for hunk in hunks: - targets.append( - (file, hunk) - ) - return targets - # return results - diff --git a/src/utils/get_instance.py b/src/utils/get_instance.py deleted file mode 100644 index bb6d52a..0000000 --- a/src/utils/get_instance.py +++ /dev/null @@ -1,160 +0,0 @@ -import argparse -import os -from pathlib import Path - -from datasets import load_dataset - - -def _default_repos_dir() -> Path: - """Resolve the default path for the cloned SWE-bench repositories. 
- - Priority order: - 1) Environment variable SWEBENCH_REPOS_DIR if set - 2) Project root joined with "swebench_repos" (project root - inferred from this file) - 3) Current working directory joined with "swebench_repos" as a final - fallback - """ - env_override = os.getenv("SWEBENCH_REPOS_DIR") - if env_override: - return Path(env_override).expanduser().resolve() - - # This file lives at /src/utils/get_instance.py - project_root = Path(__file__).resolve().parents[2] - candidate = project_root / "swebench_repos" - if candidate.exists() or candidate.parent.exists(): - return candidate - - # Fallback to CWD - return Path.cwd() / "swebench_repos" - - -def get_instance_path(instance: dict, output_dir: Path | None = None) -> Path: - """ - Get the filesystem path for a SWE-bench instance. - - Args: - instance: Dictionary with 'repo' and 'instance_id' keys - output_dir: Base directory where instances are cloned - - Returns: - Path to the instance directory - """ - if output_dir is None: - output_dir = _default_repos_dir() - - repo_name = instance["repo"] - instance_id = instance["instance_id"] - dir_name = f"{repo_name.replace('/', '_')}_{instance_id}" - return output_dir / dir_name - - -def main(): - parser = argparse.ArgumentParser( - description="Get filesystem path for SWE-bench instance" - ) - parser.add_argument( - "--instance-id", - type=str, - help="Instance ID to look up (e.g., astropy__astropy-12907)", - ) - parser.add_argument( - "--index", - type=int, - help="Dataset index to look up (e.g., 0 for first instance)", - ) - parser.add_argument( - "--output-dir", - type=str, - default=str(_default_repos_dir()), - help=( - "Base directory where instances are cloned. 
Defaults to " - "SWEBENCH_REPOS_DIR if set, else /swebench_repos" - ), - ) - parser.add_argument( - "--dataset", - type=str, - default="princeton-nlp/SWE-bench_Lite", - help="SWE-bench dataset to use", - ) - parser.add_argument( - "--check", - action="store_true", - help="Check if the path exists and show info", - ) - - args = parser.parse_args() - - if not args.instance_id and args.index is None: - parser.error("Either --instance-id or --index must be provided") - - # Load dataset - print(f"Loading dataset: {args.dataset}") - dataset = load_dataset(args.dataset, split="test") - print(f"✓ Loaded {len(dataset)} instances\n") - - # Find the instance - if args.instance_id: - # Find by instance_id - instance = None - for inst in dataset: - if inst["instance_id"] == args.instance_id: - instance = inst - break - if not instance: - print(f"✗ Instance ID '{args.instance_id}' not found in dataset") - return - else: - # Get by index - if args.index < 0 or args.index >= len(dataset): - print(f"✗ Index {args.index} out of range [0, {len(dataset)-1}]") - return - instance = dataset[args.index] - - # Get the path - output_dir = Path(args.output_dir) - instance_path = get_instance_path(instance, output_dir) - - # Display info - print("=" * 80) - print("Instance Information:") - print("=" * 80) - print(f"Instance ID: {instance['instance_id']}") - print(f"Repository: {instance['repo']}") - print(f"Base Commit: {instance['base_commit']}") - print("\nFilesystem Path:") - print(f" {instance_path.absolute()}") - - # Check if exists - if args.check: - print("\n" + "=" * 80) - print("Path Check:") - print("=" * 80) - if instance_path.exists(): - print("✓ Path exists") - print("\nDirectory contents (first 10 items):") - items = sorted(instance_path.iterdir())[:10] - for item in items: - item_type = "📁" if item.is_dir() else "📄" - print(f" {item_type} {item.name}") - all_items = list(instance_path.iterdir()) - if len(all_items) > 10: - extra_count = len(all_items) - 10 - print(f" ... 
and {extra_count} more") - - # Count Python files - py_files = list(instance_path.rglob("*.py")) - print(f"\nTotal Python files: {len(py_files)}") - else: - print("✗ Path does not exist") - print("\nTo clone this instance, run:") - cmd = ( - f" python scripts/clone_repos.py --max-instances 1 " - f'--dataset "{args.dataset}"' - ) - print(cmd) - - -if __name__ == "__main__": - main() diff --git a/src/utils/instance_old.py b/src/utils/instance_old.py deleted file mode 100644 index d76d41c..0000000 --- a/src/utils/instance_old.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -import subprocess -from pathlib import Path - -from datasets import load_dataset -from tqdm import tqdm - - -def clone_instance( - repo_name: str, commit_id: str, instance_id: str, output_dir: Path -) -> bool: - """ - Clone a repository at a specific commit into a separate directory. - - Args: - repo_name: Repository name in format 'owner/repo' - commit_id: Commit hash to checkout - instance_id: Instance ID for directory naming - output_dir: Base output directory - - Returns: - True if successful, False otherwise - """ - # Create instance directory name: repo_instance-id - # E.g., astropy_astropy-12907 - instance_dir_name = f"{repo_name.replace('/', '_')}_{instance_id}" - instance_path = output_dir / instance_dir_name - - # Skip if already exists - if instance_path.exists(): - print(f" ✓ Instance {instance_id} already exists") - return True, instance_path - - try: - # Clone the repository - subprocess.run( - [ - "git", - "clone", - f"https://github.com/{repo_name}.git", - str(instance_path), - ], - check=True, - capture_output=True, - text=True, - ) - - # Checkout the specific commit - subprocess.run( - ["git", "-C", str(instance_path), "checkout", commit_id], - check=True, - capture_output=True, - text=True, - ) - - print(f" ✓ Cloned {instance_id} at commit {commit_id[:8]}") - return True, instance_path - except subprocess.CalledProcessError as e: - print(f" ✗ Error cloning {instance_id}: 
{e.stderr}") - return False, None diff --git a/src/utils/parse_patch.py b/src/utils/parse_patch.py deleted file mode 100644 index 241b2a4..0000000 --- a/src/utils/parse_patch.py +++ /dev/null @@ -1,130 +0,0 @@ -import re - - -def parse_patch(patch: str) -> dict: - """ - Parse a git diff patch and extract file paths with their line ranges. - - Args: - patch: Git diff patch string - - Returns: - Dictionary mapping file paths to their modified line ranges: - { - "file_path": { - "old_start": int, - "old_count": int, - "new_start": int, - "new_count": int, - "hunks": [ - { - "old_start": int, - "old_count": int, - "new_start": int, - "new_count": int - }, - ... - ] - }, - ... - } - """ - result = {} - - # Split patch into individual file diffs - file_diffs = re.split(r"^diff --git ", patch, flags=re.MULTILINE) - - for file_diff in file_diffs: - if not file_diff.strip(): - continue - - # Extract file path from the diff header - # Format: a/path/to/file b/path/to/file - file_match = re.search( - r"a/(.*?) b/(.*?)$", file_diff, flags=re.MULTILINE - ) - if not file_match: - continue - - file_path = file_match.group(2) # Use the 'b/' path (new file) - - # Find all hunks in this file diff - # Hunk header format: @@ -old_start,old_count +new_start,new_count @@ - hunk_pattern = r"@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? 
@@" - hunks = [] - - for match in re.finditer(hunk_pattern, file_diff): - old_start = int(match.group(1)) - old_count = int(match.group(2)) if match.group(2) else 1 - new_start = int(match.group(3)) - new_count = int(match.group(4)) if match.group(4) else 1 - - hunks.append( - { - "old_start": old_start, - "old_count": old_count, - "new_start": new_start, - "new_count": new_count, - } - ) - - if hunks: - # Calculate overall ranges - old_start = min(h["old_start"] for h in hunks) - old_end = max(h["old_start"] + h["old_count"] - 1 for h in hunks) - new_start = min(h["new_start"] for h in hunks) - new_end = max(h["new_start"] + h["new_count"] - 1 for h in hunks) - - result[file_path] = { - "old_start": old_start, - "old_count": old_end - old_start + 1, - "new_start": new_start, - "new_count": new_end - new_start + 1, - "hunks": hunks, - } - - return result - - -def add_patch_info(example): - """ - Dataset transformation function to add parsed patch info. - - Args: - example: Dataset example with 'patch' field - - Returns: - Example with added 'patch_info' field - """ - example["patch_info"] = parse_patch(example["patch"]) - return example - - -if __name__ == "__main__": - from datasets import load_dataset - - # Load dataset - print("Loading dataset...") - dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test") - - # Get first instance - instance = dataset[0] - - print("\n" + "=" * 80) - print("Instance ID:", instance["instance_id"]) - print("=" * 80) - - # Show original patch - print("\nOriginal patch:") - print("-" * 80) - print(instance["patch"]) - - # Parse the patch - patch_info = parse_patch(instance["patch"]) - - print("\n" + "=" * 80) - print("Full patch_info dict:") - print("=" * 80) - import json - - print(json.dumps(patch_info, indent=2)) diff --git a/swe_grep_oss_env.py b/swe_grep_oss_env.py deleted file mode 100644 index 197d866..0000000 --- a/swe_grep_oss_env.py +++ /dev/null @@ -1,332 +0,0 @@ -import json -import logging -import traceback 
-from typing import Literal - -import verifiers as vf -from datasets import load_dataset -from openai import AsyncOpenAI - -import src.tools as tools -from src.constants import DEFAULT_MAX_TOKENS, DEFAULT_MAX_TOOL_CALLS -from src.prompts.system_prompt import SYSTEM_PROMPT -from src.utils.get_instance import get_instance_path -from src.utils.parse_patch import parse_patch - -logger = logging.getLogger("swe-grep-oss") - - -class SWEGrepEnv(vf.StatefulToolEnv): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - # Only add bash tool - no result tool needed with XML output - self.add_tool(tools.bash, args_to_skip=["cwd"]) - - async def is_completed( - self, messages: vf.types.Messages, state: vf.types.State, **kwargs - ) -> bool: - max_turns_reached = await self.max_turns_reached(state) - prompt_too_long = await self.prompt_too_long(state) - - # Check if the last message contains XML tags - has_files_tag = False - if messages and len(messages) > 0: - last_message = messages[-1] - if last_message.get("role") == "assistant": - content = last_message.get("content", "") - if isinstance(content, str) and "" in content and "" in content: - has_files_tag = True - - if has_files_tag or max_turns_reached or prompt_too_long: - return True - - return False - - async def env_response( - self, messages: vf.types.Messages, state: vf.types.State, **kwargs - ) -> tuple[vf.types.Messages, vf.types.State]: - assert isinstance(messages, list) - - tool_messages = [] - tool_calls = messages[-1].get("tool_calls", []) - for tool_call in tool_calls: - tool_name: str = tool_call.get("function", {}).get("name", "") - tool_call_id: str = tool_call.get("id", "") - - arguments_str = tool_call.get("function", {}).get("arguments", "") - - try: - tool_args = json.loads(arguments_str) - - # Handle double-encoded JSON (when json.loads returns a string instead of dict) - if isinstance(tool_args, str): - self.logger.warning( - f"Double-encoded JSON detected, attempting to parse again: 
{tool_args[:100]}" - ) - tool_args = json.loads(tool_args) - - # Final check: must be a dict - if not isinstance(tool_args, dict): - raise TypeError(f"Expected dict, got {type(tool_args).__name__}") - - except (json.JSONDecodeError, TypeError) as e: - self.logger.error(f"Failed to parse tool arguments: {e}") - self.logger.error(f"Raw arguments: {repr(arguments_str)}") - tool_messages.append( - { - "role": "tool", - "content": f"Error: Invalid tool arguments - {str(e)}", - "tool_call_id": tool_call_id, - } - ) - continue - - tool_args = self.update_tool_args(tool_name, tool_args, messages, state, **kwargs) - tool_message: vf.types.Message = await self.call_tool( - tool_name, tool_args, tool_call_id - ) - tool_messages.append(tool_message) - return tool_messages, state - - async def rollout( - self, - client: AsyncOpenAI, - model: str, - prompt: vf.types.Messages, - completion: vf.types.Messages | None = None, - answer: str = "", - state: vf.types.State = {}, - task: str = "default", - info: vf.types.Info | None = None, - example_id: int = 0, - sampling_args: vf.types.SamplingArgs | None = None, - **kwargs, - ) -> tuple[vf.types.Messages, vf.types.State]: - try: - return await super().rollout( - client, - model, - prompt, - completion, - answer, - state, - task, - info, - example_id, - sampling_args, - **kwargs, - ) - except Exception as e: - import traceback - - self.logger.error(f"Error in rollout: {e}") - self.logger.error(f"Traceback: {traceback.format_exc()}") - raise # Re-raise to see the actual error - - def update_tool_args( - self, - tool_name: str, - tool_args: dict, - messages: vf.types.Messages, - state: vf.types.State, - **kwargs, - ) -> dict: - try: - if tool_name == "bash": - repo_path = get_instance_path( - { - "repo": state["info"]["repo"], - "instance_id": state["info"]["instance_id"], - } - ) - updated_tool_args = dict(tool_args) - updated_tool_args["cwd"] = repo_path - return updated_tool_args - except Exception: - # Add detailed logging - 
self.logger.error(f"update_tool_args called with tool_name={tool_name}") - self.logger.error(f"tool_args type: {type(tool_args)}") - self.logger.error(f"tool_args value: {repr(tool_args)}") - self.logger.error(f"messages: {messages}") - self.logger.error(f"Traceback: {traceback.format_exc()}") - raise # Re-raise to see the actual error - - return tool_args - - -def load_environment( - max_tokens: int = DEFAULT_MAX_TOKENS, - max_tool_calls: int = DEFAULT_MAX_TOOL_CALLS, - mode: Literal["train", "test", "full", "rl"] = "rl", - **kwargs, -): - """ - Load and configure the SWE-Grep environment. - - Args: - max_tokens: Maximum tokens for model responses - max_tool_calls: Maximum number of tool calls allowed - mode: Dataset mode - "train" (80%), "test" (20%), "full" (100%), or "rl" (train+eval split) - **kwargs: Additional arguments passed to SWEGrepEnv - - Returns: - SWEGrepEnv instance configured with the specified dataset - """ - - # Load and prepare dataset - full_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test") - full_dataset = full_dataset.shuffle(seed=42) - - # Transform dataset with metadata and prompts - def transform_row(row): - return { - "info": { - "repo": row["repo"], - "instance_id": row["instance_id"], - "max_tokens": max_tokens, - "max_tool_calls": max_tool_calls, - }, - "prompt": [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": row["problem_statement"]}, - ], - "answer": json.dumps(parse_patch(row["patch"])), - } - - full_dataset = full_dataset.map(transform_row) - - # Split dataset for train/eval modes - split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42) - train_dataset = split_dataset["train"] - eval_dataset = split_dataset["test"] - - # XML parser to extract files from tags - parser = vf.XMLParser(["files"], answer_field="files") - - # Reward function: F1 score between predicted and actual files - def file_localization_reward(completion: str, answer: str, **kwargs) -> float: - """ 
- Calculate F1 score between predicted files and actual files from patch. - """ - - # Helper function to normalize file paths - def normalize_path(path: str) -> str: - """Remove leading './' from file paths for consistent comparison.""" - path = path.strip() - if path.startswith("./"): - path = path[2:] - return path - - # Parse the model's response - predicted_files_str = parser.parse_answer(completion) - if predicted_files_str is None: - return 0.0 - - try: - # Try to parse as JSON array - if predicted_files_str.strip().startswith("["): - predicted_files = json.loads(predicted_files_str) - else: - # Split by newlines and filter empty - predicted_files = [f.strip() for f in predicted_files_str.split("\n") if f.strip()] - except: - predicted_files = [] - - # Parse the ground truth answer - try: - actual_files = json.loads(answer) if isinstance(answer, str) else answer - except: - actual_files = [] - - # Normalize paths and convert to sets for comparison - predicted_set = set(normalize_path(f) for f in predicted_files) - actual_set = set(normalize_path(f) for f in actual_files) - - # Calculate F1 score - if len(predicted_set) == 0 and len(actual_set) == 0: - return 1.0 - if len(predicted_set) == 0 or len(actual_set) == 0: - return 0.0 - - true_positives = len(predicted_set & actual_set) - precision = true_positives / len(predicted_set) if len(predicted_set) > 0 else 0.0 - recall = true_positives / len(actual_set) if len(actual_set) > 0 else 0.0 - - if precision + recall == 0: - return 0.0 - - f1 = 2 * (precision * recall) / (precision + recall) - return f1 - - # Reward that countr total no of turns with atleast 1 tool call - def turns_with_tool_calls(completion: vf.types.Messages) -> float: - """ - Count the total number of turns with atleast 1 tool call. 
- """ - if not isinstance(completion, list): - return 0.0 - - count = 0 - for message in completion: - if isinstance(message, dict) and "tool_calls" in message: - tool_calls = message.get("tool_calls", []) - if len(tool_calls) > 0: - count += 1 - - if count == 0: - return 0 - - return 1 - - # Reward to increase number of tool calls per turn - def tool_call_count_per_turn(completion: vf.types.Messages) -> float: - """ - Count the number of tool calls per turn. - """ - if not isinstance(completion, list): - return 0.0 - - counts = [] - for message in completion: - if isinstance(message, dict) and "tool_calls" in message: - tool_calls = message.get("tool_calls", []) - if tool_calls: - counts.append(len(tool_calls)) - - if len(counts) == 0: - return 0.0 - - avg_count = sum(counts) / len(counts) - - # clip it at 5 tool calls per turn - avg_count = min(5.0, avg_count) / 5.0 - - return avg_count - - # Define rubric with single F1 reward - rubric = vf.Rubric( - funcs=[file_localization_reward, turns_with_tool_calls, tool_call_count_per_turn], - weights=[1.0, 1.0, 1.0], - ) - - # Common environment configuration - env_config = { - "parser": parser, - "rubric": rubric, - "max_turns": 8, - **kwargs, - } - - # Select dataset(s) based on mode - if mode == "full": - env_config["dataset"] = full_dataset - elif mode == "train": - env_config["dataset"] = train_dataset - elif mode == "test": - env_config["dataset"] = eval_dataset - else: # mode == "rl" (default) - env_config["dataset"] = train_dataset - env_config["eval_dataset"] = eval_dataset - - return SWEGrepEnv(**env_config) diff --git a/tests/test_data_extract_functions_from_patch.py b/tests/test_data_extract_functions_from_patch.py deleted file mode 100644 index e48ebe6..0000000 --- a/tests/test_data_extract_functions_from_patch.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest - -from src.utils.dataset import extract_functions_from_patch - - -def test_extract_functions_from_patch(): - diff = """diff --git 
a/moto/rds/exceptions.py b/moto/rds/exceptions.py ---- a/moto/rds/exceptions.py -+++ b/moto/rds/exceptions.py -@@ -82,6 +82,14 @@ def __init__(self, database_identifier: str): -) - - -+class DBClusterToBeDeletedHasActiveMembers(RDSClientError): -+ def __init__(self) -> None: -+ super().__init__( -+ "InvalidDBClusterStateFault", -+ "Cluster cannot be deleted, it still contains DB instances in non-deleting state.", -+ ) -+ -+ -class InvalidDBInstanceStateError(RDSClientError): -def __init__(self, database_identifier: str, istate: str): -estate = ( -diff --git a/moto/rds/models.py b/moto/rds/models.py ---- a/moto/rds/models.py -+++ b/moto/rds/models.py -@@ -19,6 +19,7 @@ -DBClusterNotFoundError, -DBClusterSnapshotAlreadyExistsError, -DBClusterSnapshotNotFoundError, -+ DBClusterToBeDeletedHasActiveMembers, -DBInstanceNotFoundError, -DBSnapshotNotFoundError, -DBSecurityGroupNotFoundError, -@@ -2339,7 +2340,8 @@ def delete_db_cluster( -raise InvalidParameterValue( -"Can't delete Cluster with protection enabled" -) -- -+ if cluster.cluster_members: -+ raise DBClusterToBeDeletedHasActiveMembers() -global_id = cluster.global_cluster_identifier or "" -if global_id in self.global_clusters: -self.remove_from_global_cluster(global_id, cluster_identifier)""" - - result = extract_functions_from_patch(diff) - - # Note: In unified diff headers, paths are prefixed with a/ and b/. - # After stripping the leading 'b/', the actual path here is 'b/b.py'. - assert result == [ - ("moto/rds/exceptions.py", [82, 6]), - ("moto/rds/models.py", [19, 6]), - ("moto/rds/models.py", [2339, 7]) - ] \ No newline at end of file