From 207d839cd4170879f385c18f1ba8b55670ae6b4d Mon Sep 17 00:00:00 2001 From: Leander Maben Date: Thu, 1 Jan 2026 18:46:03 -0500 Subject: [PATCH 1/3] Add Custom Finish Tool --- configs/skyrl-experiments/read-only.yaml | 1 + configs/skyrl-experiments/terminal.yaml | 1 + src/prompts/templates/system_prompt.j2 | 75 +++-- src/tools/localization_finish.py | 361 +++++++++++++++++++++++ tests/tools/test_localization_finish.py | 263 +++++++++++++++++ 5 files changed, 681 insertions(+), 20 deletions(-) create mode 100644 src/tools/localization_finish.py create mode 100644 tests/tools/test_localization_finish.py diff --git a/configs/skyrl-experiments/read-only.yaml b/configs/skyrl-experiments/read-only.yaml index 4d5525a..4f48460 100644 --- a/configs/skyrl-experiments/read-only.yaml +++ b/configs/skyrl-experiments/read-only.yaml @@ -9,6 +9,7 @@ tools: - glob - grep - terminal + - localization_finish prompts: system_prompt: "templates/system_prompt.j2" diff --git a/configs/skyrl-experiments/terminal.yaml b/configs/skyrl-experiments/terminal.yaml index 6ad62c6..613ad3f 100644 --- a/configs/skyrl-experiments/terminal.yaml +++ b/configs/skyrl-experiments/terminal.yaml @@ -7,6 +7,7 @@ reward: tools: - terminal + - localization_finish prompts: system_prompt: "templates/system_prompt.j2" diff --git a/src/prompts/templates/system_prompt.j2 b/src/prompts/templates/system_prompt.j2 index f5a67aa..462f7ba 100644 --- a/src/prompts/templates/system_prompt.j2 +++ b/src/prompts/templates/system_prompt.j2 @@ -36,12 +36,40 @@ You are given access to the codebase in a linux file system. * Read targeted line ranges around matches using `sed -n 'START,ENDp'` * Only read additional chunks if the initial sections are relevant -### Final Answer Format (REQUIRED) -- You MUST return your final answer in backticks ``` ... ``` -- Format: ```\nfull_path1/file1.py\nclass: MyClass1\nfunction: my_function1\n\nfull_path2/file2.py\nfunction: MyClass2.my_function2\n\nfull_path3/file3.py\nfunction: my_function3\n``` -- List one file path per line -- Use relative paths as they appear in the repository -- DO NOT include any other text inside the backticks +### Submitting Your Answer (REQUIRED) + +When you have identified all relevant locations, use the `finish` tool to submit your results. + +**Format Requirements:** +- Wrap your answer in triple backticks ``` ... ``` +- Each location must start with a file path (ending in .py) +- Class name is OPTIONAL - include only if the change is within a specific class +- Function name is OPTIONAL - include only if the change is at function/method level + +**When to include what:** +- File-level changes (imports, globals, new classes): List just the file +- Class-level changes (new methods, attributes): List file + class (no function) +- Standalone function: List file + function (no class) +- Method in a class: List file + class + function + +**Example formats:** +``` +# File-only (imports, globals, new class added) +path/to/file1.py + +# File + Class (class-level changes) +path/to/file2.py +class: MyClass + +# File + Function (standalone function) +path/to/file3.py +function: my_function + +# File + Class + Function (method change) +path/to/file4.py +class: MyClass +function: my_method +``` ## SEARCH STRATEGY @@ -61,32 +89,39 @@ You are given access to the codebase in a linux file system. 3. **Final Verification**: Confirm your file list - Verify each candidate file is truly relevant - Ensure you haven't missed related files - - Return your answer in backticks ``` ... ``` + - Use the `finish` tool to submit your answer ## CRITICAL RULES - NEVER exceed 5 parallel bash tool calls in a single turn -- NEVER respond without wrapping your file list in backticks ``` +- ALWAYS use the `finish` tool when you're done (wrap your answer in backticks) - ALWAYS use bash tool to search (do not guess file locations) - NEVER read entire large files - always read in chunks (100-line ranges) - Check file size with `wc -l` before reading - Read file contents in chunks to verify relevance before including them - Return file paths as they appear in the repository. Do not begin the path with "./" - Aim for high precision (all files relevant) and high recall (no relevant files missed) +- Class and function names are OPTIONAL - only include when changes are at that level -## EXAMPLE OUTPUT +## EXAMPLE SUBMISSION -After exploring the codebase, return your answer like this: +When ready, call the `finish` tool with your findings: -Your final output should list the locations requiring modification, wrapped with triple backticks ``` -Each location should include the file path, class name (if applicable), and function name. Here is an example Output: ``` -full_path1/file1.py -class: MyClass1 -function: my_function1 +src/utils/parser.py +class: DataParser +function: parse_json -full_path2/file2.py -function: MyClass2.my_function2 +src/models/user.py +class: User + +src/config.py + +src/api/endpoints.py +function: handle_request +``` -full_path3/file3.py -function: my_function3 -``` \ No newline at end of file +**Note:** In this example: +- `parser.py` has a specific method change +- `user.py` has a class-level change (no specific function) +- `config.py` has file-level changes (imports/globals) +- `endpoints.py` has a standalone function change \ No newline at end of file diff --git a/src/tools/localization_finish.py b/src/tools/localization_finish.py new file mode 100644 index 0000000..a762904 --- /dev/null +++ b/src/tools/localization_finish.py @@ -0,0 +1,361 @@ +"""Custom finish tool for code localization tasks. + +This tool allows the agent to submit localization results in a flexible format where: +- File path is required +- Class name is optional +- Function name is optional +""" + +import os +from typing import TYPE_CHECKING +from collections.abc import Sequence + +from pydantic import Field +from rich.text import Text + +from openhands.sdk import ( + Action, + Observation, + ToolDefinition, +) +from openhands.sdk.tool import ToolExecutor + +from src.tools import tool + +if TYPE_CHECKING: + from openhands.sdk.conversation.base import BaseConversation + + +class LocalizationFinishAction(Action): + """Action for submitting final localization results.""" + + message: str = Field( + description="""Your final localization results in the specified format. + +Format: +``` +path/to/file1.py +class: ClassName +function: method_name + +path/to/file2.py +function: standalone_function + +path/to/file3.py +``` + +Requirements: +- Each location must have a file path +- Class name is optional (omit if change is file-level or function is standalone) +- Function name is optional (omit if change is file-level or class-level only) +- Wrap your answer in triple backticks + +Example for different scenarios: +- File-level change (imports, globals): Just list the file +- Class-level change (new method, attributes): List file + class +- Method change: List file + class + function +- Standalone function: List file + function +""" + ) + + @property + def visualize(self) -> Text: + """Return Rich Text representation of this action.""" + content = Text() + content.append("Submitting localization results:\n", style="bold blue") + content.append(self.message) + return content + + +class LocalizationFinishObservation(Observation): + """Observation returned after submitting localization results.""" + + success: bool = Field(default=True, description="Whether submission was successful") + num_locations: int = Field(default=0, description="Number of locations submitted") + validation_message: str = Field(default="", description="Validation feedback") + details: dict = Field(default_factory=dict, description="Additional details") + + @property + def visualize(self) -> Text: + """Return Rich Text representation of this observation.""" + content = Text() + if self.success: + content.append(f"✓ Successfully submitted {self.num_locations} location(s)\n", style="bold green") + else: + content.append(f"✗ Submission failed\n", style="bold red") + content.append(f"{self.validation_message}\n", style="yellow") + return content + + +def parse_localization_output(raw_output: str) -> list[dict]: + """Parse localization output with optional class and function. + + This is an enhanced version of parse_simple_output that handles: + - File-only entries (no class or function) + - File + class entries (no function) + - File + function entries (no class) + - File + class + function entries + + Args: + raw_output: Raw text output to parse + + Returns: + List of dictionaries with 'file', 'class' (optional), 'function' (optional) + """ + # Remove triple backticks and whitespace + raw_output = raw_output.strip("` \n") + + locations = [] + current_file = None + current_class = None + current_function = None + + lines = raw_output.strip().split("\n") + + for line in lines: + line = line.strip() + + if not line: + # Empty line - save current location if we have a file + if current_file: + locations.append({ + "file": current_file, + "class": current_class, + "function": current_function, + }) + current_file = None + current_class = None + current_function = None + continue + + # Check if this is a file path (ends with .py) + if line.endswith(".py"): + # Save previous location if exists + if current_file: + locations.append({ + "file": current_file, + "class": current_class, + "function": current_function, + }) + # Start new location + current_file = line + current_class = None + current_function = None + continue + + # Parse class declaration + if line.startswith("class:"): + class_name = line[len("class:"):].strip() + current_class = class_name + continue + + # Parse function/method declaration + if line.startswith("function:") or line.startswith("method:"): + func_text = line.split(":", 1)[1].strip() + func_name = func_text.split()[0].strip("() ") + + # Check if function includes class prefix (e.g., "MyClass.my_method") + if "." in func_name: + parts = func_name.split(".", 1) + current_class = parts[0] + current_function = parts[1] + else: + current_function = func_name + continue + + # Don't forget the last location + if current_file: + locations.append({ + "file": current_file, + "class": current_class, + "function": current_function, + }) + + return locations + + +class LocalizationFinishExecutor(ToolExecutor): + """Executor for localization finish tool with validation.""" + + def __init__(self, workspace_dir: str | None = None): + """Initialize the executor. + + Args: + workspace_dir: Optional workspace directory to validate file existence. + """ + self.workspace_dir = workspace_dir + + def __call__( + self, + action: LocalizationFinishAction, + conversation: "BaseConversation | None" = None, + ) -> LocalizationFinishObservation: + """Execute the finish action with validation. + + Args: + action: The localization finish action to execute + conversation: Optional conversation context + + Returns: + LocalizationFinishObservation with validation results + """ + + try: + # Parse the output to validate format + locations = parse_localization_output(action.message) + num_locs = len(locations) + + # Validation 1: Check if any locations were found + if num_locs == 0: + return LocalizationFinishObservation( + success=False, + num_locations=0, + validation_message=( + "No valid locations found. Please provide at least one file path " + "in the correct format wrapped in triple backticks." + ), + details={"error": "empty_output"} + ) + + # Validation 2: Check each location has a file path + errors = [] + for i, loc in enumerate(locations): + if not loc.get('file'): + errors.append(f"Location {i+1} is missing a file path") + + if errors: + return LocalizationFinishObservation( + success=False, + num_locations=0, + validation_message="\n".join(errors), + details={"error": "missing_file_paths", "locations": locations} + ) + + # Validation 3: Check file existence (if workspace provided) + if self.workspace_dir: + missing_files = [] + for loc in locations: + file_path = loc['file'] + full_path = os.path.join(self.workspace_dir, file_path) + if not os.path.exists(full_path): + missing_files.append(file_path) + + if missing_files: + return LocalizationFinishObservation( + success=False, + num_locations=num_locs, + validation_message=( + f"Warning: {len(missing_files)} file(s) not found in workspace:\n" + + "\n".join(f" - {f}" for f in missing_files[:5]) + + (f"\n ... and {len(missing_files) - 5} more" if len(missing_files) > 5 else "") + ), + details={ + "warning": "files_not_found", + "missing_files": missing_files, + "locations": locations + } + ) + + # Success! + return LocalizationFinishObservation( + success=True, + num_locations=num_locs, + validation_message=f"Successfully submitted {num_locs} location(s).", + details={"locations": locations} + ) + + except Exception as e: + # Parsing failed + return LocalizationFinishObservation( + success=False, + num_locations=0, + validation_message=( + f"Error parsing output: {str(e)}\n\n" + "Please ensure your output follows the correct format:\n" + "```\n" + "path/to/file.py\n" + "class: ClassName\n" + "function: method_name\n" + "```" + ), + details={"error": "parse_error", "exception": str(e)} + ) + + +TOOL_DESCRIPTION = """Submit your final code localization results. + +Use this tool when you have identified all relevant files, classes, and functions +that need to be modified to address the issue described in the problem statement. + +Format your results as follows: +``` +path/to/file1.py +class: ClassName +function: method_name + +path/to/file2.py +function: standalone_function + +path/to/file3.py +``` + +Requirements: +- Wrap your output in triple backticks (```) +- Each location must start with a file path +- Class name is OPTIONAL - include only if the change is within a specific class +- Function name is OPTIONAL - include only if the change is at function/method level + +When to omit class/function: +- File-level only (imports, globals, new classes): List just the file +- Class-level only (new methods, attributes): List file + class (no function) +- Standalone function: List file + function (no class) +- Method in class: List file + class + function + +The tool will validate your submission and provide feedback if the format is incorrect. +""" + + +class LocalizationFinishTool(ToolDefinition[LocalizationFinishAction, LocalizationFinishObservation]): + """Tool for submitting final code localization results.""" + + @classmethod + def create( + cls, + conv_state, + workspace_dir: str | None = None, + **params + ) -> Sequence["LocalizationFinishTool"]: + """Create LocalizationFinishTool instance. + + Args: + conv_state: Conversation state (provides workspace info) + workspace_dir: Optional workspace directory override + **params: Additional parameters + + Returns: + A sequence containing a single LocalizationFinishTool instance. + """ + # Get workspace from conv_state if not provided + if workspace_dir is None and hasattr(conv_state, 'workspace'): + workspace_dir = str(conv_state.workspace.working_dir) + + executor = LocalizationFinishExecutor(workspace_dir=workspace_dir) + + return [ + cls( + action_type=LocalizationFinishAction, + observation_type=LocalizationFinishObservation, + description=TOOL_DESCRIPTION, + executor=executor, + ) + ] + + +@tool(name="finish") +def _make_localization_finish_tool(conv_state) -> list[ToolDefinition]: + """Create localization finish tool. + + This replaces the default finish tool with a localization-specific version + that validates the output format. + """ + return LocalizationFinishTool.create(conv_state) diff --git a/tests/tools/test_localization_finish.py b/tests/tools/test_localization_finish.py new file mode 100644 index 0000000..2e348e9 --- /dev/null +++ b/tests/tools/test_localization_finish.py @@ -0,0 +1,263 @@ +"""Tests for the localization finish tool.""" + +import os +import tempfile +import pytest +from pathlib import Path + +from src.tools.localization_finish import ( + LocalizationFinishAction, + LocalizationFinishExecutor, + LocalizationFinishObservation, +) + + +class TestLocalizationFinishExecutor: + """Tests for LocalizationFinishExecutor.""" + + def setup_method(self): + """Create executor and temp workspace for each test.""" + self.temp_dir = tempfile.mkdtemp() + self.executor = LocalizationFinishExecutor(workspace_dir=self.temp_dir) + + def teardown_method(self): + """Clean up temp workspace.""" + import shutil + if os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) + + def test_empty_message(self): + """Test with empty message.""" + action = LocalizationFinishAction(message="") + result = self.executor(action) + + assert result.success is False + assert result.num_locations == 0 + assert "No valid locations" in result.validation_message + + def test_malformed_input(self): + """Test with malformed input.""" + action = LocalizationFinishAction(message="random text without proper format") + result = self.executor(action) + + assert result.success is False + # Malformed input results in no locations found + assert "No valid locations" in result.validation_message + + def test_file_only_valid(self): + """Test valid file-level localization (no class or function).""" + # Create the file in workspace + file_path = "test_file.py" + Path(self.temp_dir, file_path).touch() + + action = LocalizationFinishAction(message=f"""``` +{file_path} +```""") + result = self.executor(action) + + assert result.success is True + assert result.num_locations == 1 + assert "Successfully submitted" in result.validation_message + + def test_file_and_class_valid(self): + """Test valid class-level localization (no function).""" + file_path = "test_file.py" + Path(self.temp_dir, file_path).touch() + + action = LocalizationFinishAction(message=f"""``` +{file_path} +class: MyClass +```""") + result = self.executor(action) + + assert result.success is True + assert result.num_locations == 1 + + def test_file_and_function_valid(self): + """Test valid function-level localization (no class).""" + file_path = "test_file.py" + Path(self.temp_dir, file_path).touch() + + action = LocalizationFinishAction(message=f"""``` +{file_path} +function: my_function +```""") + result = self.executor(action) + + assert result.success is True + assert result.num_locations == 1 + + def test_complete_localization_valid(self): + """Test complete localization with file, class, and function.""" + file_path = "test_file.py" + Path(self.temp_dir, file_path).touch() + + action = LocalizationFinishAction(message=f"""``` +{file_path} +class: MyClass +function: my_method +```""") + result = self.executor(action) + + assert result.success is True + assert result.num_locations == 1 + + def test_multiple_locations_mixed_formats(self): + """Test multiple locations with different formats.""" + # Create files + for i in range(1, 5): + Path(self.temp_dir, f"file{i}.py").touch() + + action = LocalizationFinishAction(message="""``` +file1.py + +file2.py +class: ClassA + +file3.py +function: func_b + +file4.py +class: ClassC +function: method_d +```""") + result = self.executor(action) + + assert result.success is True + assert result.num_locations == 4 + assert "Successfully submitted 4 location(s)" in result.validation_message + + def test_missing_file_warning(self): + """Test that missing files trigger a warning.""" + action = LocalizationFinishAction(message="""``` +nonexistent_file.py +```""") + result = self.executor(action) + + # Should still parse but with warning + assert result.success is False + assert result.num_locations == 1 + assert "not found in workspace" in result.validation_message + assert "nonexistent_file.py" in result.validation_message + + def test_some_files_missing(self): + """Test when some files exist and some don't.""" + # Create only one file + Path(self.temp_dir, "exists.py").touch() + + action = LocalizationFinishAction(message="""``` +exists.py + +missing.py +```""") + result = self.executor(action) + + assert result.success is False + assert result.num_locations == 2 + assert "missing.py" in result.validation_message + + def test_no_backticks(self): + """Test that output without backticks fails parsing.""" + action = LocalizationFinishAction(message="""file.py +class: MyClass +""") + result = self.executor(action) + + # Should fail because parse_simple_output expects backticks + assert result.success is False + + def test_executor_without_workspace(self): + """Test executor without workspace validation.""" + executor = LocalizationFinishExecutor(workspace_dir=None) + + action = LocalizationFinishAction(message="""``` +any_file.py +```""") + result = executor(action) + + # Should succeed without file existence check + assert result.success is True + assert result.num_locations == 1 + + def test_nested_file_path(self): + """Test with nested file paths.""" + # Create nested structure + nested_dir = Path(self.temp_dir, "src", "utils") + nested_dir.mkdir(parents=True) + Path(nested_dir, "helper.py").touch() + + action = LocalizationFinishAction(message="""``` +src/utils/helper.py +function: process +```""") + result = self.executor(action) + + assert result.success is True + assert result.num_locations == 1 + + def test_dotted_function_name(self): + """Test with ClassName.method_name format.""" + file_path = "test.py" + Path(self.temp_dir, file_path).touch() + + action = LocalizationFinishAction(message=f"""``` +{file_path} +function: MyClass.my_method +```""") + result = self.executor(action) + + assert result.success is True + assert result.num_locations == 1 + # Should parse as class=MyClass, function=my_method + assert result.details["locations"][0]["class"] == "MyClass" + assert result.details["locations"][0]["function"] == "my_method" + + def test_multiple_missing_files_truncated(self): + """Test that many missing files are truncated in message.""" + action = LocalizationFinishAction(message="""``` +file1.py +file2.py +file3.py +file4.py +file5.py +file6.py +file7.py +```""") + result = self.executor(action) + + assert result.success is False + assert result.num_locations == 7 + # Should show truncation message + assert "and 2 more" in result.validation_message + + +class TestLocalizationFinishObservation: + """Tests for LocalizationFinishObservation visualization.""" + + def test_success_visualization(self): + """Test visualization of successful observation.""" + obs = LocalizationFinishObservation( + success=True, + num_locations=3, + validation_message="Successfully submitted 3 location(s)." + ) + text = obs.visualize + + assert "✓" in str(text) + assert "3 location(s)" in str(text) + + def test_failure_visualization(self): + """Test visualization of failed observation.""" + obs = LocalizationFinishObservation( + success=False, + num_locations=0, + validation_message="Missing file paths" + ) + text = obs.visualize + + assert "✗" in str(text) + assert "Missing file paths" in str(text) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 8b0d97a876b2438ece942f7c0610a94578c289cd Mon Sep 17 00:00:00 2001 From: Leander Maben Date: Fri, 2 Jan 2026 01:59:39 -0500 Subject: [PATCH 2/3] Take File Object arguments for the Finish Tool instead of String --- src/prompts/templates/system_prompt.j2 | 101 +++++--- .../file_localization/module_rewards.py | 30 ++- src/tools/localization_finish.py | 243 +++++++----------- tests/tools/test_localization_finish.py | 149 +++++------ 4 files changed, 246 insertions(+), 277 deletions(-) diff --git a/src/prompts/templates/system_prompt.j2 b/src/prompts/templates/system_prompt.j2 index 462f7ba..a79f2d1 100644 --- a/src/prompts/templates/system_prompt.j2 +++ b/src/prompts/templates/system_prompt.j2 @@ -38,37 +38,49 @@ You are given access to the codebase in a linux file system. ### Submitting Your Answer (REQUIRED) -When you have identified all relevant locations, use the `finish` tool to submit your results. +When you have identified all relevant locations, use the `localization_finish` tool to submit your results. **Format Requirements:** -- Wrap your answer in triple backticks ``` ... ``` -- Each location must start with a file path (ending in .py) -- Class name is OPTIONAL - include only if the change is within a specific class -- Function name is OPTIONAL - include only if the change is at function/method level +Submit a structured list of locations. Each location is a JSON object with: +- `file`: Path to the file (REQUIRED) +- `class_name`: Class name (OPTIONAL - omit for file-level or standalone functions) +- `function_name`: Function/method name (OPTIONAL - omit for file-level or class-level only) **When to include what:** -- File-level changes (imports, globals, new classes): List just the file -- Class-level changes (new methods, attributes): List file + class (no function) -- Standalone function: List file + function (no class) -- Method in a class: List file + class + function +- File-level changes (imports, globals, new top-level classes): Just `file` +- Class-level changes (new methods, attributes, entire class): `file` + `class_name` +- Standalone function (top-level function): `file` + `function_name` +- Method in a class: `file` + `class_name` + `function_name` **Example formats:** + +1. File-only (imports, globals, new class): +```json +{"file": "path/to/file1.py"} ``` -# File-only (imports, globals, new class added) -path/to/file1.py -# File + Class (class-level changes) -path/to/file2.py -class: MyClass +2. File + Class (class-level changes): +```json +{"file": "path/to/file2.py", "class_name": "MyClass"} +``` -# File + Function (standalone function) -path/to/file3.py -function: my_function +3. File + Function (standalone function): +```json +{"file": "path/to/file3.py", "function_name": "my_function"} +``` + +4. File + Class + Function (method): +```json +{"file": "path/to/file4.py", "class_name": "MyClass", "function_name": "my_method"} +``` -# File + Class + Function (method change) -path/to/file4.py -class: MyClass -function: my_method +5. Multiple locations: +```json +[ + {"file": "src/parser.py", "class_name": "DataParser", "function_name": "parse_json"}, + {"file": "src/models/user.py", "class_name": "User"}, + {"file": "src/config.py"} +] ``` ## SEARCH STRATEGY @@ -89,11 +101,11 @@ function: my_method 3. **Final Verification**: Confirm your file list - Verify each candidate file is truly relevant - Ensure you haven't missed related files - - Use the `finish` tool to submit your answer + - Use the `localization_finish` tool to submit your answer ## CRITICAL RULES - NEVER exceed 5 parallel bash tool calls in a single turn -- ALWAYS use the `finish` tool when you're done (wrap your answer in backticks) +- ALWAYS use the `localization_finish` tool when you're done - ALWAYS use bash tool to search (do not guess file locations) - NEVER read entire large files - always read in chunks (100-line ranges) - Check file size with `wc -l` before reading @@ -104,24 +116,31 @@ function: my_method ## EXAMPLE SUBMISSION -When ready, call the `finish` tool with your findings: - -``` -src/utils/parser.py -class: DataParser -function: parse_json - -src/models/user.py -class: User - -src/config.py - -src/api/endpoints.py -function: handle_request +When ready, call the `localization_finish` tool with your findings: + +```json +[ + { + "file": "src/utils/parser.py", + "class_name": "DataParser", + "function_name": "parse_json" + }, + { + "file": "src/models/user.py", + "class_name": "User" + }, + { + "file": "src/config.py" + }, + { + "file": "src/api/endpoints.py", + "function_name": "handle_request" + } +] ``` **Note:** In this example: -- `parser.py` has a specific method change -- `user.py` has a class-level change (no specific function) -- `config.py` has file-level changes (imports/globals) -- `endpoints.py` has a standalone function change \ No newline at end of file +- `parser.py` has a specific method change (file + class + function) +- `user.py` has a class-level change (file + class only) +- `config.py` has file-level changes (file only) +- `endpoints.py` has a standalone function change (file + function only) \ No newline at end of file diff --git a/src/rewards/file_localization/module_rewards.py b/src/rewards/file_localization/module_rewards.py index 8be9d18..c0fd68c 100644 --- a/src/rewards/file_localization/module_rewards.py +++ b/src/rewards/file_localization/module_rewards.py @@ -1,17 +1,17 @@ import logging -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union -def parse_simple_output(raw_output: str) -> List[Dict[str, str]]: +def parse_simple_output(raw_output: Union[str, List[Dict[str, str]]]) -> List[Dict[str, str]]: """ Parse simplified agent output containing filename, optional class, and function. Args: - raw_output: Raw text output from the agent + raw_output: Either a raw text string OR a list of location dicts (for structured input) Returns: List of dictionaries with keys: 'file', 'class' (optional), 'function' - Example input format: + Example string input format: ``` path/to/file1.py class: MyClass @@ -21,12 +21,32 @@ def parse_simple_output(raw_output: str) -> List[Dict[str, str]]: function: standalone_function ``` - Example output: + Example structured input format: + [ + {'file': 'path/to/file1.py', 'class': 'MyClass', 'function': 'my_method'}, + {'file': 'path/to/file2.py', 'class': None, 'function': 'standalone_function'} + ] + + Example output (same for both): [ {'file': 'path/to/file1.py', 'class': 'MyClass', 'function': 'my_method'}, {'file': 'path/to/file2.py', 'class': None, 'function': 'standalone_function'} ] """ + # Handle structured input (list of dicts) + if isinstance(raw_output, list): + # Already in the correct format (or close to it) + # Normalize field names: class_name -> class, function_name -> function + normalized = [] + for loc in raw_output: + normalized.append({ + 'file': loc.get('file', ''), + 'class': loc.get('class') or loc.get('class_name'), + 'function': loc.get('function') or loc.get('function_name'), + }) + return normalized + + # Handle string input (legacy format) # Remove triple backticks and whitespace raw_output = raw_output.strip("` \n") diff --git a/src/tools/localization_finish.py b/src/tools/localization_finish.py index a762904..513d371 100644 --- a/src/tools/localization_finish.py +++ b/src/tools/localization_finish.py @@ -1,6 +1,6 @@ """Custom finish tool for code localization tasks. -This tool allows the agent to submit localization results in a flexible format where: +This tool allows the agent to submit localization results in a structured format where: - File path is required - Class name is optional - Function name is optional @@ -10,7 +10,7 @@ from typing import TYPE_CHECKING from collections.abc import Sequence -from pydantic import Field +from pydantic import BaseModel, Field, computed_field from rich.text import Text from openhands.sdk import ( @@ -26,44 +26,62 @@ from openhands.sdk.conversation.base import BaseConversation -class LocalizationFinishAction(Action): - """Action for submitting final localization results.""" - - message: str = Field( - description="""Your final localization results in the specified format. +class CodeLocation(BaseModel): + """A single code location with optional class and function.""" -Format: -``` -path/to/file1.py -class: ClassName -function: method_name + file: str = Field(description="Path to the file (required)") + class_name: str | None = Field(default=None, description="Class name (optional)") + function_name: str | None = Field(default=None, description="Function/method name (optional)") -path/to/file2.py -function: standalone_function -path/to/file3.py -``` - -Requirements: -- Each location must have a file path -- Class name is optional (omit if change is file-level or function is standalone) -- Function name is optional (omit if change is file-level or class-level only) -- Wrap your answer in triple backticks +class LocalizationFinishAction(Action): + """Action for submitting final localization results.""" -Example for different scenarios: -- File-level change (imports, globals): Just list the file -- Class-level change (new method, attributes): List file + class -- Method change: List file + class + function -- Standalone function: List file + function + locations: list[CodeLocation] = Field( + description="""List of code locations to modify. Each location must have: +- file: Path to the file (required) +- class_name: Class name (optional, omit for file-level or standalone functions) +- function_name: Function/method name (optional, omit for file-level or class-level only) + +Examples: +- File-level change: {"file": "src/config.py"} +- Class-level change: {"file": "src/user.py", "class_name": "User"} +- Standalone function: {"file": "src/utils.py", "function_name": "helper"} +- Method in class: {"file": "src/parser.py", "class_name": "Parser", "function_name": "parse"} """ ) + @computed_field + @property + def message(self) -> str: + """Auto-generate message from locations for backward compatibility.""" + if not self.locations: + return "" + + lines = [] + for loc in self.locations: + lines.append(loc.file) + if loc.class_name: + lines.append(f"class: {loc.class_name}") + if loc.function_name: + lines.append(f"function: {loc.function_name}") + lines.append("") # Empty line between locations + + return "```\n" + "\n".join(lines).rstrip() + "\n```" + @property def visualize(self) -> Text: """Return Rich Text representation of this action.""" content = Text() content.append("Submitting localization results:\n", style="bold blue") - content.append(self.message) + content.append(f"Found {len(self.locations)} location(s):\n", style="green") + for i, loc in enumerate(self.locations, 1): + content.append(f" {i}. {loc.file}", style="cyan") + if loc.class_name: + content.append(f" → {loc.class_name}", style="yellow") + if loc.function_name: + content.append(f".{loc.function_name}", style="magenta") + content.append("\n") return content @@ -87,91 +105,23 @@ def visualize(self) -> Text: return content -def parse_localization_output(raw_output: str) -> list[dict]: - """Parse localization output with optional class and function. - - This is an enhanced version of parse_simple_output that handles: - - File-only entries (no class or function) - - File + class entries (no function) - - File + function entries (no class) - - File + class + function entries +def locations_to_dict_list(locations: list[CodeLocation]) -> list[dict]: + """Convert CodeLocation objects to dictionary format. Args: - raw_output: Raw text output to parse + locations: List of CodeLocation objects Returns: - List of dictionaries with 'file', 'class' (optional), 'function' (optional) + List of dictionaries with 'file', 'class', 'function' keys """ - # Remove triple backticks and whitespace - raw_output = raw_output.strip("` \n") - - locations = [] - current_file = None - current_class = None - current_function = None - - lines = raw_output.strip().split("\n") - - for line in lines: - line = line.strip() - - if not line: - # Empty line - save current location if we have a file - if current_file: - locations.append({ - "file": current_file, - "class": current_class, - "function": current_function, - }) - current_file = None - current_class = None - current_function = None - continue - - # Check if this is a file path (ends with .py) - if line.endswith(".py"): - # Save previous location if exists - if current_file: - locations.append({ - "file": current_file, - "class": current_class, - "function": current_function, - }) - # Start new location - current_file = line - current_class = None - current_function = None - continue - - # Parse class declaration - if line.startswith("class:"): - class_name = line[len("class:"):].strip() - current_class = class_name - continue - - # Parse function/method declaration - if line.startswith("function:") or line.startswith("method:"): - func_text = line.split(":", 1)[1].strip() - func_name = func_text.split()[0].strip("() ") - - # Check if function includes class prefix (e.g., "MyClass.my_method") - if "." in func_name: - parts = func_name.split(".", 1) - current_class = parts[0] - current_function = parts[1] - else: - current_function = func_name - continue - - # Don't forget the last location - if current_file: - locations.append({ - "file": current_file, - "class": current_class, - "function": current_function, - }) - - return locations + return [ + { + "file": loc.file, + "class": loc.class_name, + "function": loc.function_name, + } + for loc in locations + ] class LocalizationFinishExecutor(ToolExecutor): @@ -201,41 +151,40 @@ def __call__( """ try: - # Parse the output to validate format - locations = parse_localization_output(action.message) + # Get locations from action (already structured) + locations = action.locations num_locs = len(locations) - # Validation 1: Check if any locations were found + # Validation 1: Check if any locations were provided if num_locs == 0: return LocalizationFinishObservation( success=False, num_locations=0, validation_message=( - "No valid locations found. Please provide at least one file path " - "in the correct format wrapped in triple backticks." + "No locations provided. Please provide at least one location." ), details={"error": "empty_output"} ) # Validation 2: Check each location has a file path errors = [] - for i, loc in enumerate(locations): - if not loc.get('file'): - errors.append(f"Location {i+1} is missing a file path") + for i, loc in enumerate(locations, 1): + if not loc.file: + errors.append(f"Location {i} is missing a file path") if errors: return LocalizationFinishObservation( success=False, num_locations=0, validation_message="\n".join(errors), - details={"error": "missing_file_paths", "locations": locations} + details={"error": "missing_file_paths", "locations": locations_to_dict_list(locations)} ) # Validation 3: Check file existence (if workspace provided) if self.workspace_dir: missing_files = [] for loc in locations: - file_path = loc['file'] + file_path = loc.file full_path = os.path.join(self.workspace_dir, file_path) if not os.path.exists(full_path): missing_files.append(file_path) @@ -252,7 +201,7 @@ def __call__( details={ "warning": "files_not_found", "missing_files": missing_files, - "locations": locations + "locations": locations_to_dict_list(locations) } ) @@ -261,24 +210,19 @@ def __call__( success=True, num_locations=num_locs, validation_message=f"Successfully submitted {num_locs} location(s).", - details={"locations": locations} + details={"locations": locations_to_dict_list(locations)} ) except Exception as e: - # Parsing failed + # Validation failed return LocalizationFinishObservation( success=False, num_locations=0, validation_message=( - f"Error parsing output: {str(e)}\n\n" - "Please ensure your output follows the correct format:\n" - "```\n" - "path/to/file.py\n" - "class: ClassName\n" - "function: method_name\n" - "```" + f"Error validating locations: {str(e)}\n\n" + "Please ensure each location has a valid file path." ), - details={"error": "parse_error", "exception": str(e)} + details={"error": "validation_error", "exception": str(e)} ) @@ -287,31 +231,26 @@ def __call__( Use this tool when you have identified all relevant files, classes, and functions that need to be modified to address the issue described in the problem statement. -Format your results as follows: -``` -path/to/file1.py -class: ClassName -function: method_name +Provide a structured list of locations. Each location must have: +- file: Path to the file (required) +- class_name: Class name (optional) +- function_name: Function/method name (optional) + +Examples of different scenarios: -path/to/file2.py -function: standalone_function +1. File-level change (imports, globals, new top-level classes): + {"file": "src/config.py"} -path/to/file3.py -``` +2. Class-level change (new methods, class attributes, entire class modified): + {"file": "src/models/user.py", "class_name": "User"} -Requirements: -- Wrap your output in triple backticks (```) -- Each location must start with a file path -- Class name is OPTIONAL - include only if the change is within a specific class -- Function name is OPTIONAL - include only if the change is at function/method level +3. Standalone function (top-level function, not in a class): + {"file": "src/utils/helpers.py", "function_name": "format_date"} -When to omit class/function: -- File-level only (imports, globals, new classes): List just the file -- Class-level only (new methods, attributes): List file + class (no function) -- Standalone function: List file + function (no class) -- Method in class: List file + class + function +4. Method in a class (specific method modification): + {"file": "src/parser.py", "class_name": "DataParser", "function_name": "parse_json"} -The tool will validate your submission and provide feedback if the format is incorrect. +The tool will validate file existence and provide feedback if issues are found. """ @@ -351,11 +290,11 @@ def create( ] -@tool(name="finish") +@tool(name="localization_finish") def _make_localization_finish_tool(conv_state) -> list[ToolDefinition]: """Create localization finish tool. - This replaces the default finish tool with a localization-specific version - that validates the output format. + This is a localization-specific finish tool that accepts structured locations + and validates the output format. """ return LocalizationFinishTool.create(conv_state) diff --git a/tests/tools/test_localization_finish.py b/tests/tools/test_localization_finish.py index 2e348e9..d03c7aa 100644 --- a/tests/tools/test_localization_finish.py +++ b/tests/tools/test_localization_finish.py @@ -9,6 +9,7 @@ LocalizationFinishAction, LocalizationFinishExecutor, LocalizationFinishObservation, + CodeLocation, ) @@ -26,23 +27,25 @@ def teardown_method(self): if os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) - def test_empty_message(self): - """Test with empty message.""" - action = LocalizationFinishAction(message="") + def test_empty_locations(self): + """Test with empty locations list.""" + action = LocalizationFinishAction(locations=[]) result = self.executor(action) assert result.success is False assert result.num_locations == 0 - assert "No valid locations" in result.validation_message + assert "No locations provided" in result.validation_message - def test_malformed_input(self): - """Test with malformed input.""" - action = LocalizationFinishAction(message="random text without proper format") + def test_missing_file_field(self): + """Test with location missing file field.""" + # This would fail Pydantic validation, so we test with empty file string + action = LocalizationFinishAction( + locations=[CodeLocation(file="", class_name="MyClass")] + ) result = self.executor(action) assert result.success is False - # Malformed input results in no locations found - assert "No valid locations" in result.validation_message + assert "missing a file path" in result.validation_message def test_file_only_valid(self): """Test valid file-level localization (no class or function).""" @@ -50,9 +53,9 @@ def test_file_only_valid(self): file_path = "test_file.py" Path(self.temp_dir, file_path).touch() - action = LocalizationFinishAction(message=f"""``` -{file_path} -```""") + action = LocalizationFinishAction( + locations=[CodeLocation(file=file_path)] + ) result = self.executor(action) assert result.success is True @@ -64,10 +67,9 @@ def test_file_and_class_valid(self): file_path = "test_file.py" Path(self.temp_dir, file_path).touch() - action = LocalizationFinishAction(message=f"""``` -{file_path} -class: MyClass -```""") + action = LocalizationFinishAction( + locations=[CodeLocation(file=file_path, class_name="MyClass")] + ) result = self.executor(action) assert result.success is True @@ -78,10 +80,9 @@ def test_file_and_function_valid(self): file_path = "test_file.py" Path(self.temp_dir, file_path).touch() - action = LocalizationFinishAction(message=f"""``` -{file_path} -function: my_function -```""") + action = LocalizationFinishAction( + locations=[CodeLocation(file=file_path, function_name="my_function")] + ) result = self.executor(action) assert result.success is True @@ -92,11 +93,13 @@ def test_complete_localization_valid(self): file_path = "test_file.py" Path(self.temp_dir, file_path).touch() - action = LocalizationFinishAction(message=f"""``` -{file_path} -class: MyClass -function: my_method -```""") + action = LocalizationFinishAction( + locations=[CodeLocation( + file=file_path, + class_name="MyClass", + function_name="my_method" + )] + ) result = self.executor(action) assert result.success is True @@ -108,19 +111,14 @@ def test_multiple_locations_mixed_formats(self): for i in range(1, 5): Path(self.temp_dir, f"file{i}.py").touch() - action = LocalizationFinishAction(message="""``` -file1.py - -file2.py -class: ClassA - -file3.py -function: func_b - -file4.py -class: ClassC -function: method_d -```""") + action = LocalizationFinishAction( + locations=[ + CodeLocation(file="file1.py"), + CodeLocation(file="file2.py", class_name="ClassA"), + CodeLocation(file="file3.py", function_name="func_b"), + CodeLocation(file="file4.py", class_name="ClassC", function_name="method_d"), + ] + ) result = self.executor(action) assert result.success is True @@ -129,9 +127,9 @@ def test_multiple_locations_mixed_formats(self): def test_missing_file_warning(self): """Test that missing files trigger a warning.""" - action = LocalizationFinishAction(message="""``` -nonexistent_file.py -```""") + action = LocalizationFinishAction( + locations=[CodeLocation(file="nonexistent_file.py")] + ) result = self.executor(action) # Should still parse but with warning @@ -145,34 +143,25 @@ def test_some_files_missing(self): # Create only one file Path(self.temp_dir, "exists.py").touch() - action = LocalizationFinishAction(message="""``` -exists.py - -missing.py -```""") + action = LocalizationFinishAction( + locations=[ + CodeLocation(file="exists.py"), + CodeLocation(file="missing.py"), + ] + ) result = self.executor(action) assert result.success is False assert result.num_locations == 2 assert "missing.py" in result.validation_message - def test_no_backticks(self): - """Test that output without backticks fails parsing.""" - action = LocalizationFinishAction(message="""file.py -class: MyClass -""") - result = self.executor(action) - - # Should fail because parse_simple_output expects backticks - assert result.success is False - def test_executor_without_workspace(self): """Test executor without workspace validation.""" executor = LocalizationFinishExecutor(workspace_dir=None) - action = LocalizationFinishAction(message="""``` -any_file.py -```""") + action = LocalizationFinishAction( + locations=[CodeLocation(file="any_file.py")] + ) result = executor(action) # Should succeed without file existence check @@ -186,43 +175,45 @@ def test_nested_file_path(self): nested_dir.mkdir(parents=True) Path(nested_dir, "helper.py").touch() - action = LocalizationFinishAction(message="""``` -src/utils/helper.py -function: process -```""") + action = LocalizationFinishAction( + locations=[ + CodeLocation(file="src/utils/helper.py", function_name="process") + ] + ) result = self.executor(action) assert result.success is True assert result.num_locations == 1 - def test_dotted_function_name(self): - """Test with ClassName.method_name format.""" + def test_structured_class_and_function(self): + """Test with class and function specified separately.""" file_path = "test.py" Path(self.temp_dir, file_path).touch() - action = LocalizationFinishAction(message=f"""``` -{file_path} -function: MyClass.my_method -```""") + action = LocalizationFinishAction( + locations=[ + CodeLocation( + file=file_path, + class_name="MyClass", + function_name="my_method" + ) + ] + ) result = self.executor(action) assert result.success is True assert result.num_locations == 1 - # Should parse as class=MyClass, function=my_method + # Verify structured data is preserved assert result.details["locations"][0]["class"] == "MyClass" assert result.details["locations"][0]["function"] == "my_method" def test_multiple_missing_files_truncated(self): """Test that many missing files are truncated in message.""" - action = LocalizationFinishAction(message="""``` -file1.py -file2.py -file3.py -file4.py -file5.py -file6.py -file7.py -```""") + action = LocalizationFinishAction( + locations=[ + CodeLocation(file=f"file{i}.py") for i in range(1, 8) + ] + ) result = self.executor(action) assert result.success is False From 84a6ee5d419e60312309d1f57cdbc708484600b0 Mon Sep 17 00:00:00 2001 From: Leander Maben Date: Sun, 4 Jan 2026 17:41:03 -0500 Subject: [PATCH 3/3] Update generator to look for the finish tool --- src/generator/code_search_generator.py | 42 +++++++++++++++++-- .../file_localization/file_localization.py | 15 ++++++- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/generator/code_search_generator.py b/src/generator/code_search_generator.py index b3ec6bf..34cb3b8 100644 --- a/src/generator/code_search_generator.py +++ b/src/generator/code_search_generator.py @@ -54,6 +54,8 @@ LLMConvertibleEvent, get_logger, ) +from openhands.sdk.event import ActionEvent +from src.tools.localization_finish import LocalizationFinishAction from src.prompts.prompt_builder import get_instruction from src.utils.instance import clone_instance @@ -74,6 +76,35 @@ file_path = os.path.dirname(__file__) + +def get_structured_locations(events: List[Event]) -> Optional[List[Dict[str, Any]]]: + """Extract structured locations from LocalizationFinishAction in events. + + Args: + events: List of conversation events to search through. + + Returns: + List of location dicts with 'file', 'class', 'function' keys, or None if not found. + """ + # Find the last LocalizationFinishAction + for event in reversed(events): + if ( + isinstance(event, ActionEvent) + and event.source == "agent" + and isinstance(event.action, LocalizationFinishAction) + ): + # Extract structured locations from the action + locations = [] + for loc in event.action.locations: + locations.append({ + "file": loc.file, + "class": loc.class_name, + "function": loc.function_name, + }) + return locations + return None + + @ray.remote(num_cpus=0.01) def init_and_run( instance: dict, @@ -156,6 +187,9 @@ def init_and_run( messages = list(map(lambda event: event.model_dump(), conversation.state.events)) final_message = get_agent_final_response(conversation.state.events) + # Extract structured locations if available + structured_locations = get_structured_locations(conversation.state.events) + # remove the workspace dir try: if workspace.exists(): @@ -179,7 +213,7 @@ def init_and_run( "end_timestamp": end_timestamp } - return messages, final_message, additional_attr + return messages, final_message, structured_locations, additional_attr class CodeSearchGenerator(SkyRLGymGenerator): @@ -230,7 +264,7 @@ async def code_search_loop( instance = env_extras error = None try: - messages, final_message, additional_attr = await init_and_run.remote( + messages, final_message, structured_locations, additional_attr = await init_and_run.remote( instance, self.litellm_model_name, # sweagent_config, @@ -249,6 +283,7 @@ async def code_search_loop( error = str(e) + "\n" + traceback.format_exc() messages = [] final_message = "" + structured_locations = None additional_attr = { "wall_clock_duration": 0.0, "start_timestamp": None, @@ -269,6 +304,7 @@ async def code_search_loop( try: input_args = { "final_message": final_message, + "structured_locations": structured_locations, "messages": messages, "instance": instance, } @@ -276,7 +312,7 @@ async def code_search_loop( reward_fn = get_reward_function(reward_fn_args["fn"]) input_args = { - **input_args, + **input_args, **reward_fn_args.get("args", {}) } diff --git a/src/rewards/file_localization/file_localization.py b/src/rewards/file_localization/file_localization.py index 64e155a..2c64221 100644 --- a/src/rewards/file_localization/file_localization.py +++ b/src/rewards/file_localization/file_localization.py @@ -26,9 +26,15 @@ def file_localization_f1_reward( final_message: str, instance: dict, file_level_weight: float=1.0, + structured_locations=None, **kwargs ): - all_found_files, all_found_modules, all_found_entities = get_simple_results_from_raw_outputs(final_message) + # Use structured locations if available, otherwise parse final_message + if structured_locations is not None: + all_found_files, all_found_modules, all_found_entities = get_simple_results_from_raw_outputs(structured_locations) + else: + all_found_files, all_found_modules, all_found_entities = get_simple_results_from_raw_outputs(final_message) + true_files = set(x[0] for x in ast.literal_eval(instance["target"])) file_level_score = compute_file_f1_score(all_found_files, true_files) weighted_file_score = file_level_weight * file_level_score @@ -42,6 +48,7 @@ def multilevel_localization_f1_reward( file_level_weight: float=1.0, module_level_weight: float=1.0, entity_level_weight: float=1.0, + structured_locations=None, **kwargs ): @@ -67,7 +74,11 @@ def multilevel_localization_f1_reward( gt_modules = set(gt_modules) gt_entities = set(gt_entities) - predicted_files, predicted_modules, predicted_entities = get_simple_results_from_raw_outputs(final_message) + # Use structured locations if available, otherwise parse final_message + if structured_locations is not None: + predicted_files, predicted_modules, predicted_entities = get_simple_results_from_raw_outputs(structured_locations) + else: + predicted_files, predicted_modules, predicted_entities = get_simple_results_from_raw_outputs(final_message) file_f1_score = compute_file_f1_score(predicted_files, gt_files) module_f1_score = compute_file_f1_score(predicted_modules, gt_modules)