From 27adcc991b1f8eaa0611efe91e442ecf927f5330 Mon Sep 17 00:00:00 2001 From: Taha YASSINE Date: Tue, 10 Feb 2026 11:25:03 +0100 Subject: [PATCH 1/2] Add downstream dataset build script --- scripts/build_downstream_dataset.py | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/build_downstream_dataset.py diff --git a/scripts/build_downstream_dataset.py b/scripts/build_downstream_dataset.py new file mode 100644 index 0000000..072ae70 --- /dev/null +++ b/scripts/build_downstream_dataset.py @@ -0,0 +1,56 @@ +import json +from pathlib import Path + +from datasets import load_dataset + + +def main(): + # Configuration + dataset_name = "princeton-nlp/SWE-bench_Verified" + split = "test" + search_results_path = Path( + "/path/to/search_results.jsonl" + ) + output_path = ( + Path(__file__).parent.parent + / "data" + / f"{dataset_name.split('/')[1]}_with_search_results.jsonl" + ) + + # Load the dataset + print(f"Loading dataset: {dataset_name} (split: {split})") + dataset = load_dataset(dataset_name, split=split) + print(f"✓ Loaded {len(dataset)} instances\n") + + # Load JSONL file as a dataset + print(f"Loading JSONL file: {search_results_path}") + if not search_results_path.exists(): + raise FileNotFoundError(f"JSONL file not found: {search_results_path}") + + by_instance_id: dict[str, object] = {} + with search_results_path.open("r") as f: + for line in f: + if not line.strip(): + continue + obj = json.loads(line) + prediction = obj["test_result"]["reward"]["prediction"] + by_instance_id[obj["instance_id"]] = {k: list(v) for k, v in prediction.items()} + + # Merge datasets on instance_id + print("Merging datasets on instance_id...") + search_results = [by_instance_id.get(i) for i in dataset["instance_id"]] + merged_dataset = dataset.add_column(name="search_results", column=search_results) + print("✓ Merged datasets\n") + + # Save as JSONL + print(f"Saving to: {output_path}") + output_path.parent.mkdir(parents=True, exist_ok=True) 
+ + with output_path.open("w") as f: + for ex in merged_dataset: + f.write(json.dumps(ex) + "\n") + print(f"✓ Saved dataset with {len(merged_dataset)} instances to {output_path}") + + +if __name__ == "__main__": + main() From 34af62bb658152794b843fcbe560c58d292e76c6 Mon Sep 17 00:00:00 2001 From: Taha YASSINE Date: Tue, 10 Feb 2026 11:25:40 +0100 Subject: [PATCH 2/2] Add custom prompt template for benchmark --- prompts/default_with_search.j2 | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 prompts/default_with_search.j2 diff --git a/prompts/default_with_search.j2 b/prompts/default_with_search.j2 new file mode 100644 index 0000000..03bb133 --- /dev/null +++ b/prompts/default_with_search.j2 @@ -0,0 +1,76 @@ +I have access to a python code repository in the directory {{ instance.repo_path }} . You can explore and modify files using the available tools. Consider the following issue description: + +<issue_description> +{{ instance.problem_statement }} +</issue_description> + +A separate search subagent has already explored the repository to identify files, modules, and entities that are likely relevant to the issue. + +<search_results> +{{ instance.search_results }} +</search_results> + +Important notes about this context: +- The relevant context is intended to accelerate navigation, not replace your own judgment. +- The listed files/entities are likely but not guaranteed to be sufficient. +- You may inspect files outside this list if needed, but you should prioritize the provided context. +- If you determine that some listed items are irrelevant or that additional files are required, state this explicitly in your analysis. + +Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met? +I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
+Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages. +Your task is to make the minimal changes to non-test files in the {{ instance.repo_path }} directory to ensure the <issue_description> is satisfied. + +Follow these phases to resolve the issue: + +Phase 1. READING: read the problem and reword it in clearer terms + 1.1 If there are code or config snippets, express in words any best practices or conventions in them. + 1.2 Highlight error messages, method names, variables, file names, stack traces, and technical details. + 1.3 Explain the problem in clear terms. + 1.4 Enumerate the steps to reproduce the problem. + 1.5 Highlight any best practices to take into account when testing and fixing the issue + +Phase 2. RUNNING: install and run the tests on the repository + 2.1 Activate the environment by running + source /opt/miniconda3/etc/profile.d/conda.sh ; conda activate testbed + 2.2 Follow the readme + 2.3 Install the environment and anything needed + 2.4 Iterate and figure out how to run the tests + +Phase 3. CONTEXT REVIEW & CODE EXPLORATION: use the relevant context to orient yourself in the codebase. + 3.1 Review the <search_results> and understand why each listed file or entity may be relevant. + 3.2 Prioritize and inspect the files and entities mentioned in the <search_results>. + 3.3 Read the relevant source code to understand its behavior. + 3.4 Identify discrepancies between current behavior and the expectations in the <issue_description>. + 3.5 Document any irrelevant items or missing files and adjust exploration as needed. + +Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue. + 4.1 Look at existing test files in the repository to understand the test format/structure. + 4.2 Create a minimal reproduction script that reproduces the located issue. + 4.3 Run the reproduction script to confirm you are reproducing the issue. + 4.4 Adjust the reproduction script as necessary.
+ +Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it + 5.1 State clearly what the problem is. + 5.2 State clearly where the problem is located. + 5.3 State clearly how the test reproduces the issue. + 5.4 State clearly the best practices to take into account in the fix. + 5.5 State clearly how to fix the problem. + +Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution. + 6.1 Make minimal, focused changes to fix the issue. + +Phase 7. VERIFICATION: Test your implementation thoroughly. + 7.1 Run your reproduction script to verify the fix works. + 7.2 Add edge cases to your test script to ensure comprehensive coverage. + 7.3 Run existing tests related to the modified code to ensure you haven't broken anything. + +Phase 8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {{ instance.base_commit }}. + 8.1 Ensure you've fully addressed all requirements. + 8.2 Run any tests in the repository related to: + 8.2.1 The issue you are fixing + 8.2.2 The files you modified + 8.2.3 The functions you changed + 8.3 If any tests fail, revise your implementation until all tests pass + +Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.