From 2b8d9114e31e3e682903adf83fbc4809f7acb70c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Thu, 5 Mar 2026 17:37:22 +0100 Subject: [PATCH 1/9] Continue phase 5 --- scripts/code_hallucination/format_builder.py | 107 +++++++++++-------- 1 file changed, 65 insertions(+), 42 deletions(-) diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py index 9d290f7..f371efe 100644 --- a/scripts/code_hallucination/format_builder.py +++ b/scripts/code_hallucination/format_builder.py @@ -180,61 +180,84 @@ def run( if queries is None: queries = {} + # Load existing for resumability + existing = {} + if FORMATS_PATH.exists(): + with open(FORMATS_PATH) as f: + for line in f: + try: + entry = json.loads(line) + existing[entry["instance_id"]] = entry + except (json.JSONDecodeError, KeyError): + continue + print(f"Already processed: {len(existing)} formats") + + to_process = [inst for inst in instances if inst["instance_id"] not in existing] + print(f"Remaining: {len(to_process)} instances to process") + # Only init LLM client if we'll need it (lazy) client = None - results = [] + results = list(existing.values()) format_counts = {fmt: 0 for fmt in FORMAT_TYPES} + for entry in results: + fmt = entry.get("format_type") + if fmt in format_counts: + format_counts[fmt] += 1 skipped = 0 explanation_failures = 0 + processed = 0 - for inst in instances: - instance_id = inst["instance_id"] - - # Load source data from cache - cache_path = source_cache_dir / f"{instance_id}.json" - if not cache_path.exists(): - skipped += 1 - continue - - with open(cache_path) as f: - source_data = json.load(f) - - fmt, answer = assign_format(source_data) - if fmt is None: - skipped += 1 - continue - - # Generate explanation wrapper for code_with_explanation format - if fmt == "code_with_explanation": - if client is None: - client = OpenAI(api_key=api_key, base_url=base_url) - print(f" LLM client initialized for code_with_explanation ({base_url})") - - query = queries.get(instance_id, inst.get("problem_statement", "")[:500]) - context = source_data.get("patch_code", "") - explained = _generate_explanation(client, model, answer, query, context) - - if explained is None: - # Fallback: use raw code as fragment - fmt = "fragment" - explanation_failures += 1 - else: - answer = explained + with open(FORMATS_PATH, "a") as f: + for inst in to_process: + instance_id = inst["instance_id"] + + # Load source data from cache + cache_path = source_cache_dir / f"{instance_id}.json" + if not cache_path.exists(): + skipped += 1 + continue + + with open(cache_path) as fp: + source_data = json.load(fp) - results.append( - { + fmt, answer = assign_format(source_data) + if fmt is None: + skipped += 1 + continue + + # Generate explanation wrapper for code_with_explanation format + if fmt == "code_with_explanation": + if client is None: + client = OpenAI(api_key=api_key, base_url=base_url) + print(f" LLM client initialized for code_with_explanation ({base_url})") + + query = queries.get(instance_id, inst.get("problem_statement", "")[:500]) + context = source_data.get("patch_code", "") + explained = _generate_explanation(client, model, answer, query, context) + + if explained is None: + # Fallback: use raw code as fragment + fmt = "fragment" + explanation_failures += 1 + else: + answer = explained + + entry = { "instance_id": instance_id, "format_type": fmt, "answer": answer, } - ) - format_counts[fmt] += 1 - - # Save - with open(FORMATS_PATH, "w") as f: - for entry in results: f.write(json.dumps(entry) + "\n") + f.flush() + results.append(entry) + format_counts[fmt] += 1 + processed += 1 + + if processed % 100 == 0: + print( + f" Progress: {processed}/{len(to_process)} (failures: {explanation_failures})" + ) print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})") if explanation_failures: From c1cd5eb7a2a335d15c4f87031349edf92b59931e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Thu, 5 Mar 2026 17:41:22 +0100 Subject: [PATCH 2/9] Better progressbar --- scripts/code_hallucination/context7_docs.py | 2 +- scripts/code_hallucination/format_builder.py | 2 +- scripts/code_hallucination/hallucination_injector.py | 10 ++++------ scripts/code_hallucination/query_rewriter.py | 4 ++-- scripts/code_hallucination/source_fetcher.py | 4 ++-- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/scripts/code_hallucination/context7_docs.py b/scripts/code_hallucination/context7_docs.py index b287015..ef60850 100644 --- a/scripts/code_hallucination/context7_docs.py +++ b/scripts/code_hallucination/context7_docs.py @@ -228,7 +228,7 @@ def run(instances: list[dict]): if processed % 100 == 0: print( - f" Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)" + f" Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)" ) print( diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py index f371efe..9ad9dfe 100644 --- a/scripts/code_hallucination/format_builder.py +++ b/scripts/code_hallucination/format_builder.py @@ -256,7 +256,7 @@ def run( if processed % 100 == 0: print( - f" Progress: {processed}/{len(to_process)} (failures: {explanation_failures})" + f" Phase 5: {processed}/{len(to_process)} (failures: {explanation_failures})" ) print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})") diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py index 941315c..f7bd6d5 100644 --- a/scripts/code_hallucination/hallucination_injector.py +++ b/scripts/code_hallucination/hallucination_injector.py @@ -421,8 +421,8 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model results.append(entry) processed += 1 - if processed % 50 == 0: - print(f" Progress: {processed}/{len(to_process)} (failed: {failed})") + if processed % 100 == 0: + print(f" Phase 6: {processed}/{len(to_process)} (failed: {failed})") print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)") return results @@ -497,11 +497,9 @@ async def process_batches(): results.append(entry) processed += 1 - if processed % 50 == 0 or batch_start + BATCH_SIZE >= len(to_process): + if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(to_process): total = processed + failed - print( - f" Progress: {total}/{len(to_process)} ({processed} ok, {failed} failed)" - ) + print(f" Phase 6: {total}/{len(to_process)} ({processed} ok, {failed} failed)") asyncio.run(process_batches()) print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)") diff --git a/scripts/code_hallucination/query_rewriter.py b/scripts/code_hallucination/query_rewriter.py index 966c70f..65f3a16 100644 --- a/scripts/code_hallucination/query_rewriter.py +++ b/scripts/code_hallucination/query_rewriter.py @@ -123,8 +123,8 @@ def run( f.flush() processed += 1 - if processed % 50 == 0: - print(f" Progress: {processed}/{len(to_process)} (failed: {failed})") + if processed % 100 == 0: + print(f" Phase 3: {processed}/{len(to_process)} (failed: {failed})") except Exception as e: print(f" ERROR {instance_id}: {e}") failed += 1 diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py index 0fc25da..7eadaee 100644 --- a/scripts/code_hallucination/source_fetcher.py +++ b/scripts/code_hallucination/source_fetcher.py @@ -509,8 +509,8 @@ def run(instances: list[dict], use_github_api: bool = False): failed = 0 for i, instance in enumerate(instances): - if (i + 1) % 100 == 0: - print(f" Progress: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)") + if (i + 1) % 100 == 0 or (i + 1) == len(instances): + print(f" Phase 2: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)") # Skip if already cached cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json" From 83d5421db9d2f373c67ee3124142b7d3b4d2f379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Thu, 5 Mar 2026 18:24:57 +0100 Subject: [PATCH 3/9] Dont fetch everything --- scripts/code_hallucination/source_fetcher.py | 30 +++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py index 7eadaee..dacebf1 100644 --- a/scripts/code_hallucination/source_fetcher.py +++ b/scripts/code_hallucination/source_fetcher.py @@ -43,7 +43,7 @@ def clone_repo(repo: str, repos_dir: Path = REPOS_DIR) -> Path | None: ["git", "clone", "--bare", f"https://github.com/{repo}.git", str(repo_dir)], capture_output=True, text=True, - timeout=1800, # 30 min for large repos + timeout=60, # 1 min timeout, fall back to GitHub API ) if result.returncode != 0: print(f" ERROR cloning {repo}: {result.stderr[:200]}") @@ -493,17 +493,14 @@ def run(instances: list[dict], use_github_api: bool = False): print("=" * 60) SOURCE_CACHE_DIR.mkdir(parents=True, exist_ok=True) + REPOS_DIR.mkdir(parents=True, exist_ok=True) - if not use_github_api: - REPOS_DIR.mkdir(parents=True, exist_ok=True) - # Group by repo for efficient cloning - repos = set(inst["repo"] for inst in instances) - print(f"Need to clone {len(repos)} repos") - for repo in sorted(repos): - clone_repo(repo) - else: + if use_github_api: print("Using GitHub raw API (no cloning)") + # Track repos that failed to clone so we don't retry + clone_failed_repos: set[str] = set() + # Fetch sources per instance results = [] failed = 0 @@ -519,10 +516,21 @@ def run(instances: list[dict], use_github_api: bool = False): results.append(json.load(f)) continue - result = fetch_source_for_instance(instance, use_github_api=use_github_api) + # Try clone first, fall back to GitHub API + repo = instance["repo"] + use_api_for_this = use_github_api + if not use_api_for_this and repo not in clone_failed_repos: + repo_dir = clone_repo(repo) + if repo_dir is None: + clone_failed_repos.add(repo) + use_api_for_this = True + print(f" Falling back to GitHub API for {repo}") + + result = fetch_source_for_instance( + instance, use_github_api=use_api_for_this or repo in clone_failed_repos + ) if result: results.append(result) - # Cache result cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json" with open(cache_path, "w") as f: json.dump(result, f) From d19bf79d3f82a31a11ebc36d9171eccc3a4c857a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Thu, 5 Mar 2026 19:47:29 +0100 Subject: [PATCH 4/9] Fetch faster --- scripts/code_hallucination/source_fetcher.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py index dacebf1..fa24c4e 100644 --- a/scripts/code_hallucination/source_fetcher.py +++ b/scripts/code_hallucination/source_fetcher.py @@ -5,6 +5,7 @@ import re import subprocess import tempfile +import warnings from pathlib import Path import requests @@ -143,7 +144,9 @@ def extract_modified_functions(original_source: str, patched_source: str) -> lis def get_functions(source: str) -> dict[str, str]: """Parse source and extract function name -> source mapping.""" try: - tree = ast.parse(source) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SyntaxWarning) + tree = ast.parse(source) except SyntaxError: return {} @@ -460,10 +463,7 @@ def fetch_source_for_instance( for filepath in changed_files: if filepath not in source_files: continue - if repo_dir is not None: - patched_source = apply_patch_and_get_file(repo_dir, commit, patch, filepath) - else: - patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath) + patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath) if patched_source: funcs = extract_modified_functions(source_files[filepath], patched_source) for func in funcs: @@ -492,6 +492,9 @@ def run(instances: list[dict], use_github_api: bool = False): print("Phase 2: Source File Fetching") print("=" * 60) + # Suppress SyntaxWarning from ast.parse on third-party source files + warnings.filterwarnings("ignore", category=SyntaxWarning) + SOURCE_CACHE_DIR.mkdir(parents=True, exist_ok=True) REPOS_DIR.mkdir(parents=True, exist_ok=True) From 5096af2525d91da6e30feb66dc05bac505cfc049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Thu, 5 Mar 2026 21:20:59 +0100 Subject: [PATCH 5/9] Better evaluation for code --- scripts/evaluate_code_hallucination.py | 29 +++++++++++++++----------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/scripts/evaluate_code_hallucination.py b/scripts/evaluate_code_hallucination.py index 87a68fd..2df98c0 100644 --- a/scripts/evaluate_code_hallucination.py +++ b/scripts/evaluate_code_hallucination.py @@ -5,12 +5,13 @@ Supports Groq API with any OpenAI-compatible model. Usage: - # With Groq + Kimi + # With Groq OPENAI_API_KEY=gsk_... OPENAI_API_BASE=https://api.groq.com/openai/v1 \ python scripts/evaluate_code_hallucination.py \ --model moonshotai/kimi-k2-instruct-0905 \ --data_path data/code_hallucination_lettucedetect_v2.json \ - --evaluation_type example_level + --evaluation_type example_level \ + --split test """ import argparse @@ -210,9 +211,12 @@ def main(): help="Limit number of test samples (for quick testing)", ) parser.add_argument( - "--test_ratio", type=float, default=0.3, help="Fraction of data to use as test set" + "--split", + type=str, + default="test", + choices=["train", "dev", "test"], + help="Which split to evaluate on (uses the split field from the dataset)", ) - parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() @@ -243,14 +247,13 @@ def main(): ) ) - # Split into test set - import random - - random.seed(args.seed) - random.shuffle(samples) + # Filter to the requested split + test_samples = [s for s in samples if s.split == args.split] - test_size = int(len(samples) * args.test_ratio) - test_samples = samples[:test_size] + if not test_samples: + available_splits = set(s.split for s in samples) + print(f"No samples found for split '{args.split}'. Available splits: {available_splits}") + return if args.max_samples: test_samples = test_samples[: args.max_samples] @@ -260,7 +263,9 @@ def main(): print(f"Dataset: {data_path}") print(f"Total samples: {len(samples)}") - print(f"Test samples: {len(test_samples)} (positive: {n_positive}, negative: {n_negative})") + print( + f"Evaluating on '{args.split}' split: {len(test_samples)} samples (positive: {n_positive}, negative: {n_negative})" + ) print(f"Model: {args.model}") print(f"API base: {os.getenv('OPENAI_API_BASE', 'https://api.openai.com/v1')}") From 9be1d2ed2f13cd7dedc68133e2cb71f2ad7eda90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Fri, 6 Mar 2026 09:40:56 +0100 Subject: [PATCH 6/9] Smaller prompts --- scripts/code_hallucination/config.py | 1 + scripts/code_hallucination/format_builder.py | 245 ++++++++++++++---- .../hallucination_injector.py | 44 +++- scripts/code_hallucination/pipeline.py | 24 +- .../code_hallucination/sample_assembler.py | 7 +- 5 files changed, 259 insertions(+), 62 deletions(-) diff --git a/scripts/code_hallucination/config.py b/scripts/code_hallucination/config.py index 7f71f99..85bfed4 100644 --- a/scripts/code_hallucination/config.py +++ b/scripts/code_hallucination/config.py @@ -36,6 +36,7 @@ HALLUCINATION_RATIO = 0.4 # 40% hallucinated, 60% clean MAX_FILE_CHARS = 12000 # Cap individual source file size MAX_CONTEXT7_CHARS = 4000 # Documentation fetch limit +MAX_PROMPT_CHARS = 24000 # ~6K tokens, leaves room for answer within 8K model context # === LLM Config === RETRY_DELAY = 2 diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py index 9ad9dfe..0f0a660 100644 --- a/scripts/code_hallucination/format_builder.py +++ b/scripts/code_hallucination/format_builder.py @@ -1,15 +1,21 @@ -"""Phase 5: Assign answer format to each instance.""" +"""Phase 5: Assign answer format to each instance. +Supports both sequential (remote API) and async batch (local vLLM) modes. +Set BATCH_SIZE>1 env var for parallel requests to local vLLM. +""" + +import asyncio import json import random import textwrap import time -from openai import OpenAI +from openai import AsyncOpenAI, OpenAI from .config import ( API_BASE_URL, API_KEY, + BATCH_SIZE, FORMAT_TYPES, FORMAT_WEIGHTS, FORMATS_PATH, @@ -26,27 +32,24 @@ that a developer would receive from an AI assistant. Your response MUST: - - Start with a brief explanation (1-3 sentences) of what the issue is and how to fix it + - Start with 1-2 sentences explaining what was wrong and how to fix it - Include the code in a properly formatted code block (```python) - - Optionally end with a short note about what changed or why + - Do NOT add anything after the code block Your response must NOT: - - Include phrases like "Here's the fix" or "I'll help you with that" — just explain directly - - Be longer than necessary — keep it concise + - Include phrases like "Here's the fix" or "I'll help you with that" + - Be longer than 2 sentences of explanation + the code block - Change the code in any way — use it exactly as provided - Add any imports or code not in the original - Example style: - The issue is that `process_data` uses `dict.items()` instead of iterating - over the sorted keys, which causes non-deterministic output. + Example: + The `process_data` function uses `dict.items()` instead of iterating over sorted keys, causing non-deterministic output. ```python def process_data(data): for key in sorted(data.keys()): yield key, data[key] ``` - - This ensures consistent ordering regardless of insertion order. """) @@ -75,7 +78,7 @@ def _generate_explanation( {"role": "user", "content": user_msg}, ], temperature=LLM_TEMPERATURE, - max_tokens=2000, + max_tokens=200, ) result = response.choices[0].message.content.strip() # Verify the code is actually in the response @@ -94,6 +97,47 @@ def _generate_explanation( return None +async def _generate_explanation_async( + aclient: AsyncOpenAI, model: str, code: str, query: str, context: str +) -> str | None: + """Async version of _generate_explanation for batch processing.""" + user_msg = f"""User's question: {query} + +Context (relevant source code): +{context[:3000]} + +Correct code fix: +```python +{code} +``` + +Write a natural AI assistant response that includes this exact code.""" + + for attempt in range(MAX_RETRIES): + try: + response = await aclient.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": EXPLANATION_SYSTEM_PROMPT}, + {"role": "user", "content": user_msg}, + ], + temperature=LLM_TEMPERATURE, + max_tokens=200, + ) + result = response.choices[0].message.content.strip() + if code[:50] in result or "```" in result: + return result + if attempt < MAX_RETRIES - 1: + continue + return None + except Exception: + if attempt < MAX_RETRIES - 1: + await asyncio.sleep(RETRY_DELAY * (attempt + 1)) + else: + return None + return None + + def assign_format(source_data: dict) -> tuple[str, str]: """Assign a format type and build the answer for an instance. @@ -169,7 +213,8 @@ def run( ): """Run Phase 5: Assign formats and build answers. - Returns list of dicts with instance_id, format_type, answer. + Uses async batch processing when BATCH_SIZE > 1 (for local vLLM). + Falls back to sequential processing for remote APIs (BATCH_SIZE=1). """ print("=" * 60) print("Phase 5: Answer Format Building") @@ -194,54 +239,104 @@ def run( to_process = [inst for inst in instances if inst["instance_id"] not in existing] print(f"Remaining: {len(to_process)} instances to process") + print(f"Batch size: {BATCH_SIZE}") + + # First pass: assign formats for all instances (no LLM needed) + # Collect which ones need explanation generation + needs_explanation = [] # (instance_id, code, query, context) + entries_no_llm = [] # entries that don't need LLM + + for inst in to_process: + instance_id = inst["instance_id"] + + cache_path = source_cache_dir / f"{instance_id}.json" + if not cache_path.exists(): + continue + + with open(cache_path) as fp: + source_data = json.load(fp) + + fmt, answer = assign_format(source_data) + if fmt is None: + continue + + if fmt == "code_with_explanation": + query = queries.get(instance_id, inst.get("problem_statement", "")[:500]) + context = source_data.get("patch_code", "") + needs_explanation.append((instance_id, answer, query, context, fmt)) + else: + entries_no_llm.append( + { + "instance_id": instance_id, + "format_type": fmt, + "answer": answer, + } + ) - # Only init LLM client if we'll need it (lazy) - client = None - + # Write non-LLM entries immediately results = list(existing.values()) format_counts = {fmt: 0 for fmt in FORMAT_TYPES} for entry in results: fmt = entry.get("format_type") if fmt in format_counts: format_counts[fmt] += 1 - skipped = 0 - explanation_failures = 0 + processed = 0 + explanation_failures = 0 with open(FORMATS_PATH, "a") as f: - for inst in to_process: - instance_id = inst["instance_id"] + for entry in entries_no_llm: + f.write(json.dumps(entry) + "\n") + results.append(entry) + format_counts[entry["format_type"]] += 1 + processed += 1 + f.flush() - # Load source data from cache - cache_path = source_cache_dir / f"{instance_id}.json" - if not cache_path.exists(): - skipped += 1 - continue + print(f" Assigned {len(entries_no_llm)} non-LLM formats") + print(f" Need LLM explanation: {len(needs_explanation)} instances") - with open(cache_path) as fp: - source_data = json.load(fp) + # Second pass: generate explanations (batched or sequential) + if needs_explanation: + if BATCH_SIZE > 1: + explanation_failures = _run_explanations_batched( + needs_explanation, format_counts, results, api_key, base_url, model + ) + else: + explanation_failures = _run_explanations_sequential( + needs_explanation, format_counts, results, api_key, base_url, model + ) - fmt, answer = assign_format(source_data) - if fmt is None: - skipped += 1 - continue + processed += len(needs_explanation) + + print(f"\nAssigned formats for {len(results)} instances") + if explanation_failures: + print(f" Explanation generation failures (fell back to fragment): {explanation_failures}") + for fmt, count in format_counts.items(): + pct = count * 100 // max(len(results), 1) + print(f" {fmt}: {count} ({pct}%)") - # Generate explanation wrapper for code_with_explanation format - if fmt == "code_with_explanation": - if client is None: - client = OpenAI(api_key=api_key, base_url=base_url) - print(f" LLM client initialized for code_with_explanation ({base_url})") + return results - query = queries.get(instance_id, inst.get("problem_statement", "")[:500]) - context = source_data.get("patch_code", "") - explained = _generate_explanation(client, model, answer, query, context) - if explained is None: - # Fallback: use raw code as fragment - fmt = "fragment" - explanation_failures += 1 - else: - answer = explained +def _run_explanations_sequential( + needs_explanation, format_counts, results, api_key, base_url, model +): + """Generate explanations sequentially (for remote APIs).""" + client = OpenAI(api_key=api_key, base_url=base_url) + explanation_failures = 0 + processed = 0 + + with open(FORMATS_PATH, "a") as f: + for instance_id, code, query, context, _ in needs_explanation: + explained = _generate_explanation(client, model, code, query, context) + + if explained is None: + fmt = "fragment" + answer = code + explanation_failures += 1 + else: + fmt = "code_with_explanation" + answer = explained entry = { "instance_id": instance_id, @@ -256,17 +351,61 @@ def run( if processed % 100 == 0: print( - f" Phase 5: {processed}/{len(to_process)} (failures: {explanation_failures})" + f" Phase 5 (explanations): {processed}/{len(needs_explanation)} " + f"(failures: {explanation_failures})" ) - print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})") - if explanation_failures: - print(f" Explanation generation failures (fell back to fragment): {explanation_failures}") - for fmt, count in format_counts.items(): - pct = count * 100 // max(len(results), 1) - print(f" {fmt}: {count} ({pct}%)") + return explanation_failures - return results + +def _run_explanations_batched(needs_explanation, format_counts, results, api_key, base_url, model): + """Generate explanations with async batching (for local vLLM).""" + aclient = AsyncOpenAI(api_key=api_key, base_url=base_url) + explanation_failures = 0 + processed = 0 + + async def process_batches(): + nonlocal explanation_failures, processed + + with open(FORMATS_PATH, "a") as f: + for batch_start in range(0, len(needs_explanation), BATCH_SIZE): + batch = needs_explanation[batch_start : batch_start + BATCH_SIZE] + + tasks = [] + for instance_id, code, query, context, _ in batch: + tasks.append(_generate_explanation_async(aclient, model, code, query, context)) + + batch_results = await asyncio.gather(*tasks, return_exceptions=True) + + for (instance_id, code, query, context, _), explained in zip(batch, batch_results): + if isinstance(explained, Exception) or explained is None: + fmt = "fragment" + answer = code + explanation_failures += 1 + else: + fmt = "code_with_explanation" + answer = explained + + entry = { + "instance_id": instance_id, + "format_type": fmt, + "answer": answer, + } + f.write(json.dumps(entry) + "\n") + results.append(entry) + format_counts[fmt] += 1 + processed += 1 + + f.flush() + + if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(needs_explanation): + print( + f" Phase 5 (explanations): {processed}/{len(needs_explanation)} " + f"(failures: {explanation_failures})" + ) + + asyncio.run(process_batches()) + return explanation_failures if __name__ == "__main__": diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py index f7bd6d5..58ff0d4 100644 --- a/scripts/code_hallucination/hallucination_injector.py +++ b/scripts/code_hallucination/hallucination_injector.py @@ -19,6 +19,7 @@ HALLUCINATED_PATH, HALLUCINATION_TEMPERATURE, HALLUCINATION_TYPES, + MAX_PROMPT_CHARS, MAX_RETRIES, MODEL, RETRY_DELAY, @@ -81,6 +82,20 @@ """) +def build_source_context(source_data: dict) -> str: + """Build source code context string from cached source data. + + Truncates to MAX_PROMPT_CHARS so the final sample fits in 8K model context. + """ + parts = [] + for filepath, content in source_data.get("source_files", {}).items(): + parts.append(f"File: {filepath}\n```python\n{content}\n```") + context = "\n\n".join(parts) + if len(context) > MAX_PROMPT_CHARS: + context = context[:MAX_PROMPT_CHARS] + return context + + def inject_hallucination( client: OpenAI, model: str, @@ -318,6 +333,7 @@ def run( formats: dict[str, dict], queries: dict[str, str], docs: dict[str, dict] | None = None, + source_cache: dict[str, dict] | None = None, api_key: str = API_KEY, base_url: str = API_BASE_URL, model: str = MODEL, @@ -333,6 +349,8 @@ def run( if docs is None: docs = {} + if source_cache is None: + source_cache = {} HALLUCINATED_PATH.parent.mkdir(parents=True, exist_ok=True) @@ -350,9 +368,13 @@ def run( print(f"Remaining: {len(to_process)} instances to inject") if BATCH_SIZE > 1: - results = _run_batched(to_process, formats, queries, docs, api_key, base_url, model) + results = _run_batched( + to_process, formats, queries, docs, source_cache, api_key, base_url, model + ) else: - results = _run_sequential(to_process, formats, queries, docs, api_key, base_url, model) + results = _run_sequential( + to_process, formats, queries, docs, source_cache, api_key, base_url, model + ) # Stats type_counts = {} @@ -372,7 +394,7 @@ def run( return results -def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model): +def _run_sequential(to_process, formats, queries, docs, source_cache, api_key, base_url, model): """Sequential processing for remote APIs (rate-limited).""" client = OpenAI(api_key=api_key, base_url=base_url) processed = 0 @@ -391,7 +413,12 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model hall_type = HALLUCINATION_TYPES[i % len(HALLUCINATION_TYPES)] query = queries.get(instance_id, "") - context = inst.get("problem_statement", "") + source_data = source_cache.get(instance_id, {}) + context = ( + build_source_context(source_data) + if source_data + else inst.get("problem_statement", "") + ) instance_docs = docs.get(instance_id, {}) # Try injection with up to 2 quality retries @@ -428,7 +455,7 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model return results -def _run_batched(to_process, formats, queries, docs, api_key, base_url, model): +def _run_batched(to_process, formats, queries, docs, source_cache, api_key, base_url, model): """Async batch processing for local vLLM (no rate limiting needed).""" aclient = AsyncOpenAI(api_key=api_key, base_url=base_url) processed = 0 @@ -457,7 +484,12 @@ async def process_batches(): hall_type = HALLUCINATION_TYPES[global_idx % len(HALLUCINATION_TYPES)] query = queries.get(instance_id, "") - context = inst.get("problem_statement", "") + source_data = source_cache.get(instance_id, {}) + context = ( + build_source_context(source_data) + if source_data + else inst.get("problem_statement", "") + ) instance_docs = docs.get(instance_id, {}) tasks.append( diff --git a/scripts/code_hallucination/pipeline.py b/scripts/code_hallucination/pipeline.py index ea7fc23..59d8d26 100644 --- a/scripts/code_hallucination/pipeline.py +++ b/scripts/code_hallucination/pipeline.py @@ -29,9 +29,21 @@ HALLUCINATED_PATH, MODEL, QUERIES_PATH, + SOURCE_CACHE_DIR, ) +def load_source_cache(instance_ids: list[str]) -> dict[str, dict]: + """Load source cache for given instance IDs.""" + cache = {} + for iid in instance_ids: + cache_path = SOURCE_CACHE_DIR / f"{iid}.json" + if cache_path.exists(): + with open(cache_path) as f: + cache[iid] = json.load(f) + return cache + + def load_jsonl_dict(path, key="instance_id", value_key=None) -> dict: """Load a JSONL file into a dict keyed by instance_id.""" result = {} @@ -110,8 +122,16 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m formats = load_jsonl_dict(FORMATS_PATH) docs = load_jsonl_dict(DOCS_PATH, value_key="docs") to_inject = [i for i in selected if i["instance_id"] in targets] + sc = load_source_cache([i["instance_id"] for i in to_inject]) run_inject( - to_inject, formats, queries_dict, docs=docs, api_key=api_key, base_url=base_url, model=model + to_inject, + formats, + queries_dict, + docs=docs, + source_cache=sc, + api_key=api_key, + base_url=base_url, + model=model, ) # Phase 7: Assemble @@ -210,11 +230,13 @@ def main(): docs = load_jsonl_dict(DOCS_PATH, value_key="docs") targets = select_hallucination_targets(instances) to_inject = [i for i in instances if i["instance_id"] in targets] + sc = load_source_cache([i["instance_id"] for i in to_inject]) run( to_inject, formats, queries, docs=docs, + source_cache=sc, api_key=args.api_key, base_url=args.base_url, model=args.model, diff --git a/scripts/code_hallucination/sample_assembler.py b/scripts/code_hallucination/sample_assembler.py index bdcbd61..7bf6157 100644 --- a/scripts/code_hallucination/sample_assembler.py +++ b/scripts/code_hallucination/sample_assembler.py @@ -2,7 +2,7 @@ import json -from .config import DATASET_PATH, METADATA_PATH, SOURCE_CACHE_DIR +from .config import DATASET_PATH, MAX_PROMPT_CHARS, METADATA_PATH, SOURCE_CACHE_DIR def build_prompt( @@ -24,7 +24,10 @@ def build_prompt( parts.append(f"User request: {user_query}") - return "\n\n".join(parts) + prompt = "\n\n".join(parts) + if len(prompt) > MAX_PROMPT_CHARS: + prompt = prompt[:MAX_PROMPT_CHARS] + return prompt def assemble_samples( From 406ed946acd472b2002cacbc40d5ff565af05f74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Fri, 6 Mar 2026 10:54:42 +0100 Subject: [PATCH 7/9] Ground hallucinations --- .../hallucination_injector.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py index 58ff0d4..46d676d 100644 --- a/scripts/code_hallucination/hallucination_injector.py +++ b/scripts/code_hallucination/hallucination_injector.py @@ -29,24 +29,30 @@ You are a code hallucination injector for building a hallucination detection dataset. Given a correct answer (which may be pure code OR code with natural language explanation) - and context, create a hallucinated version with specific types of errors. + and SOURCE CODE CONTEXT, create a hallucinated version with specific types of errors. + + CRITICAL: Every error you inject MUST BE DETECTABLE by comparing the answer against + the provided source code context AND/OR the user's request. A human reading the + source files and user query must be able to spot that the hallucinated part + contradicts what's in the source or what the user asked for. Do NOT inject errors + that require running the code or external knowledge to detect. Hallucination types: - - STRUCTURAL: Change a function call, import, or parameter to something that - doesn't exist or is wrong. Code should still parse but reference non-existent - APIs, wrong methods, or invented parameters. - - BEHAVIORAL: Use correct APIs but with wrong values or logic. Wrong defaults, - off-by-one errors, swapped conditions, wrong argument values. - - SEMANTIC: Code that looks like it addresses the user's request but does - something subtly different or opposite. The code parses, uses real APIs, - but fails to do what was asked. If library documentation is provided, - you can make the code contradict the documented API (wrong parameter names, - wrong return types, deprecated usage, etc.). - For answers with explanations, you may also make the explanation contradict - the code or describe incorrect behavior. + - STRUCTURAL: Change a function/method/class name, import, or parameter to something + that does NOT exist in the source context. For example, rename a method call to one + that isn't defined in the provided source files, or add a parameter that the function + doesn't accept according to the source. + - BEHAVIORAL: Use correct API names from the source but with wrong values or logic + that contradicts the source. Wrong default values (different from source), swapped + conditions, wrong argument order compared to the function signature in source. + - SEMANTIC: Code that contradicts the source's behavior, the user's request, or the + explanation contradicts what the source code actually does. For example: claim a + function returns X when the source shows it returns Y, describe wrong control flow, + or solve a different problem than what the user asked for. Rules: - Make 2-3 DISTINCT changes spread across different parts of the answer + - Each change MUST contradict something visible in the source code or user request - Each changed span must be 20-150 characters long (not too short, not too long) - Total hallucinated text must be LESS THAN 40% of the original answer length - Keep most of the answer CORRECT — do NOT rewrite the entire thing @@ -54,11 +60,7 @@ - Make changes PLAUSIBLE — something an LLM would realistically generate - Changes must be SUBTLE, not obviously broken - The code in the hallucinated answer must still be syntactically valid - - Do NOT add comments explaining or hinting at the hallucination (no "# wrong", - "# error", "# typo", "# nonexistent", etc.) — the errors must be invisible - to someone skimming the answer - - If the answer contains both code and explanation, inject errors in BOTH parts - (e.g. wrong API in code + misleading description in text) + - Do NOT add comments explaining or hinting at the hallucination - Preserve the overall structure: keep markdown formatting, code blocks, etc. Respond in this exact JSON format (no markdown, no code blocks): @@ -68,7 +70,7 @@ { "original": "exact original text that was changed", "hallucinated": "what you changed it to", - "explanation": "why this is a hallucination" + "explanation": "why this is wrong — what does the source code or user request actually say?" } ] } @@ -78,6 +80,7 @@ - "original" must be an exact substring of the correct answer - "hallucinated" must be an exact substring of your hallucinated answer - Each "hallucinated" value must be at least 20 characters long + - Each "explanation" must reference what the source code or user request actually says - Return ONLY valid JSON, nothing else """) From e40b072f373cfa85dc4838bef58d2bd80695633b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Sat, 7 Mar 2026 14:33:12 +0100 Subject: [PATCH 8/9] Strengthen prompts for code hallucination --- scripts/code_hallucination/config.py | 44 ++- scripts/code_hallucination/format_builder.py | 5 +- .../hallucination_injector.py | 372 ++++++++++++++---- scripts/code_hallucination/pipeline.py | 87 ++-- scripts/code_hallucination/query_rewriter.py | 3 +- .../code_hallucination/sample_assembler.py | 6 + scripts/code_hallucination/source_fetcher.py | 80 +++- scripts/code_hallucination/validator.py | 167 +++++++- 8 files changed, 633 insertions(+), 131 deletions(-) diff --git a/scripts/code_hallucination/config.py b/scripts/code_hallucination/config.py index 85bfed4..bb1c7d0 100644 --- a/scripts/code_hallucination/config.py +++ b/scripts/code_hallucination/config.py @@ -5,7 +5,8 @@ # === Paths === PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent -DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination" +DEFAULT_DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination" +DATA_DIR = Path(os.environ.get("CODE_HALLUCINATION_OUTPUT_DIR", str(DEFAULT_DATA_DIR))) REPOS_DIR = DATA_DIR / "repos" SOURCE_CACHE_DIR = DATA_DIR / "source_cache" @@ -21,6 +22,37 @@ METADATA_PATH = DATA_DIR / "code_hallucination_metadata.json" VALIDATION_REPORT_PATH = DATA_DIR / "validation_report.txt" + +def set_output_dir(path: str | os.PathLike[str]) -> Path: + """Redirect all pipeline outputs to a specific directory.""" + global DATA_DIR + global REPOS_DIR + global SOURCE_CACHE_DIR + global INSTANCES_PATH + global QUERIES_PATH + global DOCS_PATH + global FORMATS_PATH + global HALLUCINATED_PATH + global DATASET_PATH + global METADATA_PATH + global VALIDATION_REPORT_PATH + + DATA_DIR = Path(path) + REPOS_DIR = DATA_DIR / "repos" + SOURCE_CACHE_DIR = DATA_DIR / "source_cache" + INSTANCES_PATH = DATA_DIR / "swebench_instances.json" + QUERIES_PATH = DATA_DIR / "queries.jsonl" + DOCS_PATH = DATA_DIR / "documentation.jsonl" + FORMATS_PATH = DATA_DIR / "formats.jsonl" + HALLUCINATED_PATH = DATA_DIR / "hallucinated_samples.jsonl" + DATASET_PATH = DATA_DIR / "code_hallucination_data.json" + METADATA_PATH = DATA_DIR / "code_hallucination_metadata.json" + VALIDATION_REPORT_PATH = DATA_DIR / "validation_report.txt" + + os.environ["CODE_HALLUCINATION_OUTPUT_DIR"] = str(DATA_DIR) + return DATA_DIR + + # === LLM API Config === # Override via env vars or CLI args API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1") @@ -55,3 +87,13 @@ # SWE-bench datasets SWEBENCH_FULL = "princeton-nlp/SWE-bench" SWEBENCH_LITE = "princeton-nlp/SWE-bench_Lite" + +# Models that require max_completion_tokens instead of max_tokens +_REASONING_MODEL_PREFIXES = ("o1", "o3", "o4", "gpt-5") + + +def token_limit_kwargs(model: str, max_tokens: int = 4000) -> dict: + """Return the right token-limit kwarg for the given model.""" + if any(model.startswith(p) for p in _REASONING_MODEL_PREFIXES): + return {"max_completion_tokens": max_tokens, "reasoning_effort": "none"} + return {"max_tokens": max_tokens} diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py index 0f0a660..60f912e 100644 --- a/scripts/code_hallucination/format_builder.py +++ b/scripts/code_hallucination/format_builder.py @@ -24,6 +24,7 @@ MODEL, RETRY_DELAY, SOURCE_CACHE_DIR, + token_limit_kwargs, ) EXPLANATION_SYSTEM_PROMPT = textwrap.dedent("""\ @@ -78,7 +79,7 @@ def _generate_explanation( {"role": "user", "content": user_msg}, ], temperature=LLM_TEMPERATURE, - max_tokens=200, + **token_limit_kwargs(model, 200), ) result = response.choices[0].message.content.strip() # Verify the code is actually in the response @@ -122,7 +123,7 @@ async def _generate_explanation_async( {"role": "user", "content": user_msg}, ], temperature=LLM_TEMPERATURE, - max_tokens=200, + **token_limit_kwargs(model, 200), ) result = response.choices[0].message.content.strip() if code[:50] in result or "```" in result: diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py index 46d676d..a1d470f 100644 --- a/scripts/code_hallucination/hallucination_injector.py +++ b/scripts/code_hallucination/hallucination_injector.py @@ -23,67 +23,119 @@ MAX_RETRIES, MODEL, RETRY_DELAY, + token_limit_kwargs, ) INJECTION_SYSTEM_PROMPT = textwrap.dedent("""\ You are a code hallucination injector for building a hallucination detection dataset. Given a correct answer (which may be pure code OR code with natural language explanation) - and SOURCE CODE CONTEXT, create a hallucinated version with specific types of errors. - - CRITICAL: Every error you inject MUST BE DETECTABLE by comparing the answer against - the provided source code context AND/OR the user's request. A human reading the - source files and user query must be able to spot that the hallucinated part - contradicts what's in the source or what the user asked for. Do NOT inject errors - that require running the code or external knowledge to detect. + and SOURCE CODE CONTEXT, return ONLY a small set of localized replacement edits that will + turn the answer into a hallucinated answer. + + IMPORTANT: You are NOT allowed to rewrite the full answer. + - Return replacement edits only. + - The pipeline will apply those edits to the original answer. + - Outside the returned edits, the answer must remain unchanged. + + IMPORTANT: Only inject hallucinations into CODE portions of the answer. + - If the answer contains markdown code fences, edits must be inside the fenced code block(s). + - Do NOT modify natural language explanations before or after the code block. + - Do NOT add explanatory comments inside code. + - The explanation text must remain correct and neutral; only the code should be wrong. + + CRITICAL RULES FOR GROUNDING: + - Every error you inject MUST BE DETECTABLE by comparing the answer against + the provided source code context AND/OR the user's request. + - ONLY reference functions, methods, classes, variables, and parameters that + appear in the PROVIDED source context. Do NOT use your own knowledge of the + library — pretend you only know what's in the context. + - A human reading ONLY the source files and user query must be able to spot + that the hallucinated part is wrong. If they can't, the hallucination is useless. + - Do NOT inject errors that require running code, external docs, or knowledge + beyond what's in the provided context to detect. Hallucination types: - - STRUCTURAL: Change a function/method/class name, import, or parameter to something - that does NOT exist in the source context. For example, rename a method call to one - that isn't defined in the provided source files, or add a parameter that the function - doesn't accept according to the source. - - BEHAVIORAL: Use correct API names from the source but with wrong values or logic - that contradicts the source. Wrong default values (different from source), swapped - conditions, wrong argument order compared to the function signature in source. - - SEMANTIC: Code that contradicts the source's behavior, the user's request, or the - explanation contradicts what the source code actually does. For example: claim a - function returns X when the source shows it returns Y, describe wrong control flow, - or solve a different problem than what the user asked for. + - STRUCTURAL: Change a function/method/class name to something that does NOT + appear anywhere in the provided source context. + - BEHAVIORAL: Use correct names from the source but with wrong values or logic + that visibly contradicts the source. + - SEMANTIC: Make the CODE solve a different problem than the user asked for, or + make the code behave differently than what the source context shows. Rules: - - Make 2-3 DISTINCT changes spread across different parts of the answer - - Each change MUST contradict something visible in the source code or user request - - Each changed span must be 20-150 characters long (not too short, not too long) - - Total hallucinated text must be LESS THAN 40% of the original answer length + - Make 1-3 DISTINCT replacement edits spread across different parts of the answer + - Each edit MUST contradict something VISIBLE in the provided source code or user request + - Do NOT reference functions/classes/methods not present in the provided context + - Do NOT make any unlabeled edits outside the returned replacement edits + - Each replacement span must be 12-120 characters long and as small as possible + - Total hallucinated text must be LESS THAN 30% of the original answer length - Keep most of the answer CORRECT — do NOT rewrite the entire thing - - Changes should be in different functions/blocks/paragraphs, not adjacent lines + - Changes should be in different functions/blocks, not adjacent lines - Make changes PLAUSIBLE — something an LLM would realistically generate - Changes must be SUBTLE, not obviously broken - - The code in the hallucinated answer must still be syntactically valid + - The edited code must still be syntactically valid - Do NOT add comments explaining or hinting at the hallucination - - Preserve the overall structure: keep markdown formatting, code blocks, etc. + - Do NOT add words like BUG, wrong, incorrect, deprecated, hallucination, fix, helper + - Do NOT include editorial text that describes the mistake inside the answer itself + - Preserve the overall structure: keep markdown formatting, code blocks, indentation, imports, and surrounding text unchanged + - Do NOT add or remove markdown fences + - Do NOT add explanation text, tutorial text, wrapper text, or placeholder text + - Do NOT add imports, helper functions, or surrounding code + - Prefer changing existing lines over insertions or deletions + - Each edit must replace an existing substring of the original answer; no insert-only edits Respond in this exact JSON format (no markdown, no code blocks): { - "hallucinated_code": "the full modified answer with hallucinations injected", "changes": [ { - "original": "exact original text that was changed", - "hallucinated": "what you changed it to", - "explanation": "why this is wrong — what does the source code or user request actually say?" + "original": "exact original substring from the correct answer", + "hallucinated": "replacement text for that substring", + "left_context": "up to 40 exact characters immediately before the original substring in the correct answer", + "right_context": "up to 40 exact characters immediately after the original substring in the correct answer", + "target_zone": "code", + "explanation": "why this replacement is wrong according to the source code or user request" } ] } IMPORTANT: - - You MUST include 2-3 changes in the "changes" array - - "original" must be an exact substring of the correct answer - - "hallucinated" must be an exact substring of your hallucinated answer - - Each "hallucinated" value must be at least 20 characters long + - You MUST include 1-3 changes in the "changes" array + - The returned changes must be sufficient to construct the full hallucinated answer + - "original" must be a non-empty exact substring of the correct answer + - Before returning, verify that each "original" substring appears verbatim in the provided correct answer + - Prefer substrings that appear exactly once in the correct answer + - If a substring appears multiple times, use left_context and right_context that disambiguate a single occurrence + - "hallucinated" is the exact replacement text for that substring + - "left_context" and "right_context" must come from the original correct answer, not a rewritten one + - "target_zone" must always be "code" - Each "explanation" must reference what the source code or user request actually says + - If you cannot find 1-3 exact editable substrings in the provided answer, return {"changes": []} - Return ONLY valid JSON, nothing else """) +LEAKY_TERMS = ( + "bug", + "wrong", + "incorrect", + "incorrectly", + "deprecated", + "hallucination", + "helper method", + "should be replaced", +) +PROMPT_RESIDUE = ( + "Generate a hallucinated version", + "Return JSON only", + "hallucinated_code", + "target_zone", + "left_context", + "right_context", +) +MAX_LABEL_COVERAGE = 0.30 +MAX_LABEL_SPAN_CHARS = 500 +MIN_LABEL_SPAN_CHARS = 12 + def build_source_context(source_data: dict) -> str: """Build source code context string from cached source data. @@ -108,10 +160,7 @@ def inject_hallucination( context: str = "", documentation: dict[str, str] | None = None, ) -> dict | None: - """Inject a hallucination and get back structured JSON with spans. - - Returns dict with 'hallucinated_code' and 'changes', or None if failed. - """ + """Request structured replacement edits for hallucination injection.""" docs_section = "" if documentation: docs_parts = [f"Documentation for {lib}:\n{doc}" for lib, doc in documentation.items()] @@ -127,10 +176,10 @@ def inject_hallucination( Context (source code): {context}{docs_section} -Correct code to modify: +Correct answer to modify: {clean_answer} -Generate a hallucinated version with {hall_type} error(s). Return JSON only.""" +Return ONLY replacement edits for {hall_type} error(s). Do not return the full rewritten answer.""" for attempt in range(MAX_RETRIES): try: @@ -141,7 +190,7 @@ def inject_hallucination( {"role": "user", "content": user_msg}, ], temperature=HALLUCINATION_TEMPERATURE, - max_tokens=4000, + **token_limit_kwargs(model), ) raw = response.choices[0].message.content.strip() @@ -154,13 +203,11 @@ def inject_hallucination( result = json.loads(json_match.group()) - if "hallucinated_code" not in result or "changes" not in result: + if "changes" not in result or not isinstance(result["changes"], list): if attempt < MAX_RETRIES - 1: continue return None - - # Verify the hallucinated code is actually different - if result["hallucinated_code"].strip() == clean_answer.strip(): + if not result["changes"]: if attempt < MAX_RETRIES - 1: continue return None @@ -176,41 +223,159 @@ def inject_hallucination( return None -def compute_span_offsets(hallucinated_code: str, hallucinated_span: str) -> list[dict]: - """Find character offsets of a hallucinated span within the answer code.""" - spans = [] - idx = hallucinated_code.find(hallucinated_span) - if idx != -1: - spans.append({"start": idx, "end": idx + len(hallucinated_span)}) - return spans +def _find_all_occurrences(text: str, pattern: str) -> list[dict]: + """Return all exact matches of pattern in text.""" + if not pattern: + return [] + offsets = [] + start = 0 + while True: + idx = text.find(pattern, start) + if idx == -1: + break + offsets.append({"start": idx, "end": idx + len(pattern)}) + start = idx + 1 + return offsets + +def _truncate_context(text: str, max_chars: int = 40) -> str: + """Normalize context fields to the same length budget used in the prompt.""" + if len(text) <= max_chars: + return text + return text[-max_chars:] -def build_labels_from_changes( - hallucinated_code: str, changes: list[dict], hall_type: str -) -> list[dict]: - """Build span labels by finding each hallucinated string in the code. - Only includes spans where the hallucinated text is actually found in the answer. +def _extract_code_regions(answer: str) -> list[tuple[int, int]]: + """Return ranges that correspond to markdown fenced code blocks. + + If no fenced blocks are present, treat the whole answer as code. """ - labels = [] + regions = [] + idx = 0 + while True: + start = answer.find("```", idx) + if start == -1: + break + code_start = answer.find("\n", start + 3) + if code_start == -1: + break + code_start += 1 + end = answer.find("```", code_start) + if end == -1: + break + regions.append((code_start, end)) + idx = end + 3 + if not regions: + return [(0, len(answer))] + return regions + + +def _span_is_in_code(answer: str, start: int, end: int) -> bool: + """Check whether a span lies fully inside a code region.""" + for code_start, code_end in _extract_code_regions(answer): + if start >= code_start and end <= code_end: + return True + return False + + +def _contains_leakage(text: str) -> bool: + """Detect obvious synthetic giveaway text inside a label span.""" + lowered = text.lower() + return any(term in lowered for term in LEAKY_TERMS) + + +def _max_allowed_coverage(answer_len: int) -> float: + """Use a looser coverage cap for short answers and fragments.""" + if answer_len <= 400: + return 0.40 + if answer_len <= 800: + return 0.35 + return MAX_LABEL_COVERAGE + + +def _locate_original_change(original_answer: str, change: dict) -> dict | None: + """Locate a replacement span in the original answer using substring plus context.""" + original_span = change.get("original", "") + hallucinated_span = change.get("hallucinated", "") + if not original_span or not hallucinated_span: + return None + if change.get("target_zone") not in (None, "code"): + return None + + offsets = _find_all_occurrences(original_answer, original_span) + if not offsets: + return None + + left_context = _truncate_context(change.get("left_context", "")) + right_context = _truncate_context(change.get("right_context", "")) + filtered = [] + for offset in offsets: + start = offset["start"] + end = offset["end"] + observed_left = _truncate_context( + original_answer[max(0, start - len(left_context)) : start] + ) + observed_right = original_answer[end : end + len(right_context)] + left_ok = not left_context or observed_left == left_context + right_ok = not right_context or observed_right == right_context + if left_ok and right_ok: + filtered.append(offset) + + matches = filtered or offsets + if len(matches) != 1: + return None + + return { + "start": matches[0]["start"], + "end": matches[0]["end"], + "original": original_span, + "hallucinated": hallucinated_span, + } + + +def apply_changes_to_answer( + original_answer: str, changes: list[dict], hall_type: str +) -> tuple[str, list[dict]] | tuple[None, None]: + """Apply structured replacement edits to the original answer and build labels. + + The model returns edits only. This function deterministically constructs the + hallucinated answer and the corresponding label offsets. + """ + located = [] for change in changes: - h_span = change.get("hallucinated", "") - if not h_span or len(h_span) < 15: - continue - if h_span not in hallucinated_code: - continue - - offsets = compute_span_offsets(hallucinated_code, h_span) - for offset in offsets[:1]: # First occurrence only - labels.append( - { - "start": offset["start"], - "end": offset["end"], - "label": hall_type, - } - ) + if len(change.get("hallucinated", "")) < MIN_LABEL_SPAN_CHARS: + return None, None + located_change = _locate_original_change(original_answer, change) + if located_change is None: + return None, None + located.append(located_change) + + # Reject overlapping edits in the original answer. + located.sort(key=lambda item: (item["start"], item["end"])) + previous_end = -1 + for item in located: + if item["start"] < previous_end: + return None, None + previous_end = item["end"] + + hallucinated_parts = [] + labels = [] + cursor = 0 + for item in located: + start = item["start"] + end = item["end"] + hallucinated_span = item["hallucinated"] - return labels + hallucinated_parts.append(original_answer[cursor:start]) + label_start = sum(len(part) for part in hallucinated_parts) + hallucinated_parts.append(hallucinated_span) + label_end = label_start + len(hallucinated_span) + labels.append({"start": label_start, "end": label_end, "label": hall_type}) + cursor = end + + hallucinated_parts.append(original_answer[cursor:]) + hallucinated_answer = "".join(hallucinated_parts) + return hallucinated_answer, labels def load_existing_hallucinations(path=HALLUCINATED_PATH) -> dict[str, dict]: @@ -252,10 +417,10 @@ async def _inject_one_async( Context (source code): {context}{docs_section} -Correct code to modify: +Correct answer to modify: {clean_answer} -Generate a hallucinated version with {hall_type} error(s). Return JSON only.""" +Return ONLY replacement edits for {hall_type} error(s). Do not return the full rewritten answer.""" for attempt in range(MAX_RETRIES): try: @@ -266,16 +431,16 @@ async def _inject_one_async( {"role": "user", "content": user_msg}, ], temperature=HALLUCINATION_TEMPERATURE, - max_tokens=4000, + **token_limit_kwargs(model), ) raw = response.choices[0].message.content.strip() json_match = re.search(r"\{[\s\S]*\}", raw) if not json_match: continue result = json.loads(json_match.group()) - if "hallucinated_code" not in result or "changes" not in result: + if "changes" not in result or not isinstance(result["changes"], list): continue - if result["hallucinated_code"].strip() == clean_answer.strip(): + if not result["changes"]: continue return result except Exception: @@ -286,7 +451,9 @@ async def _inject_one_async( return None -def _validate_labels(hallucinated_code: str, labels: list[dict]) -> tuple[bool, str]: +def _validate_labels( + original_answer: str, hallucinated_code: str, labels: list[dict], format_type: str +) -> tuple[bool, str]: """Validate that hallucination labels meet quality thresholds. :return: (is_valid, reason) tuple. @@ -294,17 +461,46 @@ def _validate_labels(hallucinated_code: str, labels: list[dict]) -> tuple[bool, if not labels: return False, "no_labels" + # Reject prompt contamination (LLM leaked its instructions into the answer) + for residue in PROMPT_RESIDUE: + if residue in hallucinated_code: + return False, f"prompt_residue ({residue[:30]})" + + # Reject unbalanced code fences for code_with_explanation + if format_type == "code_with_explanation": + fence_count = hallucinated_code.count("```") + if fence_count % 2 != 0: + return False, f"unbalanced_fences ({fence_count})" + if fence_count == 0: + return False, "no_code_fences" + total_span = sum(lab["end"] - lab["start"] for lab in labels) code_len = len(hallucinated_code) if hallucinated_code else 1 coverage = total_span / code_len - if coverage > 0.60: - return False, f"coverage_too_high ({coverage:.0%})" + max_coverage = _max_allowed_coverage(code_len) + if coverage > max_coverage: + return False, f"coverage_too_high ({coverage:.0%} > {max_coverage:.0%})" + previous_end = -1 for lab in labels: span_len = lab["end"] - lab["start"] - if span_len < 15: + if span_len < MIN_LABEL_SPAN_CHARS: return False, f"span_too_short ({span_len} chars)" + if span_len > MAX_LABEL_SPAN_CHARS: + return False, f"span_too_long ({span_len} chars)" + if lab["start"] < previous_end: + return False, "overlapping_or_unsorted_labels" + previous_end = lab["end"] + + span_text = hallucinated_code[lab["start"] : lab["end"]] + if _contains_leakage(span_text): + return False, "leaky_label_text" + + if format_type == "code_with_explanation" and not _span_is_in_code( + hallucinated_code, lab["start"], lab["end"] + ): + return False, "label_outside_code_block" return True, "" @@ -313,11 +509,14 @@ def _process_result(result, instance_id, hall_type, fmt_data, model): """Process a single injection result into a JSONL entry.""" if result is None: return None - hallucinated_code = result["hallucinated_code"] + original_answer = fmt_data.get("answer", "") changes = result.get("changes", []) - labels = build_labels_from_changes(hallucinated_code, changes, hall_type) + hallucinated_code, labels = apply_changes_to_answer(original_answer, changes, hall_type) + if hallucinated_code is None or labels is None: + return None + format_type = fmt_data.get("format_type", "fragment") - valid, reason = _validate_labels(hallucinated_code, labels) + valid, reason = _validate_labels(original_answer, hallucinated_code, labels, format_type) if not valid: return None @@ -327,7 +526,8 @@ def _process_result(result, instance_id, hall_type, fmt_data, model): "labels": labels, "hallucination_type": hall_type, "injector_model": model, - "format_type": fmt_data.get("format_type", "fragment"), + "format_type": format_type, + "changes": changes, } diff --git a/scripts/code_hallucination/pipeline.py b/scripts/code_hallucination/pipeline.py index 59d8d26..aa875d7 100644 --- a/scripts/code_hallucination/pipeline.py +++ b/scripts/code_hallucination/pipeline.py @@ -20,24 +20,15 @@ import json import random -from .config import ( - API_BASE_URL, - API_KEY, - DATA_DIR, - DOCS_PATH, - FORMATS_PATH, - HALLUCINATED_PATH, - MODEL, - QUERIES_PATH, - SOURCE_CACHE_DIR, -) +from . import config +from .config import API_BASE_URL, API_KEY, MODEL def load_source_cache(instance_ids: list[str]) -> dict[str, dict]: """Load source cache for given instance IDs.""" cache = {} for iid in instance_ids: - cache_path = SOURCE_CACHE_DIR / f"{iid}.json" + cache_path = config.SOURCE_CACHE_DIR / f"{iid}.json" if cache_path.exists(): with open(cache_path) as f: cache[iid] = json.load(f) @@ -62,6 +53,16 @@ def load_jsonl_dict(path, key="instance_id", value_key=None) -> dict: return result +def filter_instances_by_splits(instances: list[dict], splits: list[str] | None) -> list[dict]: + """Optionally filter instances to a subset of SWE-bench splits.""" + if not splits: + return instances + split_set = set(splits) + filtered = [inst for inst in instances if inst.get("split") in split_set] + print(f"Using splits {sorted(split_set)}: {len(filtered)}/{len(instances)} instances") + return filtered + + def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, model: str = MODEL): """Run a quick test with n instances from the test split.""" print("=" * 60) @@ -81,8 +82,8 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m print(f"Selected {len(selected)} test instances") # Save temporary instances - DATA_DIR.mkdir(parents=True, exist_ok=True) - test_path = DATA_DIR / "test_instances.json" + config.DATA_DIR.mkdir(parents=True, exist_ok=True) + test_path = config.DATA_DIR / "test_instances.json" with open(test_path, "w") as f: json.dump(selected, f, indent=2) @@ -108,7 +109,7 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m # Phase 5: Assign formats (needs LLM for code_with_explanation) from .format_builder import run as run_formats - queries_dict = load_jsonl_dict(QUERIES_PATH, value_key="query") + queries_dict = load_jsonl_dict(config.QUERIES_PATH, value_key="query") run_formats(selected, api_key=api_key, base_url=base_url, model=model, queries=queries_dict) # Phase 8: Select targets (before phase 6) @@ -119,8 +120,8 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m # Phase 6: Inject hallucinations from .hallucination_injector import run as run_inject - formats = load_jsonl_dict(FORMATS_PATH) - docs = load_jsonl_dict(DOCS_PATH, value_key="docs") + formats = load_jsonl_dict(config.FORMATS_PATH) + docs = load_jsonl_dict(config.DOCS_PATH, value_key="docs") to_inject = [i for i in selected if i["instance_id"] in targets] sc = load_source_cache([i["instance_id"] for i in to_inject]) run_inject( @@ -137,7 +138,7 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m # Phase 7: Assemble from .sample_assembler import run as run_assemble - hallucinations = load_jsonl_dict(HALLUCINATED_PATH) + hallucinations = load_jsonl_dict(config.HALLUCINATED_PATH) samples, metadata = run_assemble(selected, queries_dict, docs, formats, hallucinations, targets) # Phase 9: Validate @@ -171,8 +172,23 @@ def main(): parser.add_argument("--api-key", type=str, default=API_KEY, help="LLM API key") parser.add_argument("--base-url", type=str, default=API_BASE_URL, help="LLM API base URL") parser.add_argument("--model", type=str, default=MODEL, help="LLM model name") + parser.add_argument( + "--output-dir", + type=str, + help="Optional output directory for all intermediate and final pipeline files", + ) + parser.add_argument( + "--splits", + nargs="+", + choices=["train", "dev", "test"], + help="Optional SWE-bench splits to operate on", + ) args = parser.parse_args() + if args.output_dir: + output_dir = config.set_output_dir(args.output_dir) + print(f"Using output directory: {output_dir}") + if args.test: run_test(args.test, api_key=args.api_key, base_url=args.base_url, model=args.model) return @@ -196,24 +212,29 @@ def main(): from .source_fetcher import run from .swebench_loader import load_instances - run(load_instances()) + run(filter_instances_by_splits(load_instances(), args.splits)) elif phase == 3: from .query_rewriter import run from .swebench_loader import load_instances - run(load_instances(), api_key=args.api_key, base_url=args.base_url, model=args.model) + run( + filter_instances_by_splits(load_instances(), args.splits), + api_key=args.api_key, + base_url=args.base_url, + model=args.model, + ) elif phase == 4: from .context7_docs import run from .swebench_loader import load_instances - run(load_instances()) + run(filter_instances_by_splits(load_instances(), args.splits)) elif phase == 5: from .format_builder import run from .swebench_loader import load_instances - queries = load_jsonl_dict(QUERIES_PATH, value_key="query") + queries = load_jsonl_dict(config.QUERIES_PATH, value_key="query") run( - load_instances(), + filter_instances_by_splits(load_instances(), args.splits), api_key=args.api_key, base_url=args.base_url, model=args.model, @@ -224,10 +245,10 @@ def main(): from .splitter import select_hallucination_targets from .swebench_loader import load_instances - instances = load_instances() - formats = load_jsonl_dict(FORMATS_PATH) - queries = load_jsonl_dict(QUERIES_PATH, value_key="query") - docs = load_jsonl_dict(DOCS_PATH, value_key="docs") + instances = filter_instances_by_splits(load_instances(), args.splits) + formats = load_jsonl_dict(config.FORMATS_PATH) + queries = load_jsonl_dict(config.QUERIES_PATH, value_key="query") + docs = load_jsonl_dict(config.DOCS_PATH, value_key="docs") targets = select_hallucination_targets(instances) to_inject = [i for i in instances if i["instance_id"] in targets] sc = load_source_cache([i["instance_id"] for i in to_inject]) @@ -246,18 +267,18 @@ def main(): from .splitter import select_hallucination_targets from .swebench_loader import load_instances - instances = load_instances() - queries = load_jsonl_dict(QUERIES_PATH, value_key="query") - docs = load_jsonl_dict(DOCS_PATH, value_key="docs") - formats = load_jsonl_dict(FORMATS_PATH) - hallucinations = load_jsonl_dict(HALLUCINATED_PATH) + instances = filter_instances_by_splits(load_instances(), args.splits) + queries = load_jsonl_dict(config.QUERIES_PATH, value_key="query") + docs = load_jsonl_dict(config.DOCS_PATH, value_key="docs") + formats = load_jsonl_dict(config.FORMATS_PATH) + hallucinations = load_jsonl_dict(config.HALLUCINATED_PATH) targets = select_hallucination_targets(instances) run(instances, queries, docs, formats, hallucinations, targets) elif phase == 8: from .splitter import run from .swebench_loader import load_instances - run(load_instances()) + run(filter_instances_by_splits(load_instances(), args.splits)) elif phase == 9: from .validator import run diff --git a/scripts/code_hallucination/query_rewriter.py b/scripts/code_hallucination/query_rewriter.py index 65f3a16..5b0e512 100644 --- a/scripts/code_hallucination/query_rewriter.py +++ b/scripts/code_hallucination/query_rewriter.py @@ -14,6 +14,7 @@ MODEL, QUERIES_PATH, RETRY_DELAY, + token_limit_kwargs, ) REWRITE_SYSTEM_PROMPT = textwrap.dedent("""\ @@ -52,7 +53,7 @@ def llm_call( {"role": "user", "content": user}, ], temperature=temperature, - max_tokens=max_tokens, + **token_limit_kwargs(model, max_tokens), ) return response.choices[0].message.content.strip() except Exception as e: diff --git a/scripts/code_hallucination/sample_assembler.py b/scripts/code_hallucination/sample_assembler.py index 7bf6157..44147f9 100644 --- a/scripts/code_hallucination/sample_assembler.py +++ b/scripts/code_hallucination/sample_assembler.py @@ -92,6 +92,12 @@ def assemble_samples( if not answer.strip(): continue + # Reject code_with_explanation with unbalanced fences + if fmt_data.get("format_type") == "code_with_explanation": + fence_count = answer.count("```") + if fence_count % 2 != 0 or fence_count == 0: + continue + sample = { "prompt": prompt, "answer": answer, diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py index fa24c4e..e579f06 100644 --- a/scripts/code_hallucination/source_fetcher.py +++ b/scripts/code_hallucination/source_fetcher.py @@ -15,6 +15,72 @@ GITHUB_RAW_BASE = "https://raw.githubusercontent.com" +def truncate_around_patch( + full_content: str, patch: str, filepath: str, max_chars: int = MAX_FILE_CHARS +) -> str: + """Truncate a source file keeping the region around the patch. + + Instead of taking the first N chars (which may miss the patched region), + find where the patch applies and keep a window around it, plus the file header + (imports/class definitions). + """ + if len(full_content) <= max_chars: + return full_content + + # Find the hunk start lines from the patch for this file + hunk_lines = [] + in_file = False + for line in patch.split("\n"): + if line.startswith("diff --git"): + match = re.match(r"diff --git a/(.+?) b/(.+)$", line) + in_file = match is not None and match.group(2) == filepath + elif in_file and line.startswith("@@"): + hunk_match = re.match(r"@@ -(\d+)", line) + if hunk_match: + hunk_lines.append(int(hunk_match.group(1))) + + if not hunk_lines: + # Can't find patch location, fall back to first N chars + return full_content[:max_chars] + + lines = full_content.split("\n") + + # Always keep the header (imports, class defs) — first 50 lines or until first function + header_end = min(50, len(lines)) + for i, line in enumerate(lines[:200]): + if line.strip().startswith("def ") or line.strip().startswith("class "): + if i > 20: + header_end = i + break + + header = "\n".join(lines[:header_end]) + header_chars = len(header) + remaining_budget = max_chars - header_chars - 100 # 100 for separator + + if remaining_budget <= 0: + return full_content[:max_chars] + + # Build a window around the patch hunks + min_hunk = min(hunk_lines) - 1 # Convert to 0-based + max_hunk = max(hunk_lines) - 1 + + # Expand window to use the remaining budget + lines_budget = remaining_budget // 80 # Rough estimate: 80 chars per line + padding = max(lines_budget // 2, 30) + + window_start = max(header_end, min_hunk - padding) + window_end = min(len(lines), max_hunk + padding) + + window = "\n".join(lines[window_start:window_end]) + + if window_start > header_end: + result = header + "\n\n# ... (truncated) ...\n\n" + window + else: + result = header + "\n" + window + + return result[:max_chars] + + def extract_changed_files(patch: str) -> list[str]: """Extract file paths from a unified diff using anchored regex. @@ -62,7 +128,7 @@ def fetch_file_from_github(repo: str, commit: str, filepath: str) -> str | None: try: r = requests.get(url, timeout=15) if r.status_code == 200: - return r.text[:MAX_FILE_CHARS] + return r.text return None except Exception: return None @@ -79,7 +145,7 @@ def fetch_file_at_commit(repo_dir: Path, commit: str, filepath: str) -> str | No timeout=30, ) if result.returncode == 0: - return result.stdout[:MAX_FILE_CHARS] + return result.stdout return None except (subprocess.TimeoutExpired, Exception) as e: print(f" Error fetching {filepath}@{commit[:8]}: {e}") @@ -119,8 +185,7 @@ def apply_patch_and_get_file(repo_dir: Path, commit: str, patch: str, filepath: # Read the patched file patched_path = Path(tmpdir) / filepath if patched_path.exists(): - content = patched_path.read_text()[:MAX_FILE_CHARS] - return content + return patched_path.read_text() # Clean up worktree subprocess.run( @@ -458,7 +523,7 @@ def fetch_source_for_instance( # Edit-style format edit_style = build_edit_style_answer(patch, changed_files) - # Complete function format — extract modified functions + # Complete function format — extract modified functions (needs full content) modified_functions = [] for filepath in changed_files: if filepath not in source_files: @@ -470,6 +535,11 @@ def fetch_source_for_instance( func["file"] = filepath modified_functions.extend(funcs) + # Smart truncation AFTER patch application: keep header + patch-relevant regions + # instead of blind first-N-chars truncation + for filepath in list(source_files.keys()): + source_files[filepath] = truncate_around_patch(source_files[filepath], patch, filepath) + return { "instance_id": instance["instance_id"], "changed_files": changed_files, diff --git a/scripts/code_hallucination/validator.py b/scripts/code_hallucination/validator.py index 50d5251..40d65e8 100644 --- a/scripts/code_hallucination/validator.py +++ b/scripts/code_hallucination/validator.py @@ -1,10 +1,31 @@ """Phase 9: Quality checks and validation report.""" import ast +import difflib import json from collections import Counter -from .config import DATASET_PATH, METADATA_PATH, VALIDATION_REPORT_PATH +from .config import DATASET_PATH, FORMATS_PATH, METADATA_PATH, VALIDATION_REPORT_PATH + +LEAKY_TERMS = ( + "bug", + "wrong", + "incorrect", + "incorrectly", + "deprecated", + "hallucination", + "helper method", + "should be replaced", +) + + +def _max_allowed_coverage(answer_len: int) -> float: + """Use a looser coverage cap for short answers and fragments.""" + if answer_len <= 400: + return 0.40 + if answer_len <= 800: + return 0.35 + return 0.30 def validate_spans(samples: list[dict]) -> list[str]: @@ -12,6 +33,8 @@ def validate_spans(samples: list[dict]) -> list[str]: issues = [] for i, sample in enumerate(samples): answer_len = len(sample["answer"]) + previous_end = -1 + seen = set() for label in sample.get("labels", []): start = label.get("start", 0) end = label.get("end", 0) @@ -21,9 +44,140 @@ def validate_spans(samples: list[dict]) -> list[str]: issues.append(f"Sample {i}: empty/inverted span ({start}, {end})") if end > answer_len: issues.append(f"Sample {i}: span exceeds answer length ({end} > {answer_len})") + if start < previous_end: + issues.append(f"Sample {i}: unsorted/overlapping spans ({start} < {previous_end})") + if (start, end, label.get("label")) in seen: + issues.append(f"Sample {i}: duplicate span ({start}, {end})") + seen.add((start, end, label.get("label"))) + previous_end = end return issues +def _extract_code_regions(answer: str) -> list[tuple[int, int]]: + """Return markdown fenced code block ranges, or the whole answer if none.""" + regions = [] + idx = 0 + while True: + start = answer.find("```", idx) + if start == -1: + break + code_start = answer.find("\n", start + 3) + if code_start == -1: + break + code_start += 1 + end = answer.find("```", code_start) + if end == -1: + break + regions.append((code_start, end)) + idx = end + 3 + if not regions: + return [(0, len(answer))] + return regions + + +def _span_is_in_code(answer: str, start: int, end: int) -> bool: + """Check whether a span is fully inside a fenced code region.""" + return any( + start >= code_start and end <= code_end + for code_start, code_end in _extract_code_regions(answer) + ) + + +def _is_whitespace_only_diff(original_text: str, hallucinated_text: str) -> bool: + """Treat pure whitespace edits as ignorable when checking diff coverage.""" + return (original_text or "").strip() == "" and (hallucinated_text or "").strip() == "" + + +def _diff_outside_labels( + original_answer: str, hallucinated_answer: str, labels: list[dict] +) -> list[dict]: + """Return meaningful diffs not covered by any labeled hallucinated span.""" + label_ranges = [(lab["start"], lab["end"]) for lab in labels] + + def is_covered(start: int, end: int) -> bool: + return any( + not (end <= lab_start or start >= lab_end) for lab_start, lab_end in label_ranges + ) + + uncovered = [] + matcher = difflib.SequenceMatcher(a=original_answer, b=hallucinated_answer) + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == "equal": + continue + + original_chunk = original_answer[i1:i2] + hallucinated_chunk = hallucinated_answer[j1:j2] + if _is_whitespace_only_diff(original_chunk, hallucinated_chunk): + continue + + if j1 == j2: + continue + + if not is_covered(j1, j2): + uncovered.append( + { + "tag": tag, + "start": j1, + "end": j2, + "original": original_chunk[:80], + "hallucinated": hallucinated_chunk[:80], + } + ) + + return uncovered + + +def check_label_quality(samples: list[dict], metadata: list[dict]) -> dict: + """Report common synthetic-label issues that should be filtered before training.""" + issues = Counter() + original_answers = {} + if FORMATS_PATH.exists(): + with open(FORMATS_PATH) as f: + for line in f: + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + original_answers[entry.get("instance_id")] = entry.get("answer", "") + + for sample, meta in zip(samples, metadata): + if not sample.get("labels"): + continue + + answer = sample["answer"] + coverage = sum(label["end"] - label["start"] for label in sample["labels"]) / max( + len(answer), 1 + ) + if coverage > _max_allowed_coverage(len(answer)): + issues["coverage_over_30pct"] += 1 + + for label in sample["labels"]: + span_text = answer[label["start"] : label["end"]] + if any(term in span_text.lower() for term in LEAKY_TERMS): + issues["labels_with_leakage_terms"] += 1 + break + + if meta.get("format_type") == "code_with_explanation": + if any( + not _span_is_in_code(answer, label["start"], label["end"]) + for label in sample["labels"] + ): + issues["code_with_explanation_label_outside_code"] += 1 + + original_answer = original_answers.get(meta.get("instance_id")) + if original_answer: + uncovered_diffs = _diff_outside_labels(original_answer, answer, sample["labels"]) + if uncovered_diffs: + issues["diff_outside_labels"] += 1 + if any( + diff["tag"] == "insert" or len(diff["hallucinated"]) >= 20 + for diff in uncovered_diffs + ): + issues["large_diff_outside_labels"] += 1 + + return dict(issues) + + def check_span_coverage(samples: list[dict]) -> dict: """Report span coverage distribution for hallucinated samples.""" coverages = [] @@ -168,14 +322,21 @@ def report(text): report(f"Near duplicates (sampled): {n_dup}") report("") - # 5. AST parseability + # 5. Label quality + report("=== Label Quality ===") + label_quality = check_label_quality(samples, metadata) + for k, v in label_quality.items(): + report(f" {k}: {v}") + report("") + + # 6. AST parseability report("=== AST Parseability ===") ast_check = check_ast_parseability(samples, metadata) for k, v in ast_check.items(): report(f" {k}: {v}") report("") - # 6. Length stats + # 7. Length stats report("=== Length Statistics ===") prompt_lens = [len(s["prompt"]) for s in samples] answer_lens = [len(s["answer"]) for s in samples] From 1ad0f547aa832ed91cf0a0ffa308bf594a11182e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= Date: Mon, 11 May 2026 15:43:03 +0200 Subject: [PATCH 9/9] Changes in injector --- .../hallucination_injector.py | 90 +++++++++++-------- 1 file changed, 55 insertions(+), 35 deletions(-) diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py index a1d470f..77c82c9 100644 --- a/scripts/code_hallucination/hallucination_injector.py +++ b/scripts/code_hallucination/hallucination_injector.py @@ -84,6 +84,8 @@ - Do NOT add imports, helper functions, or surrounding code - Prefer changing existing lines over insertions or deletions - Each edit must replace an existing substring of the original answer; no insert-only edits + - Choose exact substrings that appear exactly once in the original answer whenever possible + - Prefer whole expressions or full lines over tiny fragments Respond in this exact JSON format (no markdown, no code blocks): { @@ -91,23 +93,50 @@ { "original": "exact original substring from the correct answer", "hallucinated": "replacement text for that substring", - "left_context": "up to 40 exact characters immediately before the original substring in the correct answer", - "right_context": "up to 40 exact characters immediately after the original substring in the correct answer", "target_zone": "code", "explanation": "why this replacement is wrong according to the source code or user request" } ] } + Example 1: + Original answer contains: + return self.steps[-1][-1].transform(X) + Good JSON change: + { + "changes": [ + { + "original": "return self.steps[-1][-1].transform(X)", + "hallucinated": "return self.steps[-1][-1].predict(X)", + "target_zone": "code", + "explanation": "The source context shows this method should transform the data, not run prediction." + } + ] + } + + Example 2: + Original answer contains: + if handle_unknown == 'error': + Good JSON change: + { + "changes": [ + { + "original": "if handle_unknown == 'error':", + "hallucinated": "if handle_unknown != 'error':", + "target_zone": "code", + "explanation": "This flips the branch condition and contradicts the intended error handling in the source." + } + ] + } + IMPORTANT: - You MUST include 1-3 changes in the "changes" array - The returned changes must be sufficient to construct the full hallucinated answer - "original" must be a non-empty exact substring of the correct answer - Before returning, verify that each "original" substring appears verbatim in the provided correct answer - Prefer substrings that appear exactly once in the correct answer - - If a substring appears multiple times, use left_context and right_context that disambiguate a single occurrence + - If a substring appears multiple times, pick a different, longer substring that uniquely identifies the target location - "hallucinated" is the exact replacement text for that substring - - "left_context" and "right_context" must come from the original correct answer, not a rewritten one - "target_zone" must always be "code" - Each "explanation" must reference what the source code or user request actually says - If you cannot find 1-3 exact editable substrings in the provided answer, return {"changes": []} @@ -238,13 +267,6 @@ def _find_all_occurrences(text: str, pattern: str) -> list[dict]: return offsets -def _truncate_context(text: str, max_chars: int = 40) -> str: - """Normalize context fields to the same length budget used in the prompt.""" - if len(text) <= max_chars: - return text - return text[-max_chars:] - - def _extract_code_regions(answer: str) -> list[tuple[int, int]]: """Return ranges that correspond to markdown fenced code blocks. @@ -294,7 +316,7 @@ def _max_allowed_coverage(answer_len: int) -> float: def _locate_original_change(original_answer: str, change: dict) -> dict | None: - """Locate a replacement span in the original answer using substring plus context.""" + """Locate a replacement span in the original answer by exact unique match.""" original_span = change.get("original", "") hallucinated_span = change.get("hallucinated", "") if not original_span or not hallucinated_span: @@ -303,36 +325,31 @@ def _locate_original_change(original_answer: str, change: dict) -> dict | None: return None offsets = _find_all_occurrences(original_answer, original_span) - if not offsets: - return None - - left_context = _truncate_context(change.get("left_context", "")) - right_context = _truncate_context(change.get("right_context", "")) - filtered = [] - for offset in offsets: - start = offset["start"] - end = offset["end"] - observed_left = _truncate_context( - original_answer[max(0, start - len(left_context)) : start] - ) - observed_right = original_answer[end : end + len(right_context)] - left_ok = not left_context or observed_left == left_context - right_ok = not right_context or observed_right == right_context - if left_ok and right_ok: - filtered.append(offset) - - matches = filtered or offsets - if len(matches) != 1: + if len(offsets) != 1: return None return { - "start": matches[0]["start"], - "end": matches[0]["end"], + "start": offsets[0]["start"], + "end": offsets[0]["end"], "original": original_span, "hallucinated": hallucinated_span, } +def _sort_changes_by_original_position( + original_answer: str, changes: list[dict] +) -> list[dict] | None: + """Return changes ordered by their matched position in the original answer.""" + located = [] + for change in changes: + loc = _locate_original_change(original_answer, change) + if loc is None: + return None + located.append((loc["start"], loc["end"], change)) + located.sort(key=lambda item: (item[0], item[1])) + return [change for _, _, change in located] + + def apply_changes_to_answer( original_answer: str, changes: list[dict], hall_type: str ) -> tuple[str, list[dict]] | tuple[None, None]: @@ -514,6 +531,9 @@ def _process_result(result, instance_id, hall_type, fmt_data, model): hallucinated_code, labels = apply_changes_to_answer(original_answer, changes, hall_type) if hallucinated_code is None or labels is None: return None + ordered_changes = _sort_changes_by_original_position(original_answer, changes) + if ordered_changes is None: + return None format_type = fmt_data.get("format_type", "fragment") valid, reason = _validate_labels(original_answer, hallucinated_code, labels, format_type) @@ -527,7 +547,7 @@ def _process_result(result, instance_id, hall_type, fmt_data, model): "hallucination_type": hall_type, "injector_model": model, "format_type": format_type, - "changes": changes, + "changes": ordered_changes, }