From 2b8d9114e31e3e682903adf83fbc4809f7acb70c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Thu, 5 Mar 2026 17:37:22 +0100
Subject: [PATCH 1/9] Continue phase 5

---
 scripts/code_hallucination/format_builder.py | 107 +++++++++++--------
 1 file changed, 65 insertions(+), 42 deletions(-)

diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py
index 9d290f7..f371efe 100644
--- a/scripts/code_hallucination/format_builder.py
+++ b/scripts/code_hallucination/format_builder.py
@@ -180,61 +180,84 @@ def run(
     if queries is None:
         queries = {}
 
+    # Load existing for resumability
+    existing = {}
+    if FORMATS_PATH.exists():
+        with open(FORMATS_PATH) as f:
+            for line in f:
+                try:
+                    entry = json.loads(line)
+                    existing[entry["instance_id"]] = entry
+                except (json.JSONDecodeError, KeyError):
+                    continue
+    print(f"Already processed: {len(existing)} formats")
+
+    to_process = [inst for inst in instances if inst["instance_id"] not in existing]
+    print(f"Remaining: {len(to_process)} instances to process")
+
     # Only init LLM client if we'll need it (lazy)
     client = None
 
-    results = []
+    results = list(existing.values())
     format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
+    for entry in results:
+        fmt = entry.get("format_type")
+        if fmt in format_counts:
+            format_counts[fmt] += 1
     skipped = 0
     explanation_failures = 0
+    processed = 0
 
-    for inst in instances:
-        instance_id = inst["instance_id"]
-
-        # Load source data from cache
-        cache_path = source_cache_dir / f"{instance_id}.json"
-        if not cache_path.exists():
-            skipped += 1
-            continue
-
-        with open(cache_path) as f:
-            source_data = json.load(f)
-
-        fmt, answer = assign_format(source_data)
-        if fmt is None:
-            skipped += 1
-            continue
-
-        # Generate explanation wrapper for code_with_explanation format
-        if fmt == "code_with_explanation":
-            if client is None:
-                client = OpenAI(api_key=api_key, base_url=base_url)
-                print(f"  LLM client initialized for code_with_explanation ({base_url})")
-
-            query = queries.get(instance_id, inst.get("problem_statement", "")[:500])
-            context = source_data.get("patch_code", "")
-            explained = _generate_explanation(client, model, answer, query, context)
-
-            if explained is None:
-                # Fallback: use raw code as fragment
-                fmt = "fragment"
-                explanation_failures += 1
-            else:
-                answer = explained
+    with open(FORMATS_PATH, "a") as f:
+        for inst in to_process:
+            instance_id = inst["instance_id"]
+
+            # Load source data from cache
+            cache_path = source_cache_dir / f"{instance_id}.json"
+            if not cache_path.exists():
+                skipped += 1
+                continue
+
+            with open(cache_path) as fp:
+                source_data = json.load(fp)
 
-        results.append(
-            {
+            fmt, answer = assign_format(source_data)
+            if fmt is None:
+                skipped += 1
+                continue
+
+            # Generate explanation wrapper for code_with_explanation format
+            if fmt == "code_with_explanation":
+                if client is None:
+                    client = OpenAI(api_key=api_key, base_url=base_url)
+                    print(f"  LLM client initialized for code_with_explanation ({base_url})")
+
+                query = queries.get(instance_id, inst.get("problem_statement", "")[:500])
+                context = source_data.get("patch_code", "")
+                explained = _generate_explanation(client, model, answer, query, context)
+
+                if explained is None:
+                    # Fallback: use raw code as fragment
+                    fmt = "fragment"
+                    explanation_failures += 1
+                else:
+                    answer = explained
+
+            entry = {
                 "instance_id": instance_id,
                 "format_type": fmt,
                 "answer": answer,
             }
-        )
-        format_counts[fmt] += 1
-
-    # Save
-    with open(FORMATS_PATH, "w") as f:
-        for entry in results:
             f.write(json.dumps(entry) + "\n")
+            f.flush()
+            results.append(entry)
+            format_counts[fmt] += 1
+            processed += 1
+
+            if processed % 100 == 0:
+                print(
+                    f"  Progress: {processed}/{len(to_process)} (failures: {explanation_failures})"
+                )
 
     print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})")
     if explanation_failures:

From c1cd5eb7a2a335d15c4f87031349edf92b59931e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Thu, 5 Mar 2026 17:41:22 +0100
Subject: [PATCH 2/9] Better progressbar

---
 scripts/code_hallucination/context7_docs.py          |  2 +-
 scripts/code_hallucination/format_builder.py         |  2 +-
 scripts/code_hallucination/hallucination_injector.py | 10 ++++------
 scripts/code_hallucination/query_rewriter.py         |  4 ++--
 scripts/code_hallucination/source_fetcher.py         |  4 ++--
 5 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/scripts/code_hallucination/context7_docs.py b/scripts/code_hallucination/context7_docs.py
index b287015..ef60850 100644
--- a/scripts/code_hallucination/context7_docs.py
+++ b/scripts/code_hallucination/context7_docs.py
@@ -228,7 +228,7 @@ def run(instances: list[dict]):
 
             if processed % 100 == 0:
                 print(
-                    f"  Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
+                    f"  Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
                 )
 
     print(
diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py
index f371efe..9ad9dfe 100644
--- a/scripts/code_hallucination/format_builder.py
+++ b/scripts/code_hallucination/format_builder.py
@@ -256,7 +256,7 @@ def run(
 
             if processed % 100 == 0:
                 print(
-                    f"  Progress: {processed}/{len(to_process)} (failures: {explanation_failures})"
+                    f"  Phase 5: {processed}/{len(to_process)} (failures: {explanation_failures})"
                 )
 
     print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})")
diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py
index 941315c..f7bd6d5 100644
--- a/scripts/code_hallucination/hallucination_injector.py
+++ b/scripts/code_hallucination/hallucination_injector.py
@@ -421,8 +421,8 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model
             results.append(entry)
             processed += 1
 
-            if processed % 50 == 0:
-                print(f"  Progress: {processed}/{len(to_process)} (failed: {failed})")
+            if processed % 100 == 0:
+                print(f"  Phase 6: {processed}/{len(to_process)} (failed: {failed})")
 
     print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)")
     return results
@@ -497,11 +497,9 @@ async def process_batches():
                     results.append(entry)
                     processed += 1
 
-                if processed % 50 == 0 or batch_start + BATCH_SIZE >= len(to_process):
+                if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(to_process):
                     total = processed + failed
-                    print(
-                        f"  Progress: {total}/{len(to_process)} ({processed} ok, {failed} failed)"
-                    )
+                    print(f"  Phase 6: {total}/{len(to_process)} ({processed} ok, {failed} failed)")
 
     asyncio.run(process_batches())
     print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)")
diff --git a/scripts/code_hallucination/query_rewriter.py b/scripts/code_hallucination/query_rewriter.py
index 966c70f..65f3a16 100644
--- a/scripts/code_hallucination/query_rewriter.py
+++ b/scripts/code_hallucination/query_rewriter.py
@@ -123,8 +123,8 @@ def run(
                 f.flush()
                 processed += 1
 
-                if processed % 50 == 0:
-                    print(f"  Progress: {processed}/{len(to_process)} (failed: {failed})")
+                if processed % 100 == 0:
+                    print(f"  Phase 3: {processed}/{len(to_process)} (failed: {failed})")
             except Exception as e:
                 print(f"  ERROR {instance_id}: {e}")
                 failed += 1
diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py
index 0fc25da..7eadaee 100644
--- a/scripts/code_hallucination/source_fetcher.py
+++ b/scripts/code_hallucination/source_fetcher.py
@@ -509,8 +509,8 @@ def run(instances: list[dict], use_github_api: bool = False):
     failed = 0
 
     for i, instance in enumerate(instances):
-        if (i + 1) % 100 == 0:
-            print(f"  Progress: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)")
+        if (i + 1) % 100 == 0 or (i + 1) == len(instances):
+            print(f"  Phase 2: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)")
 
         # Skip if already cached
         cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json"

From 83d5421db9d2f373c67ee3124142b7d3b4d2f379 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Thu, 5 Mar 2026 18:24:57 +0100
Subject: [PATCH 3/9] Don't fetch everything

---
 scripts/code_hallucination/source_fetcher.py | 30 +++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py
index 7eadaee..dacebf1 100644
--- a/scripts/code_hallucination/source_fetcher.py
+++ b/scripts/code_hallucination/source_fetcher.py
@@ -43,7 +43,7 @@ def clone_repo(repo: str, repos_dir: Path = REPOS_DIR) -> Path | None:
             ["git", "clone", "--bare", f"https://github.com/{repo}.git", str(repo_dir)],
             capture_output=True,
             text=True,
-            timeout=1800,  # 30 min for large repos
+            timeout=60,  # 1 min timeout, fall back to GitHub API
         )
         if result.returncode != 0:
             print(f"  ERROR cloning {repo}: {result.stderr[:200]}")
@@ -493,17 +493,14 @@ def run(instances: list[dict], use_github_api: bool = False):
     print("=" * 60)
 
     SOURCE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    REPOS_DIR.mkdir(parents=True, exist_ok=True)
 
-    if not use_github_api:
-        REPOS_DIR.mkdir(parents=True, exist_ok=True)
-        # Group by repo for efficient cloning
-        repos = set(inst["repo"] for inst in instances)
-        print(f"Need to clone {len(repos)} repos")
-        for repo in sorted(repos):
-            clone_repo(repo)
-    else:
+    if use_github_api:
         print("Using GitHub raw API (no cloning)")
 
+    # Track repos that failed to clone so we don't retry
+    clone_failed_repos: set[str] = set()
+
     # Fetch sources per instance
     results = []
     failed = 0
@@ -519,10 +516,21 @@ def run(instances: list[dict], use_github_api: bool = False):
                 results.append(json.load(f))
             continue
 
-        result = fetch_source_for_instance(instance, use_github_api=use_github_api)
+        # Try clone first, fall back to GitHub API
+        repo = instance["repo"]
+        use_api_for_this = use_github_api
+        if not use_api_for_this and repo not in clone_failed_repos:
+            repo_dir = clone_repo(repo)
+            if repo_dir is None:
+                clone_failed_repos.add(repo)
+                use_api_for_this = True
+                print(f"  Falling back to GitHub API for {repo}")
+
+        result = fetch_source_for_instance(
+            instance, use_github_api=use_api_for_this or repo in clone_failed_repos
+        )
         if result:
             results.append(result)
-            # Cache result
             cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json"
             with open(cache_path, "w") as f:
                 json.dump(result, f)

From d19bf79d3f82a31a11ebc36d9171eccc3a4c857a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Thu, 5 Mar 2026 19:47:29 +0100
Subject: [PATCH 4/9] Fetch faster

---
 scripts/code_hallucination/source_fetcher.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py
index dacebf1..fa24c4e 100644
--- a/scripts/code_hallucination/source_fetcher.py
+++ b/scripts/code_hallucination/source_fetcher.py
@@ -5,6 +5,7 @@
 import re
 import subprocess
 import tempfile
+import warnings
 from pathlib import Path
 
 import requests
@@ -143,7 +144,9 @@ def extract_modified_functions(original_source: str, patched_source: str) -> lis
     def get_functions(source: str) -> dict[str, str]:
         """Parse source and extract function name -> source mapping."""
         try:
-            tree = ast.parse(source)
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", SyntaxWarning)
+                tree = ast.parse(source)
         except SyntaxError:
             return {}
 
@@ -460,10 +463,7 @@ def fetch_source_for_instance(
     for filepath in changed_files:
         if filepath not in source_files:
             continue
-        if repo_dir is not None:
-            patched_source = apply_patch_and_get_file(repo_dir, commit, patch, filepath)
-        else:
-            patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath)
+        patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath)
         if patched_source:
             funcs = extract_modified_functions(source_files[filepath], patched_source)
             for func in funcs:
@@ -492,6 +492,9 @@ def run(instances: list[dict], use_github_api: bool = False):
     print("Phase 2: Source File Fetching")
     print("=" * 60)
 
+    # Suppress SyntaxWarning from ast.parse on third-party source files
+    warnings.filterwarnings("ignore", category=SyntaxWarning)
+
     SOURCE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
     REPOS_DIR.mkdir(parents=True, exist_ok=True)
 

From 5096af2525d91da6e30feb66dc05bac505cfc049 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Thu, 5 Mar 2026 21:20:59 +0100
Subject: [PATCH 5/9] Better evaluation for code

---
 scripts/evaluate_code_hallucination.py | 29 +++++++++++++++-----------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/scripts/evaluate_code_hallucination.py b/scripts/evaluate_code_hallucination.py
index 87a68fd..2df98c0 100644
--- a/scripts/evaluate_code_hallucination.py
+++ b/scripts/evaluate_code_hallucination.py
@@ -5,12 +5,13 @@
 Supports Groq API with any OpenAI-compatible model.
 
 Usage:
-    # With Groq + Kimi
+    # With Groq
     OPENAI_API_KEY=gsk_... OPENAI_API_BASE=https://api.groq.com/openai/v1 \
         python scripts/evaluate_code_hallucination.py \
         --model moonshotai/kimi-k2-instruct-0905 \
         --data_path data/code_hallucination_lettucedetect_v2.json \
-        --evaluation_type example_level
+        --evaluation_type example_level \
+        --split test
 """
 
 import argparse
@@ -210,9 +211,12 @@ def main():
         help="Limit number of test samples (for quick testing)",
     )
     parser.add_argument(
-        "--test_ratio", type=float, default=0.3, help="Fraction of data to use as test set"
+        "--split",
+        type=str,
+        default="test",
+        choices=["train", "dev", "test"],
+        help="Which split to evaluate on (uses the split field from the dataset)",
     )
-    parser.add_argument("--seed", type=int, default=42)
 
     args = parser.parse_args()
 
@@ -243,14 +247,13 @@ def main():
             )
         )
 
-    # Split into test set
-    import random
-
-    random.seed(args.seed)
-    random.shuffle(samples)
+    # Filter to the requested split
+    test_samples = [s for s in samples if s.split == args.split]
 
-    test_size = int(len(samples) * args.test_ratio)
-    test_samples = samples[:test_size]
+    if not test_samples:
+        available_splits = set(s.split for s in samples)
+        print(f"No samples found for split '{args.split}'. Available splits: {available_splits}")
+        return
 
     if args.max_samples:
         test_samples = test_samples[: args.max_samples]
@@ -260,7 +263,9 @@ def main():
 
     print(f"Dataset: {data_path}")
     print(f"Total samples: {len(samples)}")
-    print(f"Test samples: {len(test_samples)} (positive: {n_positive}, negative: {n_negative})")
+    print(
+        f"Evaluating on '{args.split}' split: {len(test_samples)} samples (positive: {n_positive}, negative: {n_negative})"
+    )
     print(f"Model: {args.model}")
     print(f"API base: {os.getenv('OPENAI_API_BASE', 'https://api.openai.com/v1')}")
 

From 9be1d2ed2f13cd7dedc68133e2cb71f2ad7eda90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Fri, 6 Mar 2026 09:40:56 +0100
Subject: [PATCH 6/9] Smaller prompts

---
 scripts/code_hallucination/config.py          |   1 +
 scripts/code_hallucination/format_builder.py  | 245 ++++++++++++++----
 .../hallucination_injector.py                 |  44 +++-
 scripts/code_hallucination/pipeline.py        |  24 +-
 .../code_hallucination/sample_assembler.py    |   7 +-
 5 files changed, 259 insertions(+), 62 deletions(-)

diff --git a/scripts/code_hallucination/config.py b/scripts/code_hallucination/config.py
index 7f71f99..85bfed4 100644
--- a/scripts/code_hallucination/config.py
+++ b/scripts/code_hallucination/config.py
@@ -36,6 +36,7 @@
 HALLUCINATION_RATIO = 0.4  # 40% hallucinated, 60% clean
 MAX_FILE_CHARS = 12000  # Cap individual source file size
 MAX_CONTEXT7_CHARS = 4000  # Documentation fetch limit
+MAX_PROMPT_CHARS = 24000  # ~6K tokens, leaves room for answer within 8K model context
 
 # === LLM Config ===
 RETRY_DELAY = 2
diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py
index 9ad9dfe..0f0a660 100644
--- a/scripts/code_hallucination/format_builder.py
+++ b/scripts/code_hallucination/format_builder.py
@@ -1,15 +1,21 @@
-"""Phase 5: Assign answer format to each instance."""
+"""Phase 5: Assign answer format to each instance.
 
+Supports both sequential (remote API) and async batch (local vLLM) modes.
+Set the BATCH_SIZE env var to a value > 1 to send parallel requests to a local vLLM server.
+"""
+
+import asyncio
 import json
 import random
 import textwrap
 import time
 
-from openai import OpenAI
+from openai import AsyncOpenAI, OpenAI
 
 from .config import (
     API_BASE_URL,
     API_KEY,
+    BATCH_SIZE,
     FORMAT_TYPES,
     FORMAT_WEIGHTS,
     FORMATS_PATH,
@@ -26,27 +32,24 @@
     that a developer would receive from an AI assistant.
 
     Your response MUST:
-    - Start with a brief explanation (1-3 sentences) of what the issue is and how to fix it
+    - Start with 1-2 sentences explaining what was wrong and how to fix it
     - Include the code in a properly formatted code block (```python)
-    - Optionally end with a short note about what changed or why
+    - Do NOT add anything after the code block
 
     Your response must NOT:
-    - Include phrases like "Here's the fix" or "I'll help you with that" — just explain directly
-    - Be longer than necessary — keep it concise
+    - Include phrases like "Here's the fix" or "I'll help you with that"
+    - Be longer than 2 sentences of explanation + the code block
     - Change the code in any way — use it exactly as provided
     - Add any imports or code not in the original
 
-    Example style:
-    The issue is that `process_data` uses `dict.items()` instead of iterating
-    over the sorted keys, which causes non-deterministic output.
+    Example:
+    The `process_data` function uses `dict.items()` instead of iterating over sorted keys, causing non-deterministic output.
 
     ```python
     def process_data(data):
         for key in sorted(data.keys()):
             yield key, data[key]
     ```
-
-    This ensures consistent ordering regardless of insertion order.
 """)
 
 
@@ -75,7 +78,7 @@ def _generate_explanation(
                     {"role": "user", "content": user_msg},
                 ],
                 temperature=LLM_TEMPERATURE,
-                max_tokens=2000,
+                max_tokens=200,
             )
             result = response.choices[0].message.content.strip()
             # Verify the code is actually in the response
@@ -94,6 +97,47 @@ def _generate_explanation(
     return None
 
 
+async def _generate_explanation_async(
+    aclient: AsyncOpenAI, model: str, code: str, query: str, context: str
+) -> str | None:
+    """Async twin of _generate_explanation: LLM-wrapped answer, or None once retries run out."""
+    user_msg = f"""User's question: {query}
+
+Context (relevant source code):
+{context[:3000]}
+
+Correct code fix:
+```python
+{code}
+```
+
+Write a natural AI assistant response that includes this exact code."""
+
+    for attempt in range(MAX_RETRIES):
+        try:
+            response = await aclient.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "system", "content": EXPLANATION_SYSTEM_PROMPT},
+                    {"role": "user", "content": user_msg},
+                ],
+                temperature=LLM_TEMPERATURE,
+                max_tokens=200,  # NOTE(review): caps explanation + code; may truncate
+            )
+            result = response.choices[0].message.content.strip()
+            if code[:50] in result or "```" in result:  # loose check: code prefix or any fence
+                return result
+            if attempt < MAX_RETRIES - 1:
+                continue
+            return None
+        except Exception:  # best-effort: any error is retried, then reported as None
+            if attempt < MAX_RETRIES - 1:
+                await asyncio.sleep(RETRY_DELAY * (attempt + 1))  # linear backoff
+            else:
+                return None
+    return None  # only reached when MAX_RETRIES <= 0
+
+
 def assign_format(source_data: dict) -> tuple[str, str]:
     """Assign a format type and build the answer for an instance.
 
@@ -169,7 +213,8 @@ def run(
 ):
     """Run Phase 5: Assign formats and build answers.
 
-    Returns list of dicts with instance_id, format_type, answer.
+    Uses async batch processing when BATCH_SIZE > 1 (for local vLLM).
+    Falls back to sequential processing for remote APIs (BATCH_SIZE=1).
     """
     print("=" * 60)
     print("Phase 5: Answer Format Building")
@@ -194,54 +239,104 @@ def run(
 
     to_process = [inst for inst in instances if inst["instance_id"] not in existing]
     print(f"Remaining: {len(to_process)} instances to process")
+    print(f"Batch size: {BATCH_SIZE}")
+
+    # First pass: assign formats for all instances (no LLM needed)
+    # Collect which ones need explanation generation
+    needs_explanation = []  # (instance_id, code, query, context)
+    entries_no_llm = []  # entries that don't need LLM
+
+    for inst in to_process:
+        instance_id = inst["instance_id"]
+
+        cache_path = source_cache_dir / f"{instance_id}.json"
+        if not cache_path.exists():
+            continue
+
+        with open(cache_path) as fp:
+            source_data = json.load(fp)
+
+        fmt, answer = assign_format(source_data)
+        if fmt is None:
+            continue
+
+        if fmt == "code_with_explanation":
+            query = queries.get(instance_id, inst.get("problem_statement", "")[:500])
+            context = source_data.get("patch_code", "")
+            needs_explanation.append((instance_id, answer, query, context, fmt))
+        else:
+            entries_no_llm.append(
+                {
+                    "instance_id": instance_id,
+                    "format_type": fmt,
+                    "answer": answer,
+                }
+            )
 
-    # Only init LLM client if we'll need it (lazy)
-    client = None
-
+    # Write non-LLM entries immediately
     results = list(existing.values())
     format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
     for entry in results:
         fmt = entry.get("format_type")
         if fmt in format_counts:
             format_counts[fmt] += 1
-    skipped = 0
-    explanation_failures = 0
+
     processed = 0
+    explanation_failures = 0
 
     with open(FORMATS_PATH, "a") as f:
-        for inst in to_process:
-            instance_id = inst["instance_id"]
+        for entry in entries_no_llm:
+            f.write(json.dumps(entry) + "\n")
+            results.append(entry)
+            format_counts[entry["format_type"]] += 1
+            processed += 1
+        f.flush()
 
-            # Load source data from cache
-            cache_path = source_cache_dir / f"{instance_id}.json"
-            if not cache_path.exists():
-                skipped += 1
-                continue
+    print(f"  Assigned {len(entries_no_llm)} non-LLM formats")
+    print(f"  Need LLM explanation: {len(needs_explanation)} instances")
 
-            with open(cache_path) as fp:
-                source_data = json.load(fp)
+    # Second pass: generate explanations (batched or sequential)
+    if needs_explanation:
+        if BATCH_SIZE > 1:
+            explanation_failures = _run_explanations_batched(
+                needs_explanation, format_counts, results, api_key, base_url, model
+            )
+        else:
+            explanation_failures = _run_explanations_sequential(
+                needs_explanation, format_counts, results, api_key, base_url, model
+            )
 
-            fmt, answer = assign_format(source_data)
-            if fmt is None:
-                skipped += 1
-                continue
+    processed += len(needs_explanation)
+
+    print(f"\nAssigned formats for {len(results)} instances")
+    if explanation_failures:
+        print(f"  Explanation generation failures (fell back to fragment): {explanation_failures}")
+    for fmt, count in format_counts.items():
+        pct = count * 100 // max(len(results), 1)
+        print(f"  {fmt}: {count} ({pct}%)")
 
-            # Generate explanation wrapper for code_with_explanation format
-            if fmt == "code_with_explanation":
-                if client is None:
-                    client = OpenAI(api_key=api_key, base_url=base_url)
-                    print(f"  LLM client initialized for code_with_explanation ({base_url})")
+    return results
 
-                query = queries.get(instance_id, inst.get("problem_statement", "")[:500])
-                context = source_data.get("patch_code", "")
-                explained = _generate_explanation(client, model, answer, query, context)
 
-                if explained is None:
-                    # Fallback: use raw code as fragment
-                    fmt = "fragment"
-                    explanation_failures += 1
-                else:
-                    answer = explained
+def _run_explanations_sequential(
+    needs_explanation, format_counts, results, api_key, base_url, model
+):
+    """Generate explanations sequentially (for remote APIs)."""
+    client = OpenAI(api_key=api_key, base_url=base_url)
+    explanation_failures = 0
+    processed = 0
+
+    with open(FORMATS_PATH, "a") as f:
+        for instance_id, code, query, context, _ in needs_explanation:
+            explained = _generate_explanation(client, model, code, query, context)
+
+            if explained is None:
+                fmt = "fragment"
+                answer = code
+                explanation_failures += 1
+            else:
+                fmt = "code_with_explanation"
+                answer = explained
 
             entry = {
                 "instance_id": instance_id,
@@ -256,17 +351,61 @@ def run(
 
             if processed % 100 == 0:
                 print(
-                    f"  Phase 5: {processed}/{len(to_process)} (failures: {explanation_failures})"
+                    f"  Phase 5 (explanations): {processed}/{len(needs_explanation)} "
+                    f"(failures: {explanation_failures})"
                 )
 
-    print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})")
-    if explanation_failures:
-        print(f"  Explanation generation failures (fell back to fragment): {explanation_failures}")
-    for fmt, count in format_counts.items():
-        pct = count * 100 // max(len(results), 1)
-        print(f"  {fmt}: {count} ({pct}%)")
+    return explanation_failures
 
-    return results
+
+def _run_explanations_batched(needs_explanation, format_counts, results, api_key, base_url, model):
+    """Generate explanations in async batches (local vLLM); returns the failure count."""
+    aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
+    explanation_failures = 0
+    processed = 0
+
+    async def process_batches():
+        nonlocal explanation_failures, processed
+
+        with open(FORMATS_PATH, "a") as f:  # append so earlier results are kept
+            for batch_start in range(0, len(needs_explanation), BATCH_SIZE):
+                batch = needs_explanation[batch_start : batch_start + BATCH_SIZE]
+
+                tasks = []
+                for instance_id, code, query, context, _ in batch:
+                    tasks.append(_generate_explanation_async(aclient, model, code, query, context))
+
+                batch_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                for (instance_id, code, query, context, _), explained in zip(batch, batch_results):
+                    if isinstance(explained, Exception) or explained is None:
+                        fmt = "fragment"  # fall back to the bare code as the answer
+                        answer = code
+                        explanation_failures += 1
+                    else:
+                        fmt = "code_with_explanation"
+                        answer = explained
+
+                    entry = {
+                        "instance_id": instance_id,
+                        "format_type": fmt,
+                        "answer": answer,
+                    }
+                    f.write(json.dumps(entry) + "\n")
+                    results.append(entry)  # caller's list, extended in place
+                    format_counts[fmt] += 1  # caller's dict, updated in place
+                    processed += 1
+
+                f.flush()  # flush once per finished batch rather than per entry
+
+                if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(needs_explanation):
+                    print(  # NOTE(review): checked once per batch, not per entry
+                        f"  Phase 5 (explanations): {processed}/{len(needs_explanation)} "
+                        f"(failures: {explanation_failures})"
+                    )
+
+    asyncio.run(process_batches())  # blocks until every batch has been written
+    return explanation_failures
 
 
 if __name__ == "__main__":
diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py
index f7bd6d5..58ff0d4 100644
--- a/scripts/code_hallucination/hallucination_injector.py
+++ b/scripts/code_hallucination/hallucination_injector.py
@@ -19,6 +19,7 @@
     HALLUCINATED_PATH,
     HALLUCINATION_TEMPERATURE,
     HALLUCINATION_TYPES,
+    MAX_PROMPT_CHARS,
     MAX_RETRIES,
     MODEL,
     RETRY_DELAY,
@@ -81,6 +82,20 @@
 """)
 
 
+def build_source_context(source_data: dict) -> str:
+    """Render the cached source files as one prompt-ready context string.
+
+    Each entry of ``source_data["source_files"]`` becomes a ``File: <path>``
+    header followed by a fenced ``python`` block; entries are separated by a
+    blank line. The joined text is cut to its first ``MAX_PROMPT_CHARS``
+    characters, so the last fence may be left unterminated.
+    """
+    source_files = source_data.get("source_files", {})
+    rendered = (f"File: {path}\n```python\n{body}\n```" for path, body in source_files.items())
+    # Slicing is a no-op when the text is already within the budget.
+    return "\n\n".join(rendered)[:MAX_PROMPT_CHARS]
+
+
 def inject_hallucination(
     client: OpenAI,
     model: str,
@@ -318,6 +333,7 @@ def run(
     formats: dict[str, dict],
     queries: dict[str, str],
     docs: dict[str, dict] | None = None,
+    source_cache: dict[str, dict] | None = None,
     api_key: str = API_KEY,
     base_url: str = API_BASE_URL,
     model: str = MODEL,
@@ -333,6 +349,8 @@ def run(
 
     if docs is None:
         docs = {}
+    if source_cache is None:
+        source_cache = {}
 
     HALLUCINATED_PATH.parent.mkdir(parents=True, exist_ok=True)
 
@@ -350,9 +368,13 @@ def run(
     print(f"Remaining: {len(to_process)} instances to inject")
 
     if BATCH_SIZE > 1:
-        results = _run_batched(to_process, formats, queries, docs, api_key, base_url, model)
+        results = _run_batched(
+            to_process, formats, queries, docs, source_cache, api_key, base_url, model
+        )
     else:
-        results = _run_sequential(to_process, formats, queries, docs, api_key, base_url, model)
+        results = _run_sequential(
+            to_process, formats, queries, docs, source_cache, api_key, base_url, model
+        )
 
     # Stats
     type_counts = {}
@@ -372,7 +394,7 @@ def run(
     return results
 
 
-def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model):
+def _run_sequential(to_process, formats, queries, docs, source_cache, api_key, base_url, model):
     """Sequential processing for remote APIs (rate-limited)."""
     client = OpenAI(api_key=api_key, base_url=base_url)
     processed = 0
@@ -391,7 +413,12 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model
 
             hall_type = HALLUCINATION_TYPES[i % len(HALLUCINATION_TYPES)]
             query = queries.get(instance_id, "")
-            context = inst.get("problem_statement", "")
+            source_data = source_cache.get(instance_id, {})
+            context = (
+                build_source_context(source_data)
+                if source_data
+                else inst.get("problem_statement", "")
+            )
             instance_docs = docs.get(instance_id, {})
 
             # Try injection with up to 2 quality retries
@@ -428,7 +455,7 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model
     return results
 
 
-def _run_batched(to_process, formats, queries, docs, api_key, base_url, model):
+def _run_batched(to_process, formats, queries, docs, source_cache, api_key, base_url, model):
     """Async batch processing for local vLLM (no rate limiting needed)."""
     aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
     processed = 0
@@ -457,7 +484,12 @@ async def process_batches():
 
                     hall_type = HALLUCINATION_TYPES[global_idx % len(HALLUCINATION_TYPES)]
                     query = queries.get(instance_id, "")
-                    context = inst.get("problem_statement", "")
+                    source_data = source_cache.get(instance_id, {})
+                    context = (
+                        build_source_context(source_data)
+                        if source_data
+                        else inst.get("problem_statement", "")
+                    )
                     instance_docs = docs.get(instance_id, {})
 
                     tasks.append(
diff --git a/scripts/code_hallucination/pipeline.py b/scripts/code_hallucination/pipeline.py
index ea7fc23..59d8d26 100644
--- a/scripts/code_hallucination/pipeline.py
+++ b/scripts/code_hallucination/pipeline.py
@@ -29,9 +29,21 @@
     HALLUCINATED_PATH,
     MODEL,
     QUERIES_PATH,
+    SOURCE_CACHE_DIR,
 )
 
 
+def load_source_cache(instance_ids: list[str]) -> dict[str, dict]:
+    """Read ``SOURCE_CACHE_DIR/<id>.json`` per ID; IDs with no cache file are omitted."""
+    loaded = {}
+    paths = {iid: SOURCE_CACHE_DIR / f"{iid}.json" for iid in instance_ids}
+    for iid, path in paths.items():
+        if path.exists():
+            with open(path) as handle:
+                loaded[iid] = json.load(handle)
+    return loaded
+
+
 def load_jsonl_dict(path, key="instance_id", value_key=None) -> dict:
     """Load a JSONL file into a dict keyed by instance_id."""
     result = {}
@@ -110,8 +122,16 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m
     formats = load_jsonl_dict(FORMATS_PATH)
     docs = load_jsonl_dict(DOCS_PATH, value_key="docs")
     to_inject = [i for i in selected if i["instance_id"] in targets]
+    sc = load_source_cache([i["instance_id"] for i in to_inject])
     run_inject(
-        to_inject, formats, queries_dict, docs=docs, api_key=api_key, base_url=base_url, model=model
+        to_inject,
+        formats,
+        queries_dict,
+        docs=docs,
+        source_cache=sc,
+        api_key=api_key,
+        base_url=base_url,
+        model=model,
     )
 
     # Phase 7: Assemble
@@ -210,11 +230,13 @@ def main():
             docs = load_jsonl_dict(DOCS_PATH, value_key="docs")
             targets = select_hallucination_targets(instances)
             to_inject = [i for i in instances if i["instance_id"] in targets]
+            sc = load_source_cache([i["instance_id"] for i in to_inject])
             run(
                 to_inject,
                 formats,
                 queries,
                 docs=docs,
+                source_cache=sc,
                 api_key=args.api_key,
                 base_url=args.base_url,
                 model=args.model,
diff --git a/scripts/code_hallucination/sample_assembler.py b/scripts/code_hallucination/sample_assembler.py
index bdcbd61..7bf6157 100644
--- a/scripts/code_hallucination/sample_assembler.py
+++ b/scripts/code_hallucination/sample_assembler.py
@@ -2,7 +2,7 @@
 
 import json
 
-from .config import DATASET_PATH, METADATA_PATH, SOURCE_CACHE_DIR
+from .config import DATASET_PATH, MAX_PROMPT_CHARS, METADATA_PATH, SOURCE_CACHE_DIR
 
 
 def build_prompt(
@@ -24,7 +24,10 @@ def build_prompt(
 
     parts.append(f"User request: {user_query}")
 
-    return "\n\n".join(parts)
+    prompt = "\n\n".join(parts)
+    if len(prompt) > MAX_PROMPT_CHARS:
+        prompt = prompt[:MAX_PROMPT_CHARS]
+    return prompt
 
 
 def assemble_samples(

From 406ed946acd472b2002cacbc40d5ff565af05f74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Fri, 6 Mar 2026 10:54:42 +0100
Subject: [PATCH 7/9] Ground hallucinations

---
 .../hallucination_injector.py                 | 41 ++++++++++---------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py
index 58ff0d4..46d676d 100644
--- a/scripts/code_hallucination/hallucination_injector.py
+++ b/scripts/code_hallucination/hallucination_injector.py
@@ -29,24 +29,30 @@
     You are a code hallucination injector for building a hallucination detection dataset.
 
     Given a correct answer (which may be pure code OR code with natural language explanation)
-    and context, create a hallucinated version with specific types of errors.
+    and SOURCE CODE CONTEXT, create a hallucinated version with specific types of errors.
+
+    CRITICAL: Every error you inject MUST BE DETECTABLE by comparing the answer against
+    the provided source code context AND/OR the user's request. A human reading the
+    source files and user query must be able to spot that the hallucinated part
+    contradicts what's in the source or what the user asked for. Do NOT inject errors
+    that require running the code or external knowledge to detect.
 
     Hallucination types:
-    - STRUCTURAL: Change a function call, import, or parameter to something that
-      doesn't exist or is wrong. Code should still parse but reference non-existent
-      APIs, wrong methods, or invented parameters.
-    - BEHAVIORAL: Use correct APIs but with wrong values or logic. Wrong defaults,
-      off-by-one errors, swapped conditions, wrong argument values.
-    - SEMANTIC: Code that looks like it addresses the user's request but does
-      something subtly different or opposite. The code parses, uses real APIs,
-      but fails to do what was asked. If library documentation is provided,
-      you can make the code contradict the documented API (wrong parameter names,
-      wrong return types, deprecated usage, etc.).
-      For answers with explanations, you may also make the explanation contradict
-      the code or describe incorrect behavior.
+    - STRUCTURAL: Change a function/method/class name, import, or parameter to something
+      that does NOT exist in the source context. For example, rename a method call to one
+      that isn't defined in the provided source files, or add a parameter that the function
+      doesn't accept according to the source.
+    - BEHAVIORAL: Use correct API names from the source but with wrong values or logic
+      that contradicts the source. Wrong default values (different from source), swapped
+      conditions, wrong argument order compared to the function signature in source.
+    - SEMANTIC: Code that contradicts the source's behavior, the user's request, or the
+      explanation contradicts what the source code actually does. For example: claim a
+      function returns X when the source shows it returns Y, describe wrong control flow,
+      or solve a different problem than what the user asked for.
 
     Rules:
     - Make 2-3 DISTINCT changes spread across different parts of the answer
+    - Each change MUST contradict something visible in the source code or user request
     - Each changed span must be 20-150 characters long (not too short, not too long)
     - Total hallucinated text must be LESS THAN 40% of the original answer length
     - Keep most of the answer CORRECT — do NOT rewrite the entire thing
@@ -54,11 +60,7 @@
     - Make changes PLAUSIBLE — something an LLM would realistically generate
     - Changes must be SUBTLE, not obviously broken
     - The code in the hallucinated answer must still be syntactically valid
-    - Do NOT add comments explaining or hinting at the hallucination (no "# wrong",
-      "# error", "# typo", "# nonexistent", etc.) — the errors must be invisible
-      to someone skimming the answer
-    - If the answer contains both code and explanation, inject errors in BOTH parts
-      (e.g. wrong API in code + misleading description in text)
+    - Do NOT add comments explaining or hinting at the hallucination
     - Preserve the overall structure: keep markdown formatting, code blocks, etc.
 
     Respond in this exact JSON format (no markdown, no code blocks):
@@ -68,7 +70,7 @@
             {
                 "original": "exact original text that was changed",
                 "hallucinated": "what you changed it to",
-                "explanation": "why this is a hallucination"
+                "explanation": "why this is wrong — what does the source code or user request actually say?"
             }
         ]
     }
@@ -78,6 +80,7 @@
     - "original" must be an exact substring of the correct answer
     - "hallucinated" must be an exact substring of your hallucinated answer
     - Each "hallucinated" value must be at least 20 characters long
+    - Each "explanation" must reference what the source code or user request actually says
     - Return ONLY valid JSON, nothing else
 """)
 

From e40b072f373cfa85dc4838bef58d2bd80695633b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Sat, 7 Mar 2026 14:33:12 +0100
Subject: [PATCH 8/9] Strengthen prompts for code hallucination

---
 scripts/code_hallucination/config.py          |  44 ++-
 scripts/code_hallucination/format_builder.py  |   5 +-
 .../hallucination_injector.py                 | 372 ++++++++++++++----
 scripts/code_hallucination/pipeline.py        |  87 ++--
 scripts/code_hallucination/query_rewriter.py  |   3 +-
 .../code_hallucination/sample_assembler.py    |   6 +
 scripts/code_hallucination/source_fetcher.py  |  80 +++-
 scripts/code_hallucination/validator.py       | 167 +++++++-
 8 files changed, 633 insertions(+), 131 deletions(-)

diff --git a/scripts/code_hallucination/config.py b/scripts/code_hallucination/config.py
index 85bfed4..bb1c7d0 100644
--- a/scripts/code_hallucination/config.py
+++ b/scripts/code_hallucination/config.py
@@ -5,7 +5,8 @@
 
 # === Paths ===
 PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
-DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination"
+DEFAULT_DATA_DIR = PROJECT_ROOT / "data" / "code_hallucination"
+DATA_DIR = Path(os.environ.get("CODE_HALLUCINATION_OUTPUT_DIR", str(DEFAULT_DATA_DIR)))
 REPOS_DIR = DATA_DIR / "repos"
 SOURCE_CACHE_DIR = DATA_DIR / "source_cache"
 
@@ -21,6 +22,37 @@
 METADATA_PATH = DATA_DIR / "code_hallucination_metadata.json"
 VALIDATION_REPORT_PATH = DATA_DIR / "validation_report.txt"
 
+
+def set_output_dir(path: str | os.PathLike[str]) -> Path:
+    """Point every pipeline path constant in this module at ``path``.
+
+    Rebinds ``DATA_DIR`` and each path derived from it, exports the new
+    location through ``CODE_HALLUCINATION_OUTPUT_DIR``, and returns the new
+    ``DATA_DIR``.
+
+    NOTE(review): only this module's globals are rebound; a module that already
+    ran ``from .config import SOME_PATH`` keeps its earlier value — confirm
+    callers invoke this before such imports are evaluated.
+    """
+    global DATA_DIR, REPOS_DIR, SOURCE_CACHE_DIR, INSTANCES_PATH, QUERIES_PATH, DOCS_PATH
+    global FORMATS_PATH, HALLUCINATED_PATH, DATASET_PATH, METADATA_PATH, VALIDATION_REPORT_PATH
+
+    root = Path(path)
+    DATA_DIR = root
+    REPOS_DIR = root / "repos"
+    SOURCE_CACHE_DIR = root / "source_cache"
+    INSTANCES_PATH = root / "swebench_instances.json"
+    QUERIES_PATH = root / "queries.jsonl"
+    DOCS_PATH = root / "documentation.jsonl"
+    FORMATS_PATH = root / "formats.jsonl"
+    HALLUCINATED_PATH = root / "hallucinated_samples.jsonl"
+    DATASET_PATH = root / "code_hallucination_data.json"
+    METADATA_PATH = root / "code_hallucination_metadata.json"
+    VALIDATION_REPORT_PATH = root / "validation_report.txt"
+    os.environ["CODE_HALLUCINATION_OUTPUT_DIR"] = str(root)
+    return root
+
+
 # === LLM API Config ===
 # Override via env vars or CLI args
 API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
@@ -55,3 +87,13 @@
 # SWE-bench datasets
 SWEBENCH_FULL = "princeton-nlp/SWE-bench"
 SWEBENCH_LITE = "princeton-nlp/SWE-bench_Lite"
+
+# Models that require max_completion_tokens instead of max_tokens
+_REASONING_MODEL_PREFIXES = ("o1", "o3", "o4", "gpt-5")
+
+
+def token_limit_kwargs(model: str, max_tokens: int = 4000) -> dict:
+    """Build the token-cap kwargs for ``model``; reasoning models also get reasoning_effort."""
+    if not model.startswith(_REASONING_MODEL_PREFIXES):
+        return {"max_tokens": max_tokens}
+    return {"max_completion_tokens": max_tokens, "reasoning_effort": "none"}
diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py
index 0f0a660..60f912e 100644
--- a/scripts/code_hallucination/format_builder.py
+++ b/scripts/code_hallucination/format_builder.py
@@ -24,6 +24,7 @@
     MODEL,
     RETRY_DELAY,
     SOURCE_CACHE_DIR,
+    token_limit_kwargs,
 )
 
 EXPLANATION_SYSTEM_PROMPT = textwrap.dedent("""\
@@ -78,7 +79,7 @@ def _generate_explanation(
                     {"role": "user", "content": user_msg},
                 ],
                 temperature=LLM_TEMPERATURE,
-                max_tokens=200,
+                **token_limit_kwargs(model, 200),
             )
             result = response.choices[0].message.content.strip()
             # Verify the code is actually in the response
@@ -122,7 +123,7 @@ async def _generate_explanation_async(
                     {"role": "user", "content": user_msg},
                 ],
                 temperature=LLM_TEMPERATURE,
-                max_tokens=200,
+                **token_limit_kwargs(model, 200),
             )
             result = response.choices[0].message.content.strip()
             if code[:50] in result or "```" in result:
diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py
index 46d676d..a1d470f 100644
--- a/scripts/code_hallucination/hallucination_injector.py
+++ b/scripts/code_hallucination/hallucination_injector.py
@@ -23,67 +23,119 @@
     MAX_RETRIES,
     MODEL,
     RETRY_DELAY,
+    token_limit_kwargs,
 )
 
 INJECTION_SYSTEM_PROMPT = textwrap.dedent("""\
     You are a code hallucination injector for building a hallucination detection dataset.
 
     Given a correct answer (which may be pure code OR code with natural language explanation)
-    and SOURCE CODE CONTEXT, create a hallucinated version with specific types of errors.
-
-    CRITICAL: Every error you inject MUST BE DETECTABLE by comparing the answer against
-    the provided source code context AND/OR the user's request. A human reading the
-    source files and user query must be able to spot that the hallucinated part
-    contradicts what's in the source or what the user asked for. Do NOT inject errors
-    that require running the code or external knowledge to detect.
+    and SOURCE CODE CONTEXT, return ONLY a small set of localized replacement edits that will
+    turn the answer into a hallucinated answer.
+
+    IMPORTANT: You are NOT allowed to rewrite the full answer.
+    - Return replacement edits only.
+    - The pipeline will apply those edits to the original answer.
+    - Outside the returned edits, the answer must remain unchanged.
+
+    IMPORTANT: Only inject hallucinations into CODE portions of the answer.
+    - If the answer contains markdown code fences, edits must be inside the fenced code block(s).
+    - Do NOT modify natural language explanations before or after the code block.
+    - Do NOT add explanatory comments inside code.
+    - The explanation text must remain correct and neutral; only the code should be wrong.
+
+    CRITICAL RULES FOR GROUNDING:
+    - Every error you inject MUST BE DETECTABLE by comparing the answer against
+      the provided source code context AND/OR the user's request.
+    - ONLY reference functions, methods, classes, variables, and parameters that
+      appear in the PROVIDED source context. Do NOT use your own knowledge of the
+      library — pretend you only know what's in the context.
+    - A human reading ONLY the source files and user query must be able to spot
+      that the hallucinated part is wrong. If they can't, the hallucination is useless.
+    - Do NOT inject errors that require running code, external docs, or knowledge
+      beyond what's in the provided context to detect.
 
     Hallucination types:
-    - STRUCTURAL: Change a function/method/class name, import, or parameter to something
-      that does NOT exist in the source context. For example, rename a method call to one
-      that isn't defined in the provided source files, or add a parameter that the function
-      doesn't accept according to the source.
-    - BEHAVIORAL: Use correct API names from the source but with wrong values or logic
-      that contradicts the source. Wrong default values (different from source), swapped
-      conditions, wrong argument order compared to the function signature in source.
-    - SEMANTIC: Code that contradicts the source's behavior, the user's request, or the
-      explanation contradicts what the source code actually does. For example: claim a
-      function returns X when the source shows it returns Y, describe wrong control flow,
-      or solve a different problem than what the user asked for.
+    - STRUCTURAL: Change a function/method/class name to something that does NOT
+      appear anywhere in the provided source context.
+    - BEHAVIORAL: Use correct names from the source but with wrong values or logic
+      that visibly contradicts the source.
+    - SEMANTIC: Make the CODE solve a different problem than the user asked for, or
+      make the code behave differently than what the source context shows.
 
     Rules:
-    - Make 2-3 DISTINCT changes spread across different parts of the answer
-    - Each change MUST contradict something visible in the source code or user request
-    - Each changed span must be 20-150 characters long (not too short, not too long)
-    - Total hallucinated text must be LESS THAN 40% of the original answer length
+    - Make 1-3 DISTINCT replacement edits spread across different parts of the answer
+    - Each edit MUST contradict something VISIBLE in the provided source code or user request
+    - Do NOT reference functions/classes/methods not present in the provided context
+    - Do NOT make any unlabeled edits outside the returned replacement edits
+    - Each replacement span must be 12-120 characters long and as small as possible
+    - Total hallucinated text must be LESS THAN 30% of the original answer length
     - Keep most of the answer CORRECT — do NOT rewrite the entire thing
-    - Changes should be in different functions/blocks/paragraphs, not adjacent lines
+    - Changes should be in different functions/blocks, not adjacent lines
     - Make changes PLAUSIBLE — something an LLM would realistically generate
     - Changes must be SUBTLE, not obviously broken
-    - The code in the hallucinated answer must still be syntactically valid
+    - The edited code must still be syntactically valid
     - Do NOT add comments explaining or hinting at the hallucination
-    - Preserve the overall structure: keep markdown formatting, code blocks, etc.
+    - Do NOT add words like BUG, wrong, incorrect, deprecated, hallucination, fix, helper
+    - Do NOT include editorial text that describes the mistake inside the answer itself
+    - Preserve the overall structure: keep markdown formatting, code blocks, indentation, imports, and surrounding text unchanged
+    - Do NOT add or remove markdown fences
+    - Do NOT add explanation text, tutorial text, wrapper text, or placeholder text
+    - Do NOT add imports, helper functions, or surrounding code
+    - Prefer changing existing lines over insertions or deletions
+    - Each edit must replace an existing substring of the original answer; no insert-only edits
 
     Respond in this exact JSON format (no markdown, no code blocks):
     {
-        "hallucinated_code": "the full modified answer with hallucinations injected",
         "changes": [
             {
-                "original": "exact original text that was changed",
-                "hallucinated": "what you changed it to",
-                "explanation": "why this is wrong — what does the source code or user request actually say?"
+                "original": "exact original substring from the correct answer",
+                "hallucinated": "replacement text for that substring",
+                "left_context": "up to 40 exact characters immediately before the original substring in the correct answer",
+                "right_context": "up to 40 exact characters immediately after the original substring in the correct answer",
+                "target_zone": "code",
+                "explanation": "why this replacement is wrong according to the source code or user request"
             }
         ]
     }
 
     IMPORTANT:
-    - You MUST include 2-3 changes in the "changes" array
-    - "original" must be an exact substring of the correct answer
-    - "hallucinated" must be an exact substring of your hallucinated answer
-    - Each "hallucinated" value must be at least 20 characters long
+    - You MUST include 1-3 changes in the "changes" array
+    - The returned changes must be sufficient to construct the full hallucinated answer
+    - "original" must be a non-empty exact substring of the correct answer
+    - Before returning, verify that each "original" substring appears verbatim in the provided correct answer
+    - Prefer substrings that appear exactly once in the correct answer
+    - If a substring appears multiple times, use left_context and right_context that disambiguate a single occurrence
+    - "hallucinated" is the exact replacement text for that substring
+    - "left_context" and "right_context" must come from the original correct answer, not a rewritten one
+    - "target_zone" must always be "code"
     - Each "explanation" must reference what the source code or user request actually says
+    - If you cannot find 1-3 exact editable substrings in the provided answer, return {"changes": []}
     - Return ONLY valid JSON, nothing else
 """)
 
+LEAKY_TERMS = (
+    "bug",
+    "wrong",
+    "incorrect",
+    "incorrectly",
+    "deprecated",
+    "hallucination",
+    "helper method",
+    "should be replaced",
+)
+PROMPT_RESIDUE = (
+    "Generate a hallucinated version",
+    "Return JSON only",
+    "hallucinated_code",
+    "target_zone",
+    "left_context",
+    "right_context",
+)
+MAX_LABEL_COVERAGE = 0.30
+MAX_LABEL_SPAN_CHARS = 500
+MIN_LABEL_SPAN_CHARS = 12
+
 
 def build_source_context(source_data: dict) -> str:
     """Build source code context string from cached source data.
@@ -108,10 +160,7 @@ def inject_hallucination(
     context: str = "",
     documentation: dict[str, str] | None = None,
 ) -> dict | None:
-    """Inject a hallucination and get back structured JSON with spans.
-
-    Returns dict with 'hallucinated_code' and 'changes', or None if failed.
-    """
+    """Request structured replacement edits for hallucination injection."""
     docs_section = ""
     if documentation:
         docs_parts = [f"Documentation for {lib}:\n{doc}" for lib, doc in documentation.items()]
@@ -127,10 +176,10 @@ def inject_hallucination(
 Context (source code):
 {context}{docs_section}
 
-Correct code to modify:
+Correct answer to modify:
 {clean_answer}
 
-Generate a hallucinated version with {hall_type} error(s). Return JSON only."""
+Return ONLY replacement edits for {hall_type} error(s). Do not return the full rewritten answer."""
 
     for attempt in range(MAX_RETRIES):
         try:
@@ -141,7 +190,7 @@ def inject_hallucination(
                     {"role": "user", "content": user_msg},
                 ],
                 temperature=HALLUCINATION_TEMPERATURE,
-                max_tokens=4000,
+                **token_limit_kwargs(model),
             )
             raw = response.choices[0].message.content.strip()
 
@@ -154,13 +203,11 @@ def inject_hallucination(
 
             result = json.loads(json_match.group())
 
-            if "hallucinated_code" not in result or "changes" not in result:
+            if "changes" not in result or not isinstance(result["changes"], list):
                 if attempt < MAX_RETRIES - 1:
                     continue
                 return None
-
-            # Verify the hallucinated code is actually different
-            if result["hallucinated_code"].strip() == clean_answer.strip():
+            if not result["changes"]:
                 if attempt < MAX_RETRIES - 1:
                     continue
                 return None
@@ -176,41 +223,159 @@ def inject_hallucination(
                 return None
 
 
-def compute_span_offsets(hallucinated_code: str, hallucinated_span: str) -> list[dict]:
-    """Find character offsets of a hallucinated span within the answer code."""
-    spans = []
-    idx = hallucinated_code.find(hallucinated_span)
-    if idx != -1:
-        spans.append({"start": idx, "end": idx + len(hallucinated_span)})
-    return spans
+def _find_all_occurrences(text: str, pattern: str) -> list[dict]:
+    """List ``{"start", "end"}`` offsets of every occurrence of ``pattern`` in ``text``.
+
+    Occurrences may overlap (the scan resumes one character after each hit),
+    ``end`` is exclusive, and an empty ``pattern`` yields no matches.
+    """
+    width = len(pattern)
+    hits: list[dict] = []
+    pos = text.find(pattern) if pattern else -1
+    while pos != -1:
+        hits.append({"start": pos, "end": pos + width})
+        pos = text.find(pattern, pos + 1)
+    return hits
+
 
+def _truncate_context(text: str, max_chars: int = 40) -> str:
+    """Return ``text`` if it is at most ``max_chars`` long, otherwise ``text[-max_chars:]``."""
+    overflow = len(text) > max_chars
+    tail = text[-max_chars:] if overflow else text
+    return tail
 
-def build_labels_from_changes(
-    hallucinated_code: str, changes: list[dict], hall_type: str
-) -> list[dict]:
-    """Build span labels by finding each hallucinated string in the code.
 
-    Only includes spans where the hallucinated text is actually found in the answer.
+def _extract_code_regions(answer: str) -> list[tuple[int, int]]:
+    """Return ``(start, end)`` offsets of the text inside each markdown fenced block.
+
+    When no complete fenced block is found, the whole answer counts as one region.
     """
-    labels = []
+    fence = "```"
+    regions: list[tuple[int, int]] = []
+    cursor = 0
+    while (opening := answer.find(fence, cursor)) != -1:
+        # The fence line (e.g. "```python") ends at the next newline.
+        newline = answer.find("\n", opening + len(fence))
+        if newline == -1:
+            break
+        body_start = newline + 1
+        closing = answer.find(fence, body_start)
+        if closing == -1:
+            # Unterminated block: stop scanning.
+            break
+        regions.append((body_start, closing))
+        cursor = closing + len(fence)
+    # Without any complete fenced block the answer is assumed to be pure code,
+    # so a single region spanning all of it is returned.
+    return regions or [(0, len(answer))]
+
+
+def _span_is_in_code(answer: str, start: int, end: int) -> bool:
+    """Report whether the span ``start``..``end`` lies inside one code region of ``answer``."""
+    return any(
+        region_start <= start and end <= region_end
+        for region_start, region_end in _extract_code_regions(answer)
+    )
+
+
+def _contains_leakage(text: str) -> bool:
+    """Detect giveaway terms in a span; hits preceded by a letter ("debug") do not count."""
+    lowered = text.lower()
+    return any(re.search(rf"(?<![a-z]){re.escape(term)}", lowered) for term in LEAKY_TERMS)
+
+
+def _max_allowed_coverage(answer_len: int) -> float:
+    """Return the label-coverage cap for an answer of ``answer_len`` characters."""
+    # Short answers get looser caps; anything above 800 uses the module default.
+    for limit, cap in ((400, 0.40), (800, 0.35)):
+        if answer_len <= limit:
+            return cap
+    return MAX_LABEL_COVERAGE
+
+
+def _locate_original_change(original_answer: str, change: dict) -> dict | None:
+    """Locate the single occurrence of an edit's ``original`` text in the answer.
+
+    ``left_context`` / ``right_context`` (the text expected right before / after
+    the span) narrow multiple occurrences down; if no occurrence satisfies them,
+    they are ignored and every occurrence stays a candidate.
+
+    :param original_answer: The unmodified answer the edit refers to.
+    :param change: One edit dict with ``original``, ``hallucinated`` and optional
+        ``left_context``, ``right_context`` and ``target_zone`` keys.
+    :return: ``{"start", "end", "original", "hallucinated"}`` for the located
+        span, or ``None`` if a required field is empty, ``target_zone`` is not
+        ``"code"``, or the span is missing or still ambiguous.
+    """
+    original_span = change.get("original", "")
+    hallucinated_span = change.get("hallucinated", "")
+    if not original_span or not hallucinated_span:
+        return None
+    if change.get("target_zone") not in (None, "code"):
+        return None
+
+    offsets = _find_all_occurrences(original_answer, original_span)
+    # Left context keeps the characters nearest the span, i.e. its tail ...
+    left_context = _truncate_context(change.get("left_context") or "")
+    # ... while right context must keep its head; keeping the tail (as the left
+    # side does) would compare the wrong characters for contexts over 40 chars.
+    right_context = (change.get("right_context") or "")[:40]
+    filtered = [
+        offset
+        for offset in offsets
+        if original_answer.endswith(left_context, 0, offset["start"])
+        and original_answer.startswith(right_context, offset["end"])
+    ]
+
+    matches = filtered or offsets
+    if len(matches) != 1:
+        return None
+    return {**matches[0], "original": original_span, "hallucinated": hallucinated_span}
+
+
+def apply_changes_to_answer(
+    original_answer: str, changes: list[dict], hall_type: str
+) -> tuple[str, list[dict]] | tuple[None, None]:
+    """Apply replacement edits to the original answer and build span labels.
+
+    :return: ``(hallucinated_answer, labels)`` with label offsets into the new answer, or
+        ``(None, None)`` if an edit is too short, not uniquely locatable, or overlapping.
+    """
+    located = []
     for change in changes:
-        h_span = change.get("hallucinated", "")
-        if not h_span or len(h_span) < 15:
-            continue
-        if h_span not in hallucinated_code:
-            continue
-
-        offsets = compute_span_offsets(hallucinated_code, h_span)
-        for offset in offsets[:1]:  # First occurrence only
-            labels.append(
-                {
-                    "start": offset["start"],
-                    "end": offset["end"],
-                    "label": hall_type,
-                }
-            )
+        # ``or ""`` keeps a JSON null from crashing len(); it is rejected like a short span.
+        if len(change.get("hallucinated") or "") < MIN_LABEL_SPAN_CHARS:
+            return None, None
+        located_change = _locate_original_change(original_answer, change)
+        if located_change is None:
+            return None, None
+        located.append(located_change)
+
+    # Reject overlapping edits in the original answer.
+    located.sort(key=lambda item: (item["start"], item["end"]))
+    previous_end = -1
+    for item in located:
+        if item["start"] < previous_end:
+            return None, None
+        previous_end = item["end"]
+
+    hallucinated_parts = []
+    labels = []
+    cursor = 0  # next unread offset in the original answer
+    offset = 0  # length of the hallucinated answer built so far
+    for item in located:
+        start, end, hallucinated_span = item["start"], item["end"], item["hallucinated"]
 
-    return labels
+        unchanged = original_answer[cursor:start]
+        label_start = offset + len(unchanged)
+        label_end = label_start + len(hallucinated_span)
+        hallucinated_parts += (unchanged, hallucinated_span)
+        labels.append({"start": label_start, "end": label_end, "label": hall_type})
+        cursor, offset = end, label_end
+
+    hallucinated_parts.append(original_answer[cursor:])
+    hallucinated_answer = "".join(hallucinated_parts)
+    return hallucinated_answer, labels
 
 
 def load_existing_hallucinations(path=HALLUCINATED_PATH) -> dict[str, dict]:
@@ -252,10 +417,10 @@ async def _inject_one_async(
 Context (source code):
 {context}{docs_section}
 
-Correct code to modify:
+Correct answer to modify:
 {clean_answer}
 
-Generate a hallucinated version with {hall_type} error(s). Return JSON only."""
+Return ONLY replacement edits for {hall_type} error(s). Do not return the full rewritten answer."""
 
     for attempt in range(MAX_RETRIES):
         try:
@@ -266,16 +431,16 @@ async def _inject_one_async(
                     {"role": "user", "content": user_msg},
                 ],
                 temperature=HALLUCINATION_TEMPERATURE,
-                max_tokens=4000,
+                **token_limit_kwargs(model),
             )
             raw = response.choices[0].message.content.strip()
             json_match = re.search(r"\{[\s\S]*\}", raw)
             if not json_match:
                 continue
             result = json.loads(json_match.group())
-            if "hallucinated_code" not in result or "changes" not in result:
+            if "changes" not in result or not isinstance(result["changes"], list):
                 continue
-            if result["hallucinated_code"].strip() == clean_answer.strip():
+            if not result["changes"]:
                 continue
             return result
         except Exception:
@@ -286,7 +451,9 @@ async def _inject_one_async(
     return None
 
 
-def _validate_labels(hallucinated_code: str, labels: list[dict]) -> tuple[bool, str]:
+def _validate_labels(
+    original_answer: str, hallucinated_code: str, labels: list[dict], format_type: str
+) -> tuple[bool, str]:
     """Validate that hallucination labels meet quality thresholds.
 
     :return: (is_valid, reason) tuple.
@@ -294,17 +461,46 @@ def _validate_labels(hallucinated_code: str, labels: list[dict]) -> tuple[bool,
     if not labels:
         return False, "no_labels"
 
+    # Reject prompt contamination (LLM leaked its instructions into the answer)
+    for residue in PROMPT_RESIDUE:
+        if residue in hallucinated_code:
+            return False, f"prompt_residue ({residue[:30]})"
+
+    # Reject unbalanced code fences for code_with_explanation
+    if format_type == "code_with_explanation":
+        fence_count = hallucinated_code.count("```")
+        if fence_count % 2 != 0:
+            return False, f"unbalanced_fences ({fence_count})"
+        if fence_count == 0:
+            return False, "no_code_fences"
+
     total_span = sum(lab["end"] - lab["start"] for lab in labels)
     code_len = len(hallucinated_code) if hallucinated_code else 1
     coverage = total_span / code_len
 
-    if coverage > 0.60:
-        return False, f"coverage_too_high ({coverage:.0%})"
+    max_coverage = _max_allowed_coverage(code_len)
+    if coverage > max_coverage:
+        return False, f"coverage_too_high ({coverage:.0%} > {max_coverage:.0%})"
 
+    previous_end = -1
     for lab in labels:
         span_len = lab["end"] - lab["start"]
-        if span_len < 15:
+        if span_len < MIN_LABEL_SPAN_CHARS:
             return False, f"span_too_short ({span_len} chars)"
+        if span_len > MAX_LABEL_SPAN_CHARS:
+            return False, f"span_too_long ({span_len} chars)"
+        if lab["start"] < previous_end:
+            return False, "overlapping_or_unsorted_labels"
+        previous_end = lab["end"]
+
+        span_text = hallucinated_code[lab["start"] : lab["end"]]
+        if _contains_leakage(span_text):
+            return False, "leaky_label_text"
+
+        if format_type == "code_with_explanation" and not _span_is_in_code(
+            hallucinated_code, lab["start"], lab["end"]
+        ):
+            return False, "label_outside_code_block"
 
     return True, ""
 
@@ -313,11 +509,14 @@ def _process_result(result, instance_id, hall_type, fmt_data, model):
     """Process a single injection result into a JSONL entry."""
     if result is None:
         return None
-    hallucinated_code = result["hallucinated_code"]
+    original_answer = fmt_data.get("answer", "")
     changes = result.get("changes", [])
-    labels = build_labels_from_changes(hallucinated_code, changes, hall_type)
+    hallucinated_code, labels = apply_changes_to_answer(original_answer, changes, hall_type)
+    if hallucinated_code is None or labels is None:
+        return None
+    format_type = fmt_data.get("format_type", "fragment")
 
-    valid, reason = _validate_labels(hallucinated_code, labels)
+    valid, reason = _validate_labels(original_answer, hallucinated_code, labels, format_type)
     if not valid:
         return None
 
@@ -327,7 +526,8 @@ def _process_result(result, instance_id, hall_type, fmt_data, model):
         "labels": labels,
         "hallucination_type": hall_type,
         "injector_model": model,
-        "format_type": fmt_data.get("format_type", "fragment"),
+        "format_type": format_type,
+        "changes": changes,
     }
 
 
diff --git a/scripts/code_hallucination/pipeline.py b/scripts/code_hallucination/pipeline.py
index 59d8d26..aa875d7 100644
--- a/scripts/code_hallucination/pipeline.py
+++ b/scripts/code_hallucination/pipeline.py
@@ -20,24 +20,15 @@
 import json
 import random
 
-from .config import (
-    API_BASE_URL,
-    API_KEY,
-    DATA_DIR,
-    DOCS_PATH,
-    FORMATS_PATH,
-    HALLUCINATED_PATH,
-    MODEL,
-    QUERIES_PATH,
-    SOURCE_CACHE_DIR,
-)
+from . import config
+from .config import API_BASE_URL, API_KEY, MODEL
 
 
 def load_source_cache(instance_ids: list[str]) -> dict[str, dict]:
     """Load source cache for given instance IDs."""
     cache = {}
     for iid in instance_ids:
-        cache_path = SOURCE_CACHE_DIR / f"{iid}.json"
+        cache_path = config.SOURCE_CACHE_DIR / f"{iid}.json"
         if cache_path.exists():
             with open(cache_path) as f:
                 cache[iid] = json.load(f)
@@ -62,6 +53,16 @@ def load_jsonl_dict(path, key="instance_id", value_key=None) -> dict:
     return result
 
 
+def filter_instances_by_splits(instances: list[dict], splits: list[str] | None) -> list[dict]:
+    """Optionally filter instances to a subset of SWE-bench splits."""
+    if not splits:
+        return instances
+    split_set = set(splits)
+    filtered = [inst for inst in instances if inst.get("split") in split_set]
+    print(f"Using splits {sorted(split_set)}: {len(filtered)}/{len(instances)} instances")
+    return filtered
+
+
 def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, model: str = MODEL):
     """Run a quick test with n instances from the test split."""
     print("=" * 60)
@@ -81,8 +82,8 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m
     print(f"Selected {len(selected)} test instances")
 
     # Save temporary instances
-    DATA_DIR.mkdir(parents=True, exist_ok=True)
-    test_path = DATA_DIR / "test_instances.json"
+    config.DATA_DIR.mkdir(parents=True, exist_ok=True)
+    test_path = config.DATA_DIR / "test_instances.json"
     with open(test_path, "w") as f:
         json.dump(selected, f, indent=2)
 
@@ -108,7 +109,7 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m
     # Phase 5: Assign formats (needs LLM for code_with_explanation)
     from .format_builder import run as run_formats
 
-    queries_dict = load_jsonl_dict(QUERIES_PATH, value_key="query")
+    queries_dict = load_jsonl_dict(config.QUERIES_PATH, value_key="query")
     run_formats(selected, api_key=api_key, base_url=base_url, model=model, queries=queries_dict)
 
     # Phase 8: Select targets (before phase 6)
@@ -119,8 +120,8 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m
     # Phase 6: Inject hallucinations
     from .hallucination_injector import run as run_inject
 
-    formats = load_jsonl_dict(FORMATS_PATH)
-    docs = load_jsonl_dict(DOCS_PATH, value_key="docs")
+    formats = load_jsonl_dict(config.FORMATS_PATH)
+    docs = load_jsonl_dict(config.DOCS_PATH, value_key="docs")
     to_inject = [i for i in selected if i["instance_id"] in targets]
     sc = load_source_cache([i["instance_id"] for i in to_inject])
     run_inject(
@@ -137,7 +138,7 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m
     # Phase 7: Assemble
     from .sample_assembler import run as run_assemble
 
-    hallucinations = load_jsonl_dict(HALLUCINATED_PATH)
+    hallucinations = load_jsonl_dict(config.HALLUCINATED_PATH)
     samples, metadata = run_assemble(selected, queries_dict, docs, formats, hallucinations, targets)
 
     # Phase 9: Validate
@@ -171,8 +172,23 @@ def main():
     parser.add_argument("--api-key", type=str, default=API_KEY, help="LLM API key")
     parser.add_argument("--base-url", type=str, default=API_BASE_URL, help="LLM API base URL")
     parser.add_argument("--model", type=str, default=MODEL, help="LLM model name")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        help="Optional output directory for all intermediate and final pipeline files",
+    )
+    parser.add_argument(
+        "--splits",
+        nargs="+",
+        choices=["train", "dev", "test"],
+        help="Optional SWE-bench splits to operate on",
+    )
     args = parser.parse_args()
 
+    if args.output_dir:
+        output_dir = config.set_output_dir(args.output_dir)
+        print(f"Using output directory: {output_dir}")
+
     if args.test:
         run_test(args.test, api_key=args.api_key, base_url=args.base_url, model=args.model)
         return
@@ -196,24 +212,29 @@ def main():
             from .source_fetcher import run
             from .swebench_loader import load_instances
 
-            run(load_instances())
+            run(filter_instances_by_splits(load_instances(), args.splits))
         elif phase == 3:
             from .query_rewriter import run
             from .swebench_loader import load_instances
 
-            run(load_instances(), api_key=args.api_key, base_url=args.base_url, model=args.model)
+            run(
+                filter_instances_by_splits(load_instances(), args.splits),
+                api_key=args.api_key,
+                base_url=args.base_url,
+                model=args.model,
+            )
         elif phase == 4:
             from .context7_docs import run
             from .swebench_loader import load_instances
 
-            run(load_instances())
+            run(filter_instances_by_splits(load_instances(), args.splits))
         elif phase == 5:
             from .format_builder import run
             from .swebench_loader import load_instances
 
-            queries = load_jsonl_dict(QUERIES_PATH, value_key="query")
+            queries = load_jsonl_dict(config.QUERIES_PATH, value_key="query")
             run(
-                load_instances(),
+                filter_instances_by_splits(load_instances(), args.splits),
                 api_key=args.api_key,
                 base_url=args.base_url,
                 model=args.model,
@@ -224,10 +245,10 @@ def main():
             from .splitter import select_hallucination_targets
             from .swebench_loader import load_instances
 
-            instances = load_instances()
-            formats = load_jsonl_dict(FORMATS_PATH)
-            queries = load_jsonl_dict(QUERIES_PATH, value_key="query")
-            docs = load_jsonl_dict(DOCS_PATH, value_key="docs")
+            instances = filter_instances_by_splits(load_instances(), args.splits)
+            formats = load_jsonl_dict(config.FORMATS_PATH)
+            queries = load_jsonl_dict(config.QUERIES_PATH, value_key="query")
+            docs = load_jsonl_dict(config.DOCS_PATH, value_key="docs")
             targets = select_hallucination_targets(instances)
             to_inject = [i for i in instances if i["instance_id"] in targets]
             sc = load_source_cache([i["instance_id"] for i in to_inject])
@@ -246,18 +267,18 @@ def main():
             from .splitter import select_hallucination_targets
             from .swebench_loader import load_instances
 
-            instances = load_instances()
-            queries = load_jsonl_dict(QUERIES_PATH, value_key="query")
-            docs = load_jsonl_dict(DOCS_PATH, value_key="docs")
-            formats = load_jsonl_dict(FORMATS_PATH)
-            hallucinations = load_jsonl_dict(HALLUCINATED_PATH)
+            instances = filter_instances_by_splits(load_instances(), args.splits)
+            queries = load_jsonl_dict(config.QUERIES_PATH, value_key="query")
+            docs = load_jsonl_dict(config.DOCS_PATH, value_key="docs")
+            formats = load_jsonl_dict(config.FORMATS_PATH)
+            hallucinations = load_jsonl_dict(config.HALLUCINATED_PATH)
             targets = select_hallucination_targets(instances)
             run(instances, queries, docs, formats, hallucinations, targets)
         elif phase == 8:
             from .splitter import run
             from .swebench_loader import load_instances
 
-            run(load_instances())
+            run(filter_instances_by_splits(load_instances(), args.splits))
         elif phase == 9:
             from .validator import run
 
diff --git a/scripts/code_hallucination/query_rewriter.py b/scripts/code_hallucination/query_rewriter.py
index 65f3a16..5b0e512 100644
--- a/scripts/code_hallucination/query_rewriter.py
+++ b/scripts/code_hallucination/query_rewriter.py
@@ -14,6 +14,7 @@
     MODEL,
     QUERIES_PATH,
     RETRY_DELAY,
+    token_limit_kwargs,
 )
 
 REWRITE_SYSTEM_PROMPT = textwrap.dedent("""\
@@ -52,7 +53,7 @@ def llm_call(
                     {"role": "user", "content": user},
                 ],
                 temperature=temperature,
-                max_tokens=max_tokens,
+                **token_limit_kwargs(model, max_tokens),
             )
             return response.choices[0].message.content.strip()
         except Exception as e:
diff --git a/scripts/code_hallucination/sample_assembler.py b/scripts/code_hallucination/sample_assembler.py
index 7bf6157..44147f9 100644
--- a/scripts/code_hallucination/sample_assembler.py
+++ b/scripts/code_hallucination/sample_assembler.py
@@ -92,6 +92,12 @@ def assemble_samples(
             if not answer.strip():
                 continue
 
+            # Reject code_with_explanation with unbalanced fences
+            if fmt_data.get("format_type") == "code_with_explanation":
+                fence_count = answer.count("```")
+                if fence_count % 2 != 0 or fence_count == 0:
+                    continue
+
             sample = {
                 "prompt": prompt,
                 "answer": answer,
diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py
index fa24c4e..e579f06 100644
--- a/scripts/code_hallucination/source_fetcher.py
+++ b/scripts/code_hallucination/source_fetcher.py
@@ -15,6 +15,72 @@
 GITHUB_RAW_BASE = "https://raw.githubusercontent.com"
 
 
+def truncate_around_patch(
+    full_content: str, patch: str, filepath: str, max_chars: int = MAX_FILE_CHARS
+) -> str:
+    """Truncate a source file keeping the region around the patch.
+
+    Instead of taking the first N chars (which may miss the patched region),
+    find where the patch applies and keep a window around it, plus the file header
+    (imports/class definitions).
+    """
+    if len(full_content) <= max_chars:
+        return full_content
+
+    # Find the hunk start lines from the patch for this file
+    hunk_lines = []
+    in_file = False
+    for line in patch.split("\n"):
+        if line.startswith("diff --git"):
+            match = re.match(r"diff --git a/(.+?) b/(.+)$", line)
+            in_file = match is not None and match.group(2) == filepath
+        elif in_file and line.startswith("@@"):
+            hunk_match = re.match(r"@@ -(\d+)", line)
+            if hunk_match:
+                hunk_lines.append(int(hunk_match.group(1)))
+
+    if not hunk_lines:
+        # Can't find patch location, fall back to first N chars
+        return full_content[:max_chars]
+
+    lines = full_content.split("\n")
+
+    # Header: up to the first def/class at index > 20 (first 200 lines), else the first 50 lines
+    header_end = min(50, len(lines))
+    for i, line in enumerate(lines[:200]):
+        if line.strip().startswith("def ") or line.strip().startswith("class "):
+            if i > 20:
+                header_end = i
+                break
+
+    header = "\n".join(lines[:header_end])
+    header_chars = len(header)
+    remaining_budget = max_chars - header_chars - 100  # 100 for separator
+
+    if remaining_budget <= 0:
+        return full_content[:max_chars]
+
+    # Build a window around the patch hunks
+    min_hunk = min(hunk_lines) - 1  # Convert to 0-based
+    max_hunk = max(hunk_lines) - 1
+
+    # Expand window to use the remaining budget
+    lines_budget = remaining_budget // 80  # Rough estimate: 80 chars per line
+    padding = max(lines_budget // 2, 30)
+
+    window_start = max(header_end, min_hunk - padding)
+    window_end = min(len(lines), max_hunk + padding)
+
+    window = "\n".join(lines[window_start:window_end])
+
+    if window_start > header_end:
+        result = header + "\n\n# ... (truncated) ...\n\n" + window
+    else:
+        result = header + "\n" + window
+
+    return result[:max_chars]
+
+
 def extract_changed_files(patch: str) -> list[str]:
     """Extract file paths from a unified diff using anchored regex.
 
@@ -62,7 +128,7 @@ def fetch_file_from_github(repo: str, commit: str, filepath: str) -> str | None:
     try:
         r = requests.get(url, timeout=15)
         if r.status_code == 200:
-            return r.text[:MAX_FILE_CHARS]
+            return r.text
         return None
     except Exception:
         return None
@@ -79,7 +145,7 @@ def fetch_file_at_commit(repo_dir: Path, commit: str, filepath: str) -> str | No
             timeout=30,
         )
         if result.returncode == 0:
-            return result.stdout[:MAX_FILE_CHARS]
+            return result.stdout
         return None
     except (subprocess.TimeoutExpired, Exception) as e:
         print(f"    Error fetching {filepath}@{commit[:8]}: {e}")
@@ -119,8 +185,7 @@ def apply_patch_and_get_file(repo_dir: Path, commit: str, patch: str, filepath:
             # Read the patched file
             patched_path = Path(tmpdir) / filepath
             if patched_path.exists():
-                content = patched_path.read_text()[:MAX_FILE_CHARS]
-                return content
+                return patched_path.read_text()
 
             # Clean up worktree
             subprocess.run(
@@ -458,7 +523,7 @@ def fetch_source_for_instance(
     # Edit-style format
     edit_style = build_edit_style_answer(patch, changed_files)
 
-    # Complete function format — extract modified functions
+    # Complete function format — extract modified functions (needs full content)
     modified_functions = []
     for filepath in changed_files:
         if filepath not in source_files:
@@ -470,6 +535,11 @@ def fetch_source_for_instance(
                 func["file"] = filepath
             modified_functions.extend(funcs)
 
+    # Smart truncation AFTER patch application: keep header + patch-relevant regions
+    # instead of blind first-N-chars truncation
+    for filepath in list(source_files.keys()):
+        source_files[filepath] = truncate_around_patch(source_files[filepath], patch, filepath)
+
     return {
         "instance_id": instance["instance_id"],
         "changed_files": changed_files,
diff --git a/scripts/code_hallucination/validator.py b/scripts/code_hallucination/validator.py
index 50d5251..40d65e8 100644
--- a/scripts/code_hallucination/validator.py
+++ b/scripts/code_hallucination/validator.py
@@ -1,10 +1,31 @@
 """Phase 9: Quality checks and validation report."""
 
 import ast
+import difflib
 import json
 from collections import Counter
 
-from .config import DATASET_PATH, METADATA_PATH, VALIDATION_REPORT_PATH
+from .config import DATASET_PATH, FORMATS_PATH, METADATA_PATH, VALIDATION_REPORT_PATH
+
+LEAKY_TERMS = (
+    "bug",
+    "wrong",
+    "incorrect",
+    "incorrectly",
+    "deprecated",
+    "hallucination",
+    "helper method",
+    "should be replaced",
+)
+
+
+def _max_allowed_coverage(answer_len: int) -> float:
+    """Use a looser coverage cap for short answers and fragments."""
+    if answer_len <= 400:
+        return 0.40
+    if answer_len <= 800:
+        return 0.35
+    return 0.30
 
 
 def validate_spans(samples: list[dict]) -> list[str]:
@@ -12,6 +33,8 @@ def validate_spans(samples: list[dict]) -> list[str]:
     issues = []
     for i, sample in enumerate(samples):
         answer_len = len(sample["answer"])
+        previous_end = -1
+        seen = set()
         for label in sample.get("labels", []):
             start = label.get("start", 0)
             end = label.get("end", 0)
@@ -21,9 +44,140 @@ def validate_spans(samples: list[dict]) -> list[str]:
                 issues.append(f"Sample {i}: empty/inverted span ({start}, {end})")
             if end > answer_len:
                 issues.append(f"Sample {i}: span exceeds answer length ({end} > {answer_len})")
+            if start < previous_end:
+                issues.append(f"Sample {i}: unsorted/overlapping spans ({start} < {previous_end})")
+            if (start, end, label.get("label")) in seen:
+                issues.append(f"Sample {i}: duplicate span ({start}, {end})")
+            seen.add((start, end, label.get("label")))
+            previous_end = end
     return issues
 
 
+def _extract_code_regions(answer: str) -> list[tuple[int, int]]:
+    """Return markdown fenced code block ranges, or the whole answer if none."""
+    regions = []
+    idx = 0
+    while True:
+        start = answer.find("```", idx)
+        if start == -1:
+            break
+        code_start = answer.find("\n", start + 3)
+        if code_start == -1:
+            break
+        code_start += 1
+        end = answer.find("```", code_start)
+        if end == -1:
+            break
+        regions.append((code_start, end))
+        idx = end + 3
+    if not regions:
+        return [(0, len(answer))]
+    return regions
+
+
+def _span_is_in_code(answer: str, start: int, end: int) -> bool:
+    """Check whether a span lies fully inside one code region (the whole answer if unfenced)."""
+    return any(
+        start >= code_start and end <= code_end
+        for code_start, code_end in _extract_code_regions(answer)
+    )
+
+
+def _is_whitespace_only_diff(original_text: str, hallucinated_text: str) -> bool:
+    """Treat pure whitespace edits as ignorable when checking diff coverage."""
+    return (original_text or "").strip() == "" and (hallucinated_text or "").strip() == ""
+
+
+def _diff_outside_labels(
+    original_answer: str, hallucinated_answer: str, labels: list[dict]
+) -> list[dict]:
+    """Return non-whitespace diffs with hallucinated text that overlap no labeled span."""
+    label_ranges = [(lab["start"], lab["end"]) for lab in labels]
+
+    def is_covered(start: int, end: int) -> bool:
+        return any(
+            not (end <= lab_start or start >= lab_end) for lab_start, lab_end in label_ranges
+        )
+
+    uncovered = []
+    matcher = difflib.SequenceMatcher(a=original_answer, b=hallucinated_answer)
+    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
+        if tag == "equal":
+            continue
+
+        original_chunk = original_answer[i1:i2]
+        hallucinated_chunk = hallucinated_answer[j1:j2]
+        if _is_whitespace_only_diff(original_chunk, hallucinated_chunk):
+            continue
+
+        if j1 == j2:
+            continue
+
+        if not is_covered(j1, j2):
+            uncovered.append(
+                {
+                    "tag": tag,
+                    "start": j1,
+                    "end": j2,
+                    "original": original_chunk[:80],
+                    "hallucinated": hallucinated_chunk[:80],
+                }
+            )
+
+    return uncovered
+
+
+def check_label_quality(samples: list[dict], metadata: list[dict]) -> dict:
+    """Report common synthetic-label issues that should be filtered before training."""
+    issues = Counter()
+    original_answers = {}
+    if FORMATS_PATH.exists():
+        with open(FORMATS_PATH) as f:
+            for line in f:
+                try:
+                    entry = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                original_answers[entry.get("instance_id")] = entry.get("answer", "")
+
+    for sample, meta in zip(samples, metadata):
+        if not sample.get("labels"):
+            continue
+
+        answer = sample["answer"]
+        coverage = sum(label["end"] - label["start"] for label in sample["labels"]) / max(
+            len(answer), 1
+        )
+        if coverage > _max_allowed_coverage(len(answer)):
+            issues["coverage_over_30pct"] += 1
+
+        for label in sample["labels"]:
+            span_text = answer[label["start"] : label["end"]]
+            if any(term in span_text.lower() for term in LEAKY_TERMS):
+                issues["labels_with_leakage_terms"] += 1
+                break
+
+        if meta.get("format_type") == "code_with_explanation":
+            if any(
+                not _span_is_in_code(answer, label["start"], label["end"])
+                for label in sample["labels"]
+            ):
+                issues["code_with_explanation_label_outside_code"] += 1
+
+        original_answer = original_answers.get(meta.get("instance_id"))
+        if original_answer:
+            uncovered_diffs = _diff_outside_labels(original_answer, answer, sample["labels"])
+            if uncovered_diffs:
+                issues["diff_outside_labels"] += 1
+                if any(
+                    diff["tag"] == "insert" or len(diff["hallucinated"]) >= 20
+                    for diff in uncovered_diffs
+                ):
+                    issues["large_diff_outside_labels"] += 1
+
+    return dict(issues)
+
+
 def check_span_coverage(samples: list[dict]) -> dict:
     """Report span coverage distribution for hallucinated samples."""
     coverages = []
@@ -168,14 +322,21 @@ def report(text):
     report(f"Near duplicates (sampled): {n_dup}")
     report("")
 
-    # 5. AST parseability
+    # 5. Label quality
+    report("=== Label Quality ===")
+    label_quality = check_label_quality(samples, metadata)
+    for k, v in label_quality.items():
+        report(f"  {k}: {v}")
+    report("")
+
+    # 6. AST parseability
     report("=== AST Parseability ===")
     ast_check = check_ast_parseability(samples, metadata)
     for k, v in ast_check.items():
         report(f"  {k}: {v}")
     report("")
 
-    # 6. Length stats
+    # 7. Length stats
     report("=== Length Statistics ===")
     prompt_lens = [len(s["prompt"]) for s in samples]
     answer_lens = [len(s["answer"]) for s in samples]

From 1ad0f547aa832ed91cf0a0ffa308bf594a11182e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kov=C3=A1cs=20=C3=81d=C3=A1m?= <adaam.ko@gmail.com>
Date: Mon, 11 May 2026 15:43:03 +0200
Subject: [PATCH 9/9] Changes in injector

---
 .../hallucination_injector.py                 | 90 +++++++++++--------
 1 file changed, 55 insertions(+), 35 deletions(-)

diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py
index a1d470f..77c82c9 100644
--- a/scripts/code_hallucination/hallucination_injector.py
+++ b/scripts/code_hallucination/hallucination_injector.py
@@ -84,6 +84,8 @@
     - Do NOT add imports, helper functions, or surrounding code
     - Prefer changing existing lines over insertions or deletions
     - Each edit must replace an existing substring of the original answer; no insert-only edits
+    - Choose exact substrings that appear exactly once in the original answer whenever possible
+    - Prefer whole expressions or full lines over tiny fragments
 
     Respond in this exact JSON format (no markdown, no code blocks):
     {
@@ -91,23 +93,50 @@
             {
                 "original": "exact original substring from the correct answer",
                 "hallucinated": "replacement text for that substring",
-                "left_context": "up to 40 exact characters immediately before the original substring in the correct answer",
-                "right_context": "up to 40 exact characters immediately after the original substring in the correct answer",
                 "target_zone": "code",
                 "explanation": "why this replacement is wrong according to the source code or user request"
             }
         ]
     }
 
+    Example 1:
+    Original answer contains:
+        return self.steps[-1][-1].transform(X)
+    Good JSON change:
+    {
+      "changes": [
+        {
+          "original": "return self.steps[-1][-1].transform(X)",
+          "hallucinated": "return self.steps[-1][-1].predict(X)",
+          "target_zone": "code",
+          "explanation": "The source context shows this method should transform the data, not run prediction."
+        }
+      ]
+    }
+
+    Example 2:
+    Original answer contains:
+        if handle_unknown == 'error':
+    Good JSON change:
+    {
+      "changes": [
+        {
+          "original": "if handle_unknown == 'error':",
+          "hallucinated": "if handle_unknown != 'error':",
+          "target_zone": "code",
+          "explanation": "This flips the branch condition and contradicts the intended error handling in the source."
+        }
+      ]
+    }
+
     IMPORTANT:
     - You MUST include 1-3 changes in the "changes" array
     - The returned changes must be sufficient to construct the full hallucinated answer
     - "original" must be a non-empty exact substring of the correct answer
     - Before returning, verify that each "original" substring appears verbatim in the provided correct answer
     - Prefer substrings that appear exactly once in the correct answer
-    - If a substring appears multiple times, use left_context and right_context that disambiguate a single occurrence
+    - If a substring appears multiple times, pick a different, longer substring that uniquely identifies the target location
     - "hallucinated" is the exact replacement text for that substring
-    - "left_context" and "right_context" must come from the original correct answer, not a rewritten one
     - "target_zone" must always be "code"
     - Each "explanation" must reference what the source code or user request actually says
     - If you cannot find 1-3 exact editable substrings in the provided answer, return {"changes": []}
@@ -238,13 +267,6 @@ def _find_all_occurrences(text: str, pattern: str) -> list[dict]:
     return offsets
 
 
-def _truncate_context(text: str, max_chars: int = 40) -> str:
-    """Normalize context fields to the same length budget used in the prompt."""
-    if len(text) <= max_chars:
-        return text
-    return text[-max_chars:]
-
-
 def _extract_code_regions(answer: str) -> list[tuple[int, int]]:
     """Return ranges that correspond to markdown fenced code blocks.
 
@@ -294,7 +316,7 @@ def _max_allowed_coverage(answer_len: int) -> float:
 
 
 def _locate_original_change(original_answer: str, change: dict) -> dict | None:
-    """Locate a replacement span in the original answer using substring plus context."""
+    """Locate a replacement span in the original answer by exact unique match."""
     original_span = change.get("original", "")
     hallucinated_span = change.get("hallucinated", "")
     if not original_span or not hallucinated_span:
@@ -303,36 +325,31 @@ def _locate_original_change(original_answer: str, change: dict) -> dict | None:
         return None
 
     offsets = _find_all_occurrences(original_answer, original_span)
-    if not offsets:
-        return None
-
-    left_context = _truncate_context(change.get("left_context", ""))
-    right_context = _truncate_context(change.get("right_context", ""))
-    filtered = []
-    for offset in offsets:
-        start = offset["start"]
-        end = offset["end"]
-        observed_left = _truncate_context(
-            original_answer[max(0, start - len(left_context)) : start]
-        )
-        observed_right = original_answer[end : end + len(right_context)]
-        left_ok = not left_context or observed_left == left_context
-        right_ok = not right_context or observed_right == right_context
-        if left_ok and right_ok:
-            filtered.append(offset)
-
-    matches = filtered or offsets
-    if len(matches) != 1:
+    if len(offsets) != 1:
         return None
 
     return {
-        "start": matches[0]["start"],
-        "end": matches[0]["end"],
+        "start": offsets[0]["start"],
+        "end": offsets[0]["end"],
         "original": original_span,
         "hallucinated": hallucinated_span,
     }
 
 
+def _sort_changes_by_original_position(
+    original_answer: str, changes: list[dict]
+) -> list[dict] | None:
+    """Order changes by matched span in the original answer; None if any cannot be located."""
+    spans = []
+    for candidate in changes:
+        match = _locate_original_change(original_answer, candidate)
+        if match is None:
+            return None  # one unlocatable change invalidates the whole set
+        spans.append((match["start"], match["end"], candidate))
+    # Stable sort on (start, end) only, so the change dicts are never compared.
+    return [entry[2] for entry in sorted(spans, key=lambda entry: entry[:2])]
+
+
 def apply_changes_to_answer(
     original_answer: str, changes: list[dict], hall_type: str
 ) -> tuple[str, list[dict]] | tuple[None, None]:
@@ -514,6 +531,9 @@ def _process_result(result, instance_id, hall_type, fmt_data, model):
     hallucinated_code, labels = apply_changes_to_answer(original_answer, changes, hall_type)
     if hallucinated_code is None or labels is None:
         return None
+    ordered_changes = _sort_changes_by_original_position(original_answer, changes)
+    if ordered_changes is None:
+        return None
     format_type = fmt_data.get("format_type", "fragment")
 
     valid, reason = _validate_labels(original_answer, hallucinated_code, labels, format_type)
@@ -527,7 +547,7 @@ def _process_result(result, instance_id, hall_type, fmt_data, model):
         "hallucination_type": hall_type,
         "injector_model": model,
         "format_type": format_type,
-        "changes": changes,
+        "changes": ordered_changes,
     }