From 5d93ea74f02470180f645dc611edb57246ba6543 Mon Sep 17 00:00:00 2001 From: Bing1100 Date: Tue, 31 Mar 2026 23:01:30 -0400 Subject: [PATCH 1/3] feat(agents:AI agents for title/abstract screening and workload reduction --- backend/api/core/cit_utils.py | 11 + backend/api/screen/agentic_utils.py | 95 +++ backend/api/screen/prompts.py | 145 ++++ backend/api/screen/router.py | 666 +++++++++++++++++- backend/api/services/cit_db_service.py | 330 ++++++++- backend/docker-compose.yml | 5 +- backend/main.py | 10 + .../app/[lang]/can-sr/l1-screen/view/page.tsx | 160 +++++ .../app/[lang]/can-sr/l2-screen/view/page.tsx | 167 +++++ .../can-sr/screen/agent-runs/latest/route.ts | 67 ++ .../app/api/can-sr/screen/validate/route.ts | 44 ++ frontend/components/can-sr/PagedList.tsx | 165 ++++- frontend/package-lock.json | 129 ++-- 13 files changed, 1908 insertions(+), 86 deletions(-) create mode 100644 backend/api/screen/agentic_utils.py create mode 100644 frontend/app/api/can-sr/screen/agent-runs/latest/route.ts create mode 100644 frontend/app/api/can-sr/screen/validate/route.ts diff --git a/backend/api/core/cit_utils.py b/backend/api/core/cit_utils.py index 1e33ce13..5e0461ae 100644 --- a/backend/api/core/cit_utils.py +++ b/backend/api/core/cit_utils.py @@ -14,6 +14,7 @@ from fastapi.concurrency import run_in_threadpool from .config import settings +from ..services.cit_db_service import cits_dp_service def _is_postgres_configured() -> bool: @@ -94,5 +95,15 @@ async def load_sr_and_check( if not screening: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No screening database configured for this systematic review") + # Best-effort runtime schema evolution for agentic screening. + # CAN-SR uses per-upload screening tables, so we may need to add the + # validation columns to the specific table referenced by the SR. 
+    try:
+        table_name = (screening or {}).get("table_name") or "citations"
+        await run_in_threadpool(cits_dp_service.ensure_step_validation_columns, table_name)
+    except Exception:
+        # Don't block requests if the DB isn't ready/configured.
+        pass
+
     return sr, screening
diff --git a/backend/api/screen/agentic_utils.py b/backend/api/screen/agentic_utils.py
new file mode 100644
index 00000000..1250287b
--- /dev/null
+++ b/backend/api/screen/agentic_utils.py
@@ -0,0 +1,95 @@
+"""backend.api.screen.agentic_utils
+
+Utilities for the GREP-Agent style "screening + critical" workflow.
+
+We keep this module small and dependency-free so routers can reuse the helpers
+for title/abstract and fulltext pipelines.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class ParsedAgentXML:
+    answer: str
+    confidence: float
+    rationale: str
+    parse_ok: bool
+
+
+_TAG_RE_CACHE: dict[str, re.Pattern[str]] = {}
+
+
+def _tag_re(tag: str) -> re.Pattern[str]:
+    if tag not in _TAG_RE_CACHE:
+        _TAG_RE_CACHE[tag] = re.compile(rf"<{tag}>(.*?)</{tag}>", re.IGNORECASE | re.DOTALL)
+    return _TAG_RE_CACHE[tag]
+
+
+def parse_agent_xml(text: str) -> ParsedAgentXML:
+    """Parse <answer>, <confidence>, <rationale> tags from model output."""
+
+    raw = (text or "").strip()
+    ans_m = _tag_re("answer").search(raw)
+    conf_m = _tag_re("confidence").search(raw)
+    rat_m = _tag_re("rationale").search(raw)
+
+    answer = (ans_m.group(1).strip() if ans_m else "")
+    rationale = (rat_m.group(1).strip() if rat_m else "")
+
+    conf_val = 0.0
+    if conf_m:
+        try:
+            conf_val = float(conf_m.group(1).strip())
+        except Exception:
+            conf_val = 0.0
+    conf_val = max(0.0, min(1.0, conf_val))
+
+    parse_ok = bool(ans_m and conf_m)
+    return ParsedAgentXML(answer=answer, confidence=conf_val, rationale=rationale, parse_ok=parse_ok)
+
+
+def resolve_option(raw_answer: str, options: list[str]) -> str:
+    """Resolve a model answer to one of the provided options (best-effort)."""
+    ans = 
(raw_answer or "").strip()
+    if not ans:
+        return ans
+
+    # Exact match first
+    for opt in options or []:
+        if ans == opt:
+            return opt
+
+    # Case-insensitive exact
+    ans_l = ans.lower()
+    for opt in options or []:
+        if ans_l == (opt or "").lower():
+            return opt
+
+    # Substring containment (mirrors existing CAN-SR JSON screening logic)
+    for opt in options or []:
+        if (opt or "").lower() in ans_l:
+            return opt
+
+    return ans
+
+
+def build_critical_options(*, all_options: list[str], screening_answer: str) -> list[str]:
+    """Forced alternatives: (all_options - {screening_answer}) + ["None of the above"]."""
+    base = [o for o in (all_options or []) if (o or "").strip()]
+    sa = (screening_answer or "").strip()
+    if sa:
+        base = [o for o in base if o.strip() != sa]
+    base.append("None of the above")
+    # stable unique
+    seen = set()
+    out = []
+    for o in base:
+        if o not in seen:
+            seen.add(o)
+            out.append(o)
+    return out
diff --git a/backend/api/screen/prompts.py b/backend/api/screen/prompts.py
index ba7ac9d5..97861767 100644
--- a/backend/api/screen/prompts.py
+++ b/backend/api/screen/prompts.py
@@ -72,4 +72,149 @@
 - Use sentence indices from the numbered full text for "evidence_sentences"
 - Use table numbers from the Tables section for "evidence_tables"
 - Use figure numbers from the Figures section for "evidence_figures"
+"""
+
+
+# ---------------------------------------------------------------------------
+# Agentic screening (GREP-Agent style) prompt contracts
+# ---------------------------------------------------------------------------
+
+# NOTE:
+# CAN-SR historically used JSON output for screening. The agentic plan expects
+# XML-tag parsing (<answer>, <confidence>, <rationale>) so we can reuse a stable
+# parsing contract across screening + critical steps.
+
+PROMPT_XML_TEMPLATE_TA = """
+You are a highly critical, helpful scientific evaluator completing an academic review.
+
+Task:
+Answer the question "{question}" for the following citation. 
+
+Citation:
+{cit}
+
+Choose EXACTLY ONE of these options (exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
+"""
+
+
+PROMPT_XML_TEMPLATE_TA_CRITICAL = """
+You are a critical reviewer double-checking another model's screening answer.
+
+Original question:
+"{question}"
+
+Citation:
+{cit}
+
+The first model answered:
+"{screening_answer}"
+
+Now, you MUST choose from the following forced alternatives.
+Rules:
+- You are NOT allowed to choose the original answer.
+- If you agree with the original answer, choose "None of the above".
+
+Forced alternatives (choose exactly one; exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
+"""
+
+
+PROMPT_XML_TEMPLATE_FULLTEXT = """
+You are assisting with a scientific full-text screening task.
+
+Task:
+Evaluate the question "{question}" against the paper content provided as numbered sentences (e.g., "[0] ...", "[1] ...").
+
+Choose EXACTLY ONE of these options (exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Full text (numbered sentences):
+{fulltext}
+
+Tables (numbered):
+{tables}
+
+Figures (numbered; captions correspond to images provided alongside this message):
+{figures}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
+"""
+
+
+PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL = """
+You are a critical reviewer double-checking another model's full-text screening answer. 
+
+Original question:
+"{question}"
+
+The first model answered:
+"{screening_answer}"
+
+Now, you MUST choose from the following forced alternatives.
+Rules:
+- You are NOT allowed to choose the original answer.
+- If you agree with the original answer, choose "None of the above".
+
+Forced alternatives (choose exactly one; exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Full text (numbered sentences):
+{fulltext}
+
+Tables (numbered):
+{tables}
+
+Figures (numbered; captions correspond to images provided alongside this message):
+{figures}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
 """
\ No newline at end of file
diff --git a/backend/api/screen/router.py b/backend/api/screen/router.py
index 10174b82..400ea9b0 100644
--- a/backend/api/screen/router.py
+++ b/backend/api/screen/router.py
@@ -19,13 +19,28 @@
 # Import consolidated Postgres helpers if available (optional)
 from ..services.cit_db_service import cits_dp_service, snake_case_column, snake_case
-from .prompts import PROMPT_JSON_TEMPLATE, PROMPT_JSON_TEMPLATE_FULLTEXT
+from .prompts import (
+    PROMPT_JSON_TEMPLATE,
+    PROMPT_JSON_TEMPLATE_FULLTEXT,
+    PROMPT_XML_TEMPLATE_TA,
+    PROMPT_XML_TEMPLATE_TA_CRITICAL,
+    PROMPT_XML_TEMPLATE_FULLTEXT,
+    PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL,
+)
+from .agentic_utils import build_critical_options, parse_agent_xml, resolve_option
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter()
 
 
+class AgentRunsQueryResponse(BaseModel):
+    sr_id: str
+    pipeline: str
+    citation_ids: List[int]
+    runs: List[Dict[str, Any]]
+
+
 def _normalize_int_list(v: Any) -> List[int]:
     if v is None:
         return []
@@ -85,6 +100,31 @@ class HumanClassifyRequest(BaseModel):
     explanation: Optional[str] = Field("", description="Optional free-text explanation from the human reviewer")
     confidence: Optional[float] = 
Field(None, ge=0.0, le=1.0, description="Optional confidence (0.0 - 1.0)") reviewer: Optional[str] = Field(None, description="Optional reviewer id or name") + + +class TitleAbstractRunRequest(BaseModel): + sr_id: str = Field(..., description="Systematic review id") + citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") + model: Optional[str] = Field(None, description="Model key/deployment to use") + temperature: float = Field(0.0, ge=0.0, le=1.0) + max_tokens: int = Field(1200, ge=64, le=4000) + prompt_version: Optional[str] = Field("v1", description="Prompt version tag for auditing") + + +class ValidateStepRequest(BaseModel): + sr_id: str = Field(..., description="Systematic review id") + citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") + step: str = Field("l1", description="Validation step: l1|l2|parameters") + + +class FulltextRunRequest(BaseModel): + sr_id: str = Field(..., description="Systematic review id") + citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") + model: Optional[str] = Field(None, description="Model key/deployment to use") + temperature: float = Field(0.0, ge=0.0, le=1.0) + max_tokens: int = Field(2000, ge=64, le=4000) + prompt_version: Optional[str] = Field("v1", description="Prompt version tag for auditing") + # _update_sync moved to backend.api.core.postgres.update_jsonb_column # Use run_in_threadpool(update_jsonb_column, ...) where needed. @@ -397,6 +437,630 @@ async def human_classify_citation( return {"status": "success", "sr_id": sr_id, "citation_id": citation_id, "column": col_name, "classification": classification_json} + +@router.post("/title-abstract/run") +async def run_title_abstract_agentic( + payload: TitleAbstractRunRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Run orchestrated Title/Abstract screening + critical for one citation. 
+ + Implements Phase 1 MVP endpoint from planning/agentic_implementation_plan. + """ + + sr_id = str(payload.sr_id) + citation_id = int(payload.citation_id) + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to load systematic review or screening: {e}", + ) + + table_name = (screening or {}).get("table_name") or "citations" + + # Ensure LLM client is available + if not azure_openai_client.is_configured(): + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Azure OpenAI client is not configured on the server", + ) + + # Load citation row + try: + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to query screening DB: {e}") + + if not row: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found") + + # Build combined citation text (use SR include columns or fallback to title+abstract) + include_cols = [] + try: + include_cols = cits_dp_service.load_include_columns_from_criteria(sr) or [] + except Exception: + include_cols = [] + if not include_cols: + include_cols = ["title", "abstract"] + + citation_text = citations_router._build_combined_citation_from_row(row, include_cols) + + # Load L1 criteria + cp = sr.get("criteria_parsed") or sr.get("criteria") or {} + l1 = cp.get("l1") if isinstance(cp, dict) else None + questions = (l1 or {}).get("questions") if isinstance(l1, dict) else [] + possible = (l1 or {}).get("possible_answers") if isinstance(l1, dict) else [] + addinfos = (l1 or {}).get("additional_infos") if isinstance(l1, dict) else [] + 
questions = questions if isinstance(questions, list) else [] + possible = possible if isinstance(possible, list) else [] + addinfos = addinfos if isinstance(addinfos, list) else [] + + if not questions: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="SR has no L1 criteria questions configured") + + async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: + """Return (content, usage, latency_ms).""" + import time + + t0 = time.time() + messages = [{"role": "user", "content": prompt}] + resp = await azure_openai_client.chat_completion( + messages=messages, + model=payload.model, + max_tokens=payload.max_tokens, + temperature=payload.temperature, + stream=False, + ) + latency_ms = int((time.time() - t0) * 1000) + content = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content") or "" + usage = resp.get("usage") or {} + return str(content), dict(usage), latency_ms + + results: List[Dict[str, Any]] = [] + user_email = str(current_user.get("email") or current_user.get("id") or "") + + for i, q in enumerate(questions): + if not isinstance(q, str) or not q.strip(): + continue + + opts = possible[i] if i < len(possible) and isinstance(possible[i], list) else [] + opts = [str(o) for o in opts if o is not None and str(o).strip()] + xtra = addinfos[i] if i < len(addinfos) and isinstance(addinfos[i], str) else "" + + if not opts: + # still return shape to UI + results.append( + { + "question": q, + "criterion_key": snake_case(q, max_len=56), + "error": "No options configured", + } + ) + continue + + options_listed = "\n".join(opts) + criterion_key = snake_case(q, max_len=56) + + # 1) screening + screening_prompt = PROMPT_XML_TEMPLATE_TA.format( + question=q, + cit=citation_text, + options=options_listed, + xtra=xtra or "", + ) + screening_raw, screening_usage, screening_latency = await _call_llm(screening_prompt) + screening_parsed = parse_agent_xml(screening_raw) + screening_answer = resolve_option(screening_parsed.answer, 
opts) + + try: + screening_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": screening_raw, + "model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": screening_latency, + "input_tokens": screening_usage.get("prompt_tokens"), + "output_tokens": screening_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") + + # 2) critical + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = "\n".join(critical_opts) + critical_prompt = PROMPT_XML_TEMPLATE_TA_CRITICAL.format( + question=q, + cit=citation_text, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + ) + critical_raw, critical_usage, critical_latency = await _call_llm(critical_prompt) + critical_parsed = parse_agent_xml(critical_raw) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + + disagrees = str(critical_answer).strip() != "None of the above" + + try: + critical_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": critical_raw, + 
"model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": critical_latency, + "input_tokens": critical_usage.get("prompt_tokens"), + "output_tokens": critical_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist critical run: {e}") + + results.append( + { + "question": q, + "criterion_key": criterion_key, + "screening": { + "run_id": screening_run_id, + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "parse_ok": screening_parsed.parse_ok, + }, + "critical": { + "run_id": critical_run_id, + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "parse_ok": critical_parsed.parse_ok, + "disagrees": disagrees, + }, + } + ) + + return { + "status": "success", + "sr_id": sr_id, + "citation_id": citation_id, + "pipeline": "title_abstract", + "criteria": results, + } + + +@router.post("/validate") +async def validate_screening_step( + payload: ValidateStepRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Mark a citation as validated for a given step. + + Phase 1 MVP uses step=l1 (Title/Abstract). This endpoint is written to be + forward-compatible with l2/parameters. 
+ """ + + sr_id = str(payload.sr_id) + citation_id = int(payload.citation_id) + step = (payload.step or "l1").lower().strip() + + if step not in {"l1", "l2", "parameters"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be one of: l1, l2, parameters") + + try: + _sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + validated_by_col = f"{step}_validated_by" + validated_at_col = f"{step}_validated_at" + validated_by = str(current_user.get("email") or current_user.get("id") or "") + now_iso = datetime.utcnow().isoformat() + "Z" + + try: + # Ensure columns exist (best-effort; no-migrations philosophy) + await run_in_threadpool(cits_dp_service.create_column, validated_by_col, "TEXT", table_name) + await run_in_threadpool(cits_dp_service.create_column, validated_at_col, "TIMESTAMPTZ", table_name) + + u1 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_by_col, validated_by, table_name) + u2 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_at_col, now_iso, table_name) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to update validation fields: {e}") + + if not (u1 and u2): + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found to update") + + return { + "status": "success", + "sr_id": sr_id, + "citation_id": citation_id, + "step": step, + "validated_by": validated_by, + "validated_at": now_iso, + } + + +@router.post("/fulltext/run") +async def run_fulltext_agentic( + payload: FulltextRunRequest, + 
current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Run orchestrated Fulltext screening + critical for one citation (L2).""" + + sr_id = str(payload.sr_id) + citation_id = int(payload.citation_id) + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + if not azure_openai_client.is_configured(): + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Azure OpenAI client is not configured on the server", + ) + + # Load citation row + try: + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to query screening DB: {e}") + + if not row: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found") + + # Ensure fulltext exists (CAN-SR source of truth: extracted DI/Grobid artifacts) + if not row.get("fulltext"): + # We don't have a direct SR id in the extract endpoint signature; it expects sr_id. + # We'll try best-effort to trigger extraction if fulltext_url exists. 
+ try: + from ..extract.router import extract_fulltext_from_storage + + await extract_fulltext_from_storage(sr_id, citation_id, current_user=current_user) # type: ignore + except Exception: + pass + + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + + include_cols = [] + try: + include_cols = cits_dp_service.load_include_columns_from_criteria(sr) or [] + except Exception: + include_cols = [] + if not include_cols: + include_cols = ["title", "abstract"] + + citation_text = citations_router._build_combined_citation_from_row(row or {}, include_cols) + fulltext = (row or {}).get("fulltext") or citation_text + + # Tables/Figures context from row + tables_md_lines: List[str] = [] + figures_lines: List[str] = [] + images: List[Tuple[bytes, str]] = [] + + ft_tables = (row or {}).get("fulltext_tables") + if isinstance(ft_tables, str): + try: + ft_tables = json.loads(ft_tables) + except Exception: + ft_tables = None + if isinstance(ft_tables, list): + for item in ft_tables: + if not isinstance(item, dict): + continue + idx = item.get("index") + blob_addr = item.get("blob_address") + caption = item.get("caption") + if not idx or not blob_addr: + continue + try: + md_bytes, _ = await storage_service.get_bytes_by_path(blob_addr) + md_txt = md_bytes.decode("utf-8", errors="replace") + header = f"Table [T{idx}]" + (f" caption: {caption}" if caption else "") + tables_md_lines.extend([header, md_txt, ""]) + except Exception: + continue + + ft_figs = (row or {}).get("fulltext_figures") + if isinstance(ft_figs, str): + try: + ft_figs = json.loads(ft_figs) + except Exception: + ft_figs = None + if isinstance(ft_figs, list): + for item in ft_figs: + if not isinstance(item, dict): + continue + idx = item.get("index") + blob_addr = item.get("blob_address") + caption = item.get("caption") + if not idx or not blob_addr: + continue + figures_lines.append(f"Figure [F{idx}] caption: {caption or '(no caption)'} (see attached image F{idx})") + try: + 
img_bytes, _ = await storage_service.get_bytes_by_path(blob_addr) + if img_bytes: + images.append((img_bytes, "image/png")) + except Exception: + continue + + # Load L2 criteria + cp = sr.get("criteria_parsed") or sr.get("criteria") or {} + l2 = cp.get("l2") if isinstance(cp, dict) else None + questions = (l2 or {}).get("questions") if isinstance(l2, dict) else [] + possible = (l2 or {}).get("possible_answers") if isinstance(l2, dict) else [] + addinfos = (l2 or {}).get("additional_infos") if isinstance(l2, dict) else [] + questions = questions if isinstance(questions, list) else [] + possible = possible if isinstance(possible, list) else [] + addinfos = addinfos if isinstance(addinfos, list) else [] + + if not questions: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="SR has no L2 criteria questions configured") + + async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: + import time + + t0 = time.time() + # Use multimodal API when we have figure images + if images: + content = await azure_openai_client.multimodal_chat( + user_text=prompt, + images=images, + system_prompt=None, + model=payload.model, + max_tokens=payload.max_tokens, + temperature=payload.temperature, + ) + latency_ms = int((time.time() - t0) * 1000) + # multimodal_chat does not expose usage + return str(content), {}, latency_ms + + messages = [{"role": "user", "content": prompt}] + resp = await azure_openai_client.chat_completion( + messages=messages, + model=payload.model, + max_tokens=payload.max_tokens, + temperature=payload.temperature, + stream=False, + ) + latency_ms = int((time.time() - t0) * 1000) + content = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content") or "" + usage = resp.get("usage") or {} + return str(content), dict(usage), latency_ms + + results: List[Dict[str, Any]] = [] + + for i, q in enumerate(questions): + if not isinstance(q, str) or not q.strip(): + continue + + opts = possible[i] if i < len(possible) and 
isinstance(possible[i], list) else [] + opts = [str(o) for o in opts if o is not None and str(o).strip()] + xtra = addinfos[i] if i < len(addinfos) and isinstance(addinfos[i], str) else "" + + if not opts: + results.append({"question": q, "criterion_key": snake_case(q, max_len=56), "error": "No options configured"}) + continue + + criterion_key = snake_case(q, max_len=56) + options_listed = "\n".join(opts) + + # 1) screening + screening_prompt = PROMPT_XML_TEMPLATE_FULLTEXT.format( + question=q, + options=options_listed, + xtra=xtra or "", + fulltext=fulltext, + tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", + figures="\n".join(figures_lines) if figures_lines else "(none)", + ) + screening_raw, screening_usage, screening_latency = await _call_llm(screening_prompt) + screening_parsed = parse_agent_xml(screening_raw) + screening_answer = resolve_option(screening_parsed.answer, opts) + + try: + screening_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": screening_raw, + "model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": screening_latency, + "input_tokens": screening_usage.get("prompt_tokens"), + "output_tokens": screening_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") + + # 2) critical + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = 
"\n".join(critical_opts) + critical_prompt = PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL.format( + question=q, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + fulltext=fulltext, + tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", + figures="\n".join(figures_lines) if figures_lines else "(none)", + ) + critical_raw, critical_usage, critical_latency = await _call_llm(critical_prompt) + critical_parsed = parse_agent_xml(critical_raw) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + disagrees = str(critical_answer).strip() != "None of the above" + + try: + critical_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": critical_raw, + "model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": critical_latency, + "input_tokens": critical_usage.get("prompt_tokens"), + "output_tokens": critical_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist critical run: {e}") + + results.append( + { + "question": q, + "criterion_key": criterion_key, + "screening": { + "run_id": screening_run_id, + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "parse_ok": screening_parsed.parse_ok, + }, + "critical": { + "run_id": critical_run_id, + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": 
critical_parsed.rationale, + "parse_ok": critical_parsed.parse_ok, + "disagrees": disagrees, + }, + } + ) + + return { + "status": "success", + "sr_id": sr_id, + "citation_id": citation_id, + "pipeline": "fulltext", + "criteria": results, + } + + +@router.get("/agent-runs/latest", response_model=AgentRunsQueryResponse) +async def get_latest_agent_runs( + sr_id: str, + pipeline: str, + citation_ids: str, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Fetch latest screening_agent_runs for a set of citations. + + Query params: + - sr_id: SR id + - pipeline: title_abstract | fulltext + - citation_ids: comma-separated citation ids + """ + + pipeline_norm = (pipeline or "").strip().lower() + if pipeline_norm in {"ta", "titleabstract", "title-abstract"}: + pipeline_norm = "title_abstract" + if pipeline_norm not in {"title_abstract", "fulltext"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="pipeline must be 'title_abstract' or 'fulltext'") + + raw_ids = [p.strip() for p in (citation_ids or "").split(",") if p.strip()] + if not raw_ids: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="citation_ids is required") + parsed_ids: List[int] = [] + for p in raw_ids: + try: + parsed_ids.append(int(p)) + except Exception: + continue + if not parsed_ids: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="citation_ids must be a comma-separated list of integers") + + try: + _sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + try: + rows = await run_in_threadpool( + cits_dp_service.list_latest_agent_runs, + sr_id=sr_id, + table_name=table_name, + citation_ids=parsed_ids, + pipeline=pipeline_norm, + ) + except RuntimeError as rexc: + 
raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to query screening_agent_runs: {e}") + + return AgentRunsQueryResponse(sr_id=sr_id, pipeline=pipeline_norm, citation_ids=parsed_ids, runs=rows) + async def update_inclusion_decision( sr: Dict[str, Any], citation_id: int, diff --git a/backend/api/services/cit_db_service.py b/backend/api/services/cit_db_service.py index 85371835..1e3f28e4 100644 --- a/backend/api/services/cit_db_service.py +++ b/backend/api/services/cit_db_service.py @@ -13,8 +13,15 @@ can surface a 503 with an actionable message. """ from typing import Any, Dict, List, Optional, Tuple -import psycopg2 -import psycopg2.extras + +# psycopg2 is optional in some deploy/test contexts. +# Per module docstring contract: methods should raise RuntimeError when psycopg2 +# is unavailable so routers can surface a 503. +try: + import psycopg2 # type: ignore + import psycopg2.extras # type: ignore +except Exception: # pragma: no cover + psycopg2 = None import json import re import os @@ -22,6 +29,8 @@ import csv import urllib.parse as up import hashlib +from datetime import datetime +import uuid # Local settings import (for POSTGRES_ADMIN_DSN / DATABASE_URL usage) try: @@ -145,6 +154,297 @@ def __init__(self): # nothing stateful for now; keep class for ergonomics and easier testing pass + def _require_psycopg2(self) -> None: + if psycopg2 is None: + raise RuntimeError( + "psycopg2 is not installed. Install backend dependencies (requirements.txt) " + "or run with the docker backend image." + ) + + # ----------------------- + # Schema helpers + # ----------------------- + def table_exists(self, table_name: str = "citations") -> bool: + """Return True if a public table exists. + + NOTE: We intentionally use runtime schema evolution (ALTER TABLE ...) 
+ throughout CAN-SR, so callers need a safe way to check existence before + attempting to add columns. + """ + table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute( + """ + SELECT 1 + FROM information_schema.tables + WHERE table_schema = 'public' AND table_name = %s + LIMIT 1 + """, + (table_name,), + ) + return cur.fetchone() is not None + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def ensure_step_validation_columns(self, table_name: str = "citations") -> None: + """Ensure step-level validation columns exist for a screening table. + + CAN-SR uses per-upload screening tables, so we create these columns on + those tables (not just a single shared citations table). + + This is intentionally NOT backwards-compatible: it will eagerly add the + columns to whatever table is passed. + """ + if not self.table_exists(table_name): + return + + # L1 (Title/Abstract) + self.create_column("l1_validated_by", "TEXT", table_name=table_name) + self.create_column("l1_validated_at", "TIMESTAMPTZ", table_name=table_name) + + # L2 (Full Text) + self.create_column("l2_validated_by", "TEXT", table_name=table_name) + self.create_column("l2_validated_at", "TIMESTAMPTZ", table_name=table_name) + + # Parameters / extraction + self.create_column("parameters_validated_by", "TEXT", table_name=table_name) + self.create_column("parameters_validated_at", "TIMESTAMPTZ", table_name=table_name) + + def ensure_screening_agent_runs_table(self) -> None: + """Ensure the normalized agent-run storage table exists. + + We keep it in the shared Postgres DB (public schema). Because CAN-SR uses + per-upload screening tables (each with its own id sequence), we store + both the `sr_id` and the screening `table_name` alongside `citation_id`. 
+ """ + conn = None + try: + self._require_psycopg2() + conn = postgres_server.conn + cur = conn.cursor() + + cur.execute( + """ + CREATE TABLE IF NOT EXISTS screening_agent_runs ( + id TEXT PRIMARY KEY, + sr_id TEXT NOT NULL, + table_name TEXT NOT NULL, + citation_id INT NOT NULL, + pipeline TEXT NOT NULL, + criterion_key TEXT NOT NULL, + stage TEXT NOT NULL, + answer TEXT, + confidence DOUBLE PRECISION, + rationale TEXT, + raw_response TEXT, + model TEXT, + prompt_version TEXT, + temperature DOUBLE PRECISION, + top_p DOUBLE PRECISION, + seed INT, + latency_ms INT, + input_tokens INT, + output_tokens INT, + cost_usd DOUBLE PRECISION, + created_at TIMESTAMPTZ DEFAULT now() + ) + """ + ) + + # A couple of pragmatic indexes for common lookups. + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_screening_agent_runs_citation + ON screening_agent_runs (sr_id, table_name, citation_id, pipeline) + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_screening_agent_runs_criterion + ON screening_agent_runs (sr_id, pipeline, criterion_key, stage) + """ + ) + + conn.commit() + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def ensure_agentic_screening_schema(self) -> None: + """One-call bootstrap for agentic screening. + + This is safe to call at startup (creates only global tables), and can + also be called by endpoints before use. + """ + self.ensure_screening_agent_runs_table() + + # ----------------------- + # Agent-run persistence + # ----------------------- + def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: + """Insert a single screening_agent_runs row. + + Expected keys (most optional): + - sr_id, table_name, citation_id, pipeline, criterion_key, stage + - answer, confidence, rationale, raw_response + - model, prompt_version, temperature, top_p, seed + - latency_ms, input_tokens, output_tokens, cost_usd + + Returns the generated run id. 
+ """ + self._require_psycopg2() + self.ensure_screening_agent_runs_table() + + run_id = str(run.get("id") or uuid.uuid4()) + sr_id = str(run.get("sr_id") or "") + table_name = str(run.get("table_name") or "") + citation_id = int(run.get("citation_id") or 0) + pipeline = str(run.get("pipeline") or "") + criterion_key = str(run.get("criterion_key") or "") + stage = str(run.get("stage") or "") + + if not (sr_id and table_name and citation_id and pipeline and criterion_key and stage): + raise ValueError("insert_screening_agent_run missing required fields") + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute( + """ + INSERT INTO screening_agent_runs ( + id, sr_id, table_name, citation_id, pipeline, criterion_key, stage, + answer, confidence, rationale, raw_response, + model, prompt_version, temperature, top_p, seed, + latency_ms, input_tokens, output_tokens, cost_usd, created_at + ) VALUES ( + %s, %s, %s, %s, %s, %s, %s, + %s, %s, %s, %s, + %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s + ) + """, + ( + run_id, + sr_id, + table_name, + citation_id, + pipeline, + criterion_key, + stage, + run.get("answer"), + run.get("confidence"), + run.get("rationale"), + run.get("raw_response"), + run.get("model"), + run.get("prompt_version"), + run.get("temperature"), + run.get("top_p"), + run.get("seed"), + run.get("latency_ms"), + run.get("input_tokens"), + run.get("output_tokens"), + run.get("cost_usd"), + run.get("created_at") or datetime.utcnow().isoformat() + "Z", + ), + ) + conn.commit() + return run_id + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def list_latest_agent_runs( + self, + *, + sr_id: str, + table_name: str, + citation_ids: List[int], + pipeline: str, + ) -> List[Dict[str, Any]]: + """Return latest agent runs per (citation_id, criterion_key, stage) for a set of citations. + + This is designed for list pages where we need to compute "needs validation" + without loading full raw responses. 
+ """ + self._require_psycopg2() + self.ensure_screening_agent_runs_table() + + sr_id = str(sr_id or "") + table_name = str(table_name or "") + pipeline = str(pipeline or "") + + ids: List[int] = [] + for i in citation_ids or []: + try: + ids.append(int(i)) + except Exception: + continue + if not (sr_id and table_name and pipeline and ids): + return [] + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + # DISTINCT ON picks the first row per group according to ORDER BY. + cur.execute( + """ + SELECT DISTINCT ON (citation_id, criterion_key, stage) + id, + sr_id, + table_name, + citation_id, + pipeline, + criterion_key, + stage, + answer, + confidence, + rationale, + model, + prompt_version, + temperature, + top_p, + seed, + latency_ms, + input_tokens, + output_tokens, + cost_usd, + created_at + FROM screening_agent_runs + WHERE sr_id = %s + AND table_name = %s + AND pipeline = %s + AND citation_id = ANY(%s) + ORDER BY citation_id, criterion_key, stage, created_at DESC + """, + (sr_id, table_name, pipeline, ids), + ) + + rows = cur.fetchall() or [] + return [dict(r) for r in rows if r] + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + # ----------------------- # Low level connection helpers # ----------------------- @@ -160,6 +460,7 @@ def create_column(self, col: str, col_type: str, table_name: str = "citations") col_type is the SQL type (e.g. TEXT, JSONB). """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -193,6 +494,7 @@ def update_jsonb_column( Update a JSONB column for a citation. Creates the column if needed. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -227,6 +529,7 @@ def update_text_column( Update a TEXT column for a citation. Creates the column if needed. 
""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -259,6 +562,7 @@ def update_bool_column( ) -> int: """Update a BOOLEAN column for a citation. Creates the column if needed.""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -284,6 +588,7 @@ def update_bool_column( def get_table_columns(self, table_name: str = "citations") -> List[Dict[str, str]]: """Return [{name, data_type, udt_name}] for table columns ordered by ordinal_position.""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -368,6 +673,7 @@ def copy_jsonb_if_empty( Intended for auto-filling human_* from llm_* while never overwriting. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -407,6 +713,7 @@ def dump_citations_csv(self, table_name: str = "citations") -> bytes: Uses Postgres COPY for correctness and performance. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -439,6 +746,7 @@ def dump_citations_csv_filtered(self, table_name: str = "citations") -> bytes: explicit scalar columns (selected/explanation/confidence/found/value/...). """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() # 1) Determine columns to export cols_meta = self.get_table_columns(table_name) @@ -596,6 +904,7 @@ def get_citation_by_id(self, citation_id: int, table_name: str = "citations") -> Return a dict mapping column -> value for the citation row, or None. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -638,6 +947,7 @@ def get_citations_by_ids( List[dict] rows. 
Missing ids are omitted. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() ids: List[int] = [] for i in citation_ids or []: try: @@ -691,6 +1001,7 @@ def backfill_human_decisions(self, criteria_parsed: Dict[str, Any], table_name: - undecided: any question missing/unanswered """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() cp = criteria_parsed or {} l1_qs = (cp.get("l1") or {}).get("questions") if isinstance(cp.get("l1"), dict) else None @@ -815,6 +1126,7 @@ def list_citation_ids(self, filter_step=None, table_name: str = "citations") -> Return list of integer primary keys (id) from citations table ordered by id. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -862,6 +1174,7 @@ def list_fulltext_urls(self, table_name: str = "citations") -> List[str]: Return list of fulltext_url values (non-null) from citations table. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -898,6 +1211,7 @@ def attach_fulltext( Creates columns if necessary. Returns rows modified (0/1). """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() # create columns if missing self.create_column("fulltext_url", "TEXT", table_name=table_name) # compute md5 @@ -929,6 +1243,7 @@ def get_column_value(self, citation_id: int, column: str, table_name: str = "cit Return the value stored in `column` for the citation row (or None). 
""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -968,6 +1283,7 @@ def set_column_value(self, citation_id: int, column: str, value: Any, table_name def drop_table(self, table_name: str, cascade: bool = True) -> None: """Drop a screening table in the shared database.""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -994,6 +1310,7 @@ def create_table_and_insert_sync( is per-upload (e.g. sr___citations) inside the shared DB. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -1009,6 +1326,15 @@ def create_table_and_insert_sync( col_defs.append('"fulltext_url" TEXT') col_defs.append('"fulltext" TEXT') col_defs.append('"fulltext_md5" TEXT') + + # Step-level validation fields (agentic screening plan) + col_defs.append('"l1_validated_by" TEXT') + col_defs.append('"l1_validated_at" TIMESTAMP WITH TIME ZONE') + col_defs.append('"l2_validated_by" TEXT') + col_defs.append('"l2_validated_at" TIMESTAMP WITH TIME ZONE') + col_defs.append('"parameters_validated_by" TEXT') + col_defs.append('"parameters_validated_at" TIMESTAMP WITH TIME ZONE') + col_defs.append('"created_at" TIMESTAMP WITH TIME ZONE DEFAULT now()') cols_sql = ", ".join(col_defs) diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml index 3cca1cbd..1a8567a7 100644 --- a/backend/docker-compose.yml +++ b/backend/docker-compose.yml @@ -47,6 +47,9 @@ services: # POSTGRESQL - Database (Citations & Systematic Reviews) # ============================================================================= pgdb-service: + # IMPORTANT: pin to a major version. + # Using `postgres` (latest) can auto-upgrade across major versions (e.g., 16 -> 18) + # and break existing on-disk data without a pg_upgrade/backup-restore. 
image: postgres container_name: pgdb-service restart: unless-stopped @@ -57,7 +60,7 @@ services: ports: - "5432:5432" volumes: - - ./volumes/postgres:/var/lib/postgresql/data + - ./volumes/postgres:/var/lib/postgresql healthcheck: test: ["CMD-SHELL", "pg_isready -U admin -d postgres -h localhost"] interval: 30s diff --git a/backend/main.py b/backend/main.py index 606dc55f..379553c5 100644 --- a/backend/main.py +++ b/backend/main.py @@ -14,6 +14,7 @@ from api.core.config import settings from api.services.sr_db_service import srdb_service from api.services.user_db import user_db_service +from api.services.cit_db_service import cits_dp_service app = FastAPI( @@ -45,6 +46,15 @@ async def startup_event(): except Exception as e: print(f"⚠️ Failed to ensure SR table exists: {e}", flush=True) + # Agentic screening schema bootstrap (no migrations; runtime schema evolution) + try: + print("🤖 Ensuring agentic screening tables...", flush=True) + await run_in_threadpool(cits_dp_service.ensure_agentic_screening_schema) + print("✓ Agentic screening tables initialized", flush=True) + except Exception as e: + # Do not fail startup; allow deployments without Postgres / in degraded mode. 
+ print(f"⚠️ Failed to ensure agentic screening tables: {e}", flush=True) + # Procrastinate schema + run-all job tables try: from api.jobs.procrastinate_app import ( diff --git a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx index 75721fb6..bb4dd22f 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx @@ -60,6 +60,16 @@ type CriteriaData = { possible_answers: string[][] } +type LatestAgentRun = { + citation_id: number + criterion_key: string + stage: 'screening' | 'critical' | string + answer?: string | null + confidence?: number | null + rationale?: string | null + created_at?: string +} + /* Main page component */ export default function CanSrL1ScreenPage() { const router = useRouter() @@ -91,6 +101,12 @@ export default function CanSrL1ScreenPage() { // Collapsible open state for LLM panels const [panelOpen, setPanelOpen] = useState>({}) + // Agentic runs (screening_agent_runs) for this citation + const [agentRuns, setAgentRuns] = useState([]) + const [loadingRuns, setLoadingRuns] = useState(false) + + const [validating, setValidating] = useState(false) + useEffect(() => { if (!srId || !citationId) { router.replace('/can-sr') @@ -159,6 +175,49 @@ export default function CanSrL1ScreenPage() { fetchCitationById(citationId) }, [srId, citationId]) + // Load latest agent runs for this citation (screening + critical per criterion) + useEffect(() => { + if (!srId || !citationId) return + const loadRuns = async () => { + setLoadingRuns(true) + try { + const headers = getAuthHeaders() + const res = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('title_abstract')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers }, + ) + const data = await res.json().catch(() => ({})) + if (res.ok && Array.isArray(data?.runs)) { + 
setAgentRuns(data.runs as LatestAgentRun[]) + } else { + setAgentRuns([]) + } + } catch { + setAgentRuns([]) + } finally { + setLoadingRuns(false) + } + } + loadRuns() + }, [srId, citationId]) + + const runsByCriterion = useMemo(() => { + const by: Record = {} + for (const r of agentRuns) { + const key = String((r as any)?.criterion_key || '') + if (!key) continue + if (!by[key]) by[key] = {} + const stage = String((r as any)?.stage || '') + if (stage === 'screening') by[key].screening = r + if (stage === 'critical') by[key].critical = r + } + return by + }, [agentRuns]) + // Load parsed criteria (L1) useEffect(() => { if (!srId) return @@ -438,6 +497,107 @@ export default function CanSrL1ScreenPage() { />
+ {/* Agentic summary + Validate */} +
+
+
+

Agentic results

+

+ Latest screening + critical runs per criterion. +

+
+
+ + + {citation?.l1_validated_by ? ( + + Validated by {String(citation.l1_validated_by)} + + ) : ( + Not validated + )} +
+
+ + {loadingRuns ? ( +
Loading agent runs…
+ ) : criteriaData?.questions?.length ? ( +
+ {criteriaData.questions.map((q, idx) => { + const criterionKey = q + ? q + .trim() + .toLowerCase() + .replace(/[^\w]+/g, '_') + .replace(/_+/g, '_') + .replace(/^_+|_+$/g, '') + .slice(0, 56) + : '' + + const r = runsByCriterion[criterionKey] || {} + const scr = r.screening + const crit = r.critical + + const critDisagrees = + crit && String((crit as any)?.answer || '').trim() !== '' && + String((crit as any)?.answer || '').trim() !== 'None of the above' + + return ( +
+
{q}
+
+
+
Screening
+
Answer: {String((scr as any)?.answer ?? '—')}
+
Confidence: {String((scr as any)?.confidence ?? '—')}
+
+
+
Critical
+
Answer: {String((crit as any)?.answer ?? '—')}
+
Confidence: {String((crit as any)?.confidence ?? '—')}
+ {critDisagrees ? ( +
Disagrees
+ ) : null} +
+
+
+ ) + })} +
+ ) : ( +
No criteria loaded yet.
+ )} +
+
{/* Workspace (left) */}
diff --git a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx index 914634da..fef5b4f6 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx @@ -63,6 +63,16 @@ type CriteriaData = { additional_infos?: (string | null)[] // optional per-question extra guidance when available } +type LatestAgentRun = { + citation_id: number + criterion_key: string + stage: 'screening' | 'critical' | string + answer?: string | null + confidence?: number | null + rationale?: string | null + created_at?: string +} + /* Main page component */ export default function CanSrL2ScreenViewPage() { const router = useRouter() @@ -98,6 +108,11 @@ export default function CanSrL2ScreenViewPage() { // Hint text from Title/Abstract screening for L1 questions const [hintByIndex, setHintByIndex] = useState>({}) + // Agentic runs (screening_agent_runs) for this citation + const [agentRuns, setAgentRuns] = useState([]) + const [loadingRuns, setLoadingRuns] = useState(false) + const [validating, setValidating] = useState(false) + // Fulltext PDF viewer linkage const [fulltextCoords, setFulltextCoords] = useState(null) const [fulltextPages, setFulltextPages] = useState<{ width: number; height: number }[] | null>(null) @@ -218,6 +233,49 @@ export default function CanSrL2ScreenViewPage() { fetchCitationById(citationId) }, [srId, citationId]) + // Load latest agent runs for this citation (screening + critical per criterion) + useEffect(() => { + if (!srId || !citationId) return + const loadRuns = async () => { + setLoadingRuns(true) + try { + const headers = getAuthHeaders() + const res = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('fulltext')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers }, + ) + const data = await res.json().catch(() => ({})) + if (res.ok && 
Array.isArray(data?.runs)) { + setAgentRuns(data.runs as LatestAgentRun[]) + } else { + setAgentRuns([]) + } + } catch { + setAgentRuns([]) + } finally { + setLoadingRuns(false) + } + } + loadRuns() + }, [srId, citationId]) + + const runsByCriterion = useMemo(() => { + const by: Record = {} + for (const r of agentRuns) { + const key = String((r as any)?.criterion_key || '') + if (!key) continue + if (!by[key]) by[key] = {} + const stage = String((r as any)?.stage || '') + if (stage === 'screening') by[key].screening = r + if (stage === 'critical') by[key].critical = r + } + return by + }, [agentRuns]) + // Load parsed criteria (L1 + L2 merged, L1 first) useEffect(() => { if (!srId) return @@ -616,6 +674,115 @@ export default function CanSrL2ScreenViewPage() { />
+ {/* Agentic summary + Validate */} +
+
+
+

Agentic results

+

+ Latest screening + critical runs for L2/fulltext per criterion. +

+
+
+ + + {citation?.l2_validated_by ? ( + + Validated by {String(citation.l2_validated_by)} + + ) : ( + Not validated + )} +
+
+ + {loadingRuns ? ( +
Loading agent runs…
+ ) : criteriaData?.questions?.length ? ( +
+ {criteriaData.questions + .map((q, idx) => ({ q, idx })) + .filter(({ idx }) => sourceFlags[idx] === 'l2') + .map(({ q, idx }) => { + const criterionKey = q + ? q + .trim() + .toLowerCase() + .replace(/[^\w]+/g, '_') + .replace(/_+/g, '_') + .replace(/^_+|_+$/g, '') + .slice(0, 56) + : '' + + const r = runsByCriterion[criterionKey] || {} + const scr = r.screening + const crit = r.critical + + const critDisagrees = + crit && + String((crit as any)?.answer || '').trim() !== '' && + String((crit as any)?.answer || '').trim() !== 'None of the above' + + return ( +
+
{q}
+
+
+
Screening
+
Answer: {String((scr as any)?.answer ?? '—')}
+
Confidence: {String((scr as any)?.confidence ?? '—')}
+
+
+
Critical
+
Answer: {String((crit as any)?.answer ?? '—')}
+
Confidence: {String((crit as any)?.confidence ?? '—')}
+ {critDisagrees ? ( +
Disagrees
+ ) : null} +
+
+
+ ) + })} +
+ ) : ( +
No criteria loaded yet.
+ )} +
+
{/* Workspace (left) */}
diff --git a/frontend/app/api/can-sr/screen/agent-runs/latest/route.ts b/frontend/app/api/can-sr/screen/agent-runs/latest/route.ts new file mode 100644 index 00000000..4fb1b1b9 --- /dev/null +++ b/frontend/app/api/can-sr/screen/agent-runs/latest/route.ts @@ -0,0 +1,67 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: GET /api/can-sr/screen/agent-runs/latest?sr_id=&pipeline=title_abstract&citation_ids=1,2,3 + * -> GET {BACKEND_URL}/api/screen/agent-runs/latest?sr_id=...&pipeline=...&citation_ids=... + */ + +export async function OPTIONS() { + return new Response(null, { + status: 204, + headers: { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET,OPTIONS', + 'Access-Control-Allow-Headers': 'Authorization, Content-Type', + }, + }) +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + const pipeline = params.get('pipeline') + const citationIds = params.get('citation_ids') + + if (!srId || !pipeline || !citationIds) { + return NextResponse.json( + { error: 'sr_id, pipeline, citation_ids are required' }, + { status: 400 }, + ) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const url = new URL(`${BACKEND_URL}/api/screen/agent-runs/latest`) + url.searchParams.set('sr_id', srId) + url.searchParams.set('pipeline', pipeline) + url.searchParams.set('citation_ids', citationIds) + + const res = await fetch(url.toString(), { + method: 'GET', + headers: { + Authorization: authHeader, + }, + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('Agent runs latest proxy GET error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/validate/route.ts b/frontend/app/api/can-sr/screen/validate/route.ts new file mode 100644 index 00000000..5ea0e153 --- /dev/null +++ b/frontend/app/api/can-sr/screen/validate/route.ts @@ -0,0 +1,44 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: POST /api/can-sr/screen/validate + * body: { sr_id, citation_id, step } + * -> POST {BACKEND_URL}/api/screen/validate + */ + +export async function POST(request: NextRequest) { + try { + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const body = await request.json().catch(() => ({})) + + const res = await fetch(`${BACKEND_URL}/api/screen/validate`, { + method: 'POST', + headers: { + Authorization: authHeader, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(body), + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('Validate proxy POST error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/components/can-sr/PagedList.tsx b/frontend/components/can-sr/PagedList.tsx index c0ab7a05..b0417b50 100644 --- a/frontend/components/can-sr/PagedList.tsx +++ b/frontend/components/can-sr/PagedList.tsx @@ -14,6 +14,15 @@ type CitationInfo = { pageview: string } +type LatestAgentRun = { + citation_id: number + criterion_key: string + stage: 'screening' | 'critical' | string + answer?: string | null + confidence?: number | null + created_at?: string +} + function getAuthHeaders(): Record { const token = getAuthToken() const tokenType = getTokenType() @@ -53,13 +62,19 @@ export default function PagedList({ ) const [showClassify, setShowClassify] = useState>({}) + // TA list controls + const [threshold, setThreshold] = useState(0.9) + const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + + const [latestRunsByCitation, setLatestRunsByCitation] = useState>({}) + const fileInputRefs = useRef>({}) // --- paging --- useEffect(() => { const lp = Math.max(1, Math.ceil((citationIds?.length || 0) / pageSize)) setLastpage(lp) - setpage((prev) => Math.min(Math.max(1, prev), lp)) + setpage((prev: number) => Math.min(Math.max(1, prev), lp)) }, [citationIds, pageSize]) useEffect(() => { @@ -112,12 +127,102 @@ export default function PagedList({ if (row?.fulltext_url) nextShow[id] = true } - setLlmClassified((prev) => ({ ...prev, ...nextLlm })) - setHumanVerified((prev) => ({ ...prev, ...nextHuman })) - setShowClassify((prev) => ({ ...prev, ...nextShow })) + setLlmClassified((prev: Record) => ({ ...prev, ...nextLlm })) + setHumanVerified((prev: Record) => ({ ...prev, ...nextHuman })) + setShowClassify((prev: Record) => ({ 
...prev, ...nextShow })) + + // Fetch latest agent runs for this page (L1=title_abstract, L2=fulltext) + try { + const shouldFetchRuns = (screeningStep === 'l1' || screeningStep === 'l2') && pageIds.length + if (shouldFetchRuns) { + const pipeline = screeningStep === 'l2' ? 'fulltext' : 'title_abstract' + const r2 = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent(srId)}&pipeline=${encodeURIComponent( + pipeline, + )}&citation_ids=${encodeURIComponent(pageIds.join(','))}`, + { method: 'GET', headers }, + ) + const j2 = await r2.json().catch(() => ({})) + if (r2.ok && Array.isArray(j2?.runs)) { + const grouped: Record = {} + for (const run of j2.runs as LatestAgentRun[]) { + const cid = Number((run as any)?.citation_id) + if (!Number.isFinite(cid)) continue + if (!grouped[cid]) grouped[cid] = [] + grouped[cid].push(run) + } + setLatestRunsByCitation((prev: Record) => ({ ...prev, ...grouped })) + } + } + } catch (e) { + // best-effort + } } fetchCitations() - }, [citationIds, page, pageSize, questions, srId]) + }, [citationIds, page, pageSize, questions, srId, screeningStep]) + + // Reset cached runs when switching steps (avoid mixing l1/l2 pipeline results) + useEffect(() => { + setLatestRunsByCitation({}) + }, [screeningStep]) + + const isValidatedForStep = (row: any): boolean => { + if (!row) return false + if (screeningStep === 'l1') return Boolean(row?.l1_validated_by) + if (screeningStep === 'l2') return Boolean(row?.l2_validated_by) + if (screeningStep === 'extract') return Boolean(row?.parameters_validated_by) + return false + } + + const computeNeedsValidation = (citationId: number, row: any): boolean => { + // If validated, it no longer “needs validation” + if (isValidatedForStep(row)) return false + + const runs = latestRunsByCitation[citationId] || [] + if (!runs.length) { + // No agent runs yet => should be in "unvalidated" but not necessarily "needs" + // We'll treat missing runs as "needs" so it's easy to find. 
+ return true + } + + // Group by criterion_key + const byKey: Record = {} + for (const r of runs) { + const key = String((r as any)?.criterion_key || '') + if (!key) continue + if (!byKey[key]) byKey[key] = [] + byKey[key].push(r) + } + + // Needs validation if ANY criterion is low confidence OR critical disagrees + for (const key of Object.keys(byKey)) { + const items = byKey[key] + const screening = items.find((x) => String((x as any)?.stage) === 'screening') + const critical = items.find((x) => String((x as any)?.stage) === 'critical') + + const conf = Number((screening as any)?.confidence) + if (Number.isFinite(conf) && conf < threshold) return true + + const criticalAns = String((critical as any)?.answer || '') + // In our critical prompt contract, agreement is "None of the above". + if (critical && criticalAns.trim() !== '' && criticalAns.trim() !== 'None of the above') return true + } + + return false + } + + const filteredCitationData = citationData.filter((row: any) => { + const id = Number(row?.id) + if (!Number.isFinite(id)) return false + const validated = isValidatedForStep(row) + const needs = computeNeedsValidation(id, row) + const unvalidated = !validated + if (filterMode === 'all') return true + if (filterMode === 'validated') return validated + if (filterMode === 'unvalidated') return unvalidated + if (filterMode === 'needs') return needs + return true + }) // NOTE: Previously we fetched each citation via /citations/get. // This is now replaced by a single /citations/batch call per page. 
@@ -156,7 +261,7 @@ export default function PagedList({ { method: 'POST', headers, body: JSON.stringify(bodyPayload) }, ) } - setLlmClassified((prev) => ({ ...prev, [id]: true })) + setLlmClassified((prev: Record) => ({ ...prev, [id]: true })) } const onChooseFile = (id: number) => { @@ -196,16 +301,56 @@ export default function PagedList({ { method: 'POST', headers, body: fd as any }, ) - setShowClassify((prev) => ({ ...prev, [id]: true })) + setShowClassify((prev: Record) => ({ ...prev, [id]: true })) } return (
+ {screeningStep === 'l1' || screeningStep === 'l2' ? ( +
+
+ + ) => { + const v = Number(e.target.value) + if (!Number.isFinite(v)) return + setThreshold(Math.max(0, Math.min(1, v))) + }} + className="w-24 rounded-md border border-gray-200 px-2 py-1 text-sm" + /> +
+ +
+ + +
+
+ ) : null} +
    - {citationData.map((data) => ( + {filteredCitationData.map((data: any) => (
  • Citation #{data.id}

    @@ -334,7 +479,7 @@ export default function PagedList({ setJumpPageInput(e.target.value)} + onChange={(e: React.ChangeEvent) => setJumpPageInput(e.target.value)} className="w-20 rounded-md border border-gray-200 px-2 py-1 text-sm" placeholder={String(page)} inputMode="numeric" diff --git a/frontend/package-lock.json b/frontend/package-lock.json index a8237aa6..ae2deb78 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1022,10 +1022,9 @@ } }, "node_modules/@next/env": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.6.tgz", - "integrity": "sha512-N1ySLuZjnAtN3kFnwhAwPvZah8RJxKasD7x1f8shFqhncnWZn4JMfg37diLNuoHsLAlrDfM3g4mawVdtAG8XLQ==", - "license": "MIT" + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/env/-/env-16.2.2.tgz", + "integrity": "sha512-LqSGz5+xGk9EL/iBDr2yo/CgNQV6cFsNhRR2xhSXYh7B/hb4nePCxlmDvGEKG30NMHDFf0raqSyOZiQrO7BkHQ==" }, "node_modules/@next/eslint-plugin-next": { "version": "15.5.9", @@ -1037,13 +1036,12 @@ } }, "node_modules/@next/swc-darwin-arm64": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.6.tgz", - "integrity": "sha512-wTzYulosJr/6nFnqGW7FrG3jfUUlEf8UjGA0/pyypJl42ExdVgC6xJgcXQ+V8QFn6niSG2Pb8+MIG1mZr2vczw==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.2.2.tgz", + "integrity": "sha512-B92G3ulrwmkDSEJEp9+XzGLex5wC1knrmCSIylyVeiAtCIfvEJYiN3v5kXPlYt5R4RFlsfO/v++aKV63Acrugg==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "darwin" @@ -1053,13 +1051,12 @@ } }, "node_modules/@next/swc-darwin-x64": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.6.tgz", - "integrity": "sha512-BLFPYPDO+MNJsiDWbeVzqvYd4NyuRrEYVB5k2N3JfWncuHAy2IVwMAOlVQDFjj+krkWzhY2apvmekMkfQR0CUQ==", + "version": "16.2.2", + "resolved": 
"https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.2.2.tgz", + "integrity": "sha512-7ZwSgNKJNQiwW0CKhNm9B1WS2L1Olc4B2XY0hPYCAL3epFnugMhuw5TMWzMilQ3QCZcCHoYm9NGWTHbr5REFxw==", "cpu": [ "x64" ], - "license": "MIT", "optional": true, "os": [ "darwin" @@ -1069,13 +1066,12 @@ } }, "node_modules/@next/swc-linux-arm64-gnu": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.6.tgz", - "integrity": "sha512-OJYkCd5pj/QloBvoEcJ2XiMnlJkRv9idWA/j0ugSuA34gMT6f5b7vOiCQHVRpvStoZUknhl6/UxOXL4OwtdaBw==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.2.2.tgz", + "integrity": "sha512-c3m8kBHMziMgo2fICOP/cd/5YlrxDU5YYjAJeQLyFsCqVF8xjOTH/QYG4a2u48CvvZZSj1eHQfBCbyh7kBr30Q==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1085,13 +1081,12 @@ } }, "node_modules/@next/swc-linux-arm64-musl": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.6.tgz", - "integrity": "sha512-S4J2v+8tT3NIO9u2q+S0G5KdvNDjXfAv06OhfOzNDaBn5rw84DGXWndOEB7d5/x852A20sW1M56vhC/tRVbccQ==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.2.2.tgz", + "integrity": "sha512-VKLuscm0P/mIfzt+SDdn2+8TNNJ7f0qfEkA+az7OqQbjzKdBxAHs0UvuiVoCtbwX+dqMEL9U54b5wQ/aN3dHeg==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1101,13 +1096,12 @@ } }, "node_modules/@next/swc-linux-x64-gnu": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.6.tgz", - "integrity": "sha512-2eEBDkFlMMNQnkTyPBhQOAyn2qMxyG2eE7GPH2WIDGEpEILcBPI/jdSv4t6xupSP+ot/jkfrCShLAa7+ZUPcJQ==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.2.2.tgz", + "integrity": 
"sha512-kU3OPHJq6sBUjOk7wc5zJ7/lipn8yGldMoAv4z67j6ov6Xo/JvzA7L7LCsyzzsXmgLEhk3Qkpwqaq/1+XpNR3g==", "cpu": [ "x64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1117,13 +1111,12 @@ } }, "node_modules/@next/swc-linux-x64-musl": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.6.tgz", - "integrity": "sha512-oicJwRlyOoZXVlxmIMaTq7f8pN9QNbdes0q2FXfRsPhfCi8n8JmOZJm5oo1pwDaFbnnD421rVU409M3evFbIqg==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.2.2.tgz", + "integrity": "sha512-CKXRILyErMtUftp+coGcZ38ZwE/Aqq45VMCcRLr2I4OXKrgxIBDXHnBgeX/UMil0S09i2JXaDL3Q+TN8D/cKmg==", "cpu": [ "x64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1133,13 +1126,12 @@ } }, "node_modules/@next/swc-win32-arm64-msvc": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.6.tgz", - "integrity": "sha512-gQmm8izDTPgs+DCWH22kcDmuUp7NyiJgEl18bcr8irXA5N2m2O+JQIr6f3ct42GOs9c0h8QF3L5SzIxcYAAXXw==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.2.2.tgz", + "integrity": "sha512-sS/jSk5VUoShUqINJFvNjVT7JfR5ORYj/+/ZpOYbbIohv/lQfduWnGAycq2wlknbOql2xOR0DoV0s6Xfcy49+g==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "win32" @@ -1149,13 +1141,12 @@ } }, "node_modules/@next/swc-win32-x64-msvc": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.6.tgz", - "integrity": "sha512-NRfO39AIrzBnixKbjuo2YiYhB6o9d8v/ymU9m/Xk8cyVk+k7XylniXkHwjs4s70wedVffc6bQNbufk5v0xEm0A==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.2.2.tgz", + "integrity": "sha512-aHaKceJgdySReT7qeck5oShucxWRiiEuwCGK8HHALe6yZga8uyFpLkPgaRw3kkF04U7ROogL/suYCNt/+CuXGA==", "cpu": [ "x64" ], - 
"license": "MIT", "optional": true, "os": [ "win32" @@ -2823,11 +2814,10 @@ } }, "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", - "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.3.tgz", + "integrity": "sha512-MCV/fYJEbqx68aE58kv2cA/kiky1G8vux3OR6/jbS+jIMe/6fJWa0DTzJU7dqijOWYwHi1t29FlfYI9uytqlpA==", "dev": true, - "license": "MIT", "dependencies": { "balanced-match": "^1.0.0" } @@ -3587,11 +3577,10 @@ } }, "node_modules/brace-expansion": { - "version": "1.1.12", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", - "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", + "version": "1.1.13", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.13.tgz", + "integrity": "sha512-9ZLprWS6EENmhEOpjCYW2c8VkmOvckIJZfkr7rBW6dObmfgJ/L1GpSYW5Hpo9lDz4D1+n0Ckz8rU7FwHDQiG/w==", "dev": true, - "license": "MIT", "dependencies": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -4863,11 +4852,10 @@ } }, "node_modules/flatted": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.3.tgz", - "integrity": "sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==", - "dev": true, - "license": "ISC" + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", + "integrity": "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==", + "dev": true }, "node_modules/for-each": { "version": "0.3.5", @@ -7350,14 +7338,13 @@ } }, "node_modules/next": { - "version": "16.1.6", - "resolved": 
"https://registry.npmjs.org/next/-/next-16.1.6.tgz", - "integrity": "sha512-hkyRkcu5x/41KoqnROkfTm2pZVbKxvbZRuNvKXLRXxs3VfyO0WhY50TQS40EuKO9SW3rBj/sF3WbVwDACeMZyw==", - "license": "MIT", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/next/-/next-16.2.2.tgz", + "integrity": "sha512-i6AJdyVa4oQjyvX/6GeER8dpY/xlIV+4NMv/svykcLtURJSy/WzDnnUk/TM4d0uewFHK7xSQz4TbIwPgjky+3A==", "dependencies": { - "@next/env": "16.1.6", + "@next/env": "16.2.2", "@swc/helpers": "0.5.15", - "baseline-browser-mapping": "^2.8.3", + "baseline-browser-mapping": "^2.9.19", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" @@ -7369,15 +7356,15 @@ "node": ">=20.9.0" }, "optionalDependencies": { - "@next/swc-darwin-arm64": "16.1.6", - "@next/swc-darwin-x64": "16.1.6", - "@next/swc-linux-arm64-gnu": "16.1.6", - "@next/swc-linux-arm64-musl": "16.1.6", - "@next/swc-linux-x64-gnu": "16.1.6", - "@next/swc-linux-x64-musl": "16.1.6", - "@next/swc-win32-arm64-msvc": "16.1.6", - "@next/swc-win32-x64-msvc": "16.1.6", - "sharp": "^0.34.4" + "@next/swc-darwin-arm64": "16.2.2", + "@next/swc-darwin-x64": "16.2.2", + "@next/swc-linux-arm64-gnu": "16.2.2", + "@next/swc-linux-arm64-musl": "16.2.2", + "@next/swc-linux-x64-gnu": "16.2.2", + "@next/swc-linux-x64-musl": "16.2.2", + "@next/swc-win32-arm64-msvc": "16.2.2", + "@next/swc-win32-x64-msvc": "16.2.2", + "sharp": "^0.34.5" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", @@ -7701,11 +7688,10 @@ "license": "ISC" }, "node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", + "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", "dev": true, - "license": "MIT", "engines": { "node": 
">=8.6" }, @@ -8973,11 +8959,10 @@ } }, "node_modules/tinyglobby/node_modules/picomatch": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.2.tgz", - "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "dev": true, - "license": "MIT", "engines": { "node": ">=12" }, From 94e748a7269e07fdd7ebdae373270aeae2c35655 Mon Sep 17 00:00:00 2001 From: bing1100 Date: Mon, 13 Apr 2026 23:18:47 -0400 Subject: [PATCH 2/3] fixing up metrics --- backend/api/screen/router.py | 410 +++++++++++++++++- backend/api/services/sr_db_service.py | 61 ++- backend/api/sr/router.py | 77 ++++ .../app/[lang]/can-sr/l1-screen/view/page.tsx | 260 ++++++----- .../app/[lang]/can-sr/l2-screen/view/page.tsx | 132 ++++-- .../api/can-sr/reviews/thresholds/route.ts | 57 +++ .../app/api/can-sr/screen/metrics/route.ts | 59 +++ .../components/can-sr/CitationListPage.tsx | 351 ++++++++++----- frontend/components/can-sr/PagedList.tsx | 112 ++++- .../can-sr/ScreeningMetricsPanel.tsx | 214 +++++++++ 10 files changed, 1451 insertions(+), 282 deletions(-) create mode 100644 frontend/app/api/can-sr/reviews/thresholds/route.ts create mode 100644 frontend/app/api/can-sr/screen/metrics/route.ts create mode 100644 frontend/components/can-sr/ScreeningMetricsPanel.tsx diff --git a/backend/api/screen/router.py b/backend/api/screen/router.py index 400ea9b0..3e7f27da 100644 --- a/backend/api/screen/router.py +++ b/backend/api/screen/router.py @@ -41,6 +41,33 @@ class AgentRunsQueryResponse(BaseModel): runs: List[Dict[str, Any]] +class ScreeningMetricsCriterion(BaseModel): + criterion_key: str + label: str + threshold: float + total_citations: int + has_run_count: int + low_confidence_count: int + 
critical_disagreement_count: int + confident_exclude_count: int + needs_human_review_count: int + + +class ScreeningMetricsSummary(BaseModel): + step: str + total_citations: int + validated_all: int + unvalidated_all: int + validated_needs_review: int + unvalidated_needs_review: int + needs_review_total: int + + +class ScreeningMetricsResponse(BaseModel): + sr_id: str + steps: Dict[str, Any] + + def _normalize_int_list(v: Any) -> List[int]: if v is None: return [] @@ -115,6 +142,91 @@ class ValidateStepRequest(BaseModel): sr_id: str = Field(..., description="Systematic review id") citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") step: str = Field("l1", description="Validation step: l1|l2|parameters") + checked: bool = Field(True, description="If true, add/update the current user's validation; if false, remove it") + + +def _as_validation_list(v: Any) -> List[Dict[str, str]]: + """Normalize DB values into a list of {user, validated_at} dicts.""" + + if v is None: + return [] + + # JSONB may come back as a list already; some deployments may return it as string. 
+ if isinstance(v, str): + try: + v = json.loads(v) + except Exception: + return [] + + if not isinstance(v, list): + return [] + + out: List[Dict[str, str]] = [] + for item in v: + if not isinstance(item, dict): + continue + user = item.get("user") or item.get("email") or item.get("validated_by") + ts = item.get("validated_at") or item.get("timestamp") or item.get("validatedAt") + if not user: + continue + out.append({"user": str(user), "validated_at": str(ts or "")}) + return out + + +def _dedupe_validations(items: List[Dict[str, str]]) -> List[Dict[str, str]]: + """Keep only one entry per user, keeping the latest timestamp lexicographically (ISO8601).""" + + by_user: Dict[str, Dict[str, str]] = {} + for it in items or []: + user = str(it.get("user") or "").strip() + if not user: + continue + cur = by_user.get(user) + if not cur: + by_user[user] = {"user": user, "validated_at": str(it.get("validated_at") or "")} + continue + # Prefer newest timestamp (ISO strings compare in chronological order) + if str(it.get("validated_at") or "") >= str(cur.get("validated_at") or ""): + by_user[user] = {"user": user, "validated_at": str(it.get("validated_at") or "")} + + # Return newest-first for nicer UI (most recent first) + return sorted(by_user.values(), key=lambda x: str(x.get("validated_at") or ""), reverse=True) + + +def _is_disagreeing_critical_answer(ans: Any) -> bool: + """Return True if critical stage indicates disagreement. + + Contract: agreement is encoded as "None of the above". + Any non-empty answer other than that is treated as critical disagreement. + """ + + s = str(ans or "").strip() + if not s: + return False + return s != "None of the above" + + +def _is_exclude_answer(ans: Any) -> bool: + """Detect exclude answers by convention: contains '(exclude)' (case-insensitive).""" + + s = str(ans or "") + return "(exclude)" in s.lower() + + +def _criterion_key_from_question(question: str) -> str: + # Keep in sync with the frontend derivation in l2-screen view. 
+ q = str(question or "") + try: + # Prefer shared helper when available. + return str(snake_case(q, max_len=56)) + except Exception: + # Fallback: lowercase, non-word -> underscore, collapse underscores. + s = q.strip().lower() + s = re.sub(r"[^\w]+", "_", s) + s = re.sub(r"_+", "_", s) + s = re.sub(r"^_+|_+$", "", s) + return s[:56] + class FulltextRunRequest(BaseModel): @@ -674,6 +786,7 @@ async def validate_screening_step( sr_id = str(payload.sr_id) citation_id = int(payload.citation_id) step = (payload.step or "l1").lower().strip() + checked = bool(payload.checked) if step not in {"l1", "l2", "parameters"}: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be one of: l1, l2, parameters") @@ -687,24 +800,66 @@ async def validate_screening_step( table_name = (screening or {}).get("table_name") or "citations" - validated_by_col = f"{step}_validated_by" - validated_at_col = f"{step}_validated_at" - validated_by = str(current_user.get("email") or current_user.get("id") or "") + # New storage: per-step validations list (JSONB) + validations_col = f"{step}_validations" + validated_by_col = f"{step}_validated_by" # legacy summary + validated_at_col = f"{step}_validated_at" # legacy summary + + user_email = str(current_user.get("email") or current_user.get("id") or "").strip() now_iso = datetime.utcnow().isoformat() + "Z" try: # Ensure columns exist (best-effort; no-migrations philosophy) + await run_in_threadpool(cits_dp_service.create_column, validations_col, "JSONB", table_name) await run_in_threadpool(cits_dp_service.create_column, validated_by_col, "TEXT", table_name) await run_in_threadpool(cits_dp_service.create_column, validated_at_col, "TIMESTAMPTZ", table_name) - u1 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_by_col, validated_by, table_name) - u2 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_at_col, now_iso, table_name) + # Load row to get existing 
validations list + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + if not row: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found") + + existing = _as_validation_list(row.get(validations_col)) + + if checked: + # Upsert (replace existing entry for this user with new timestamp) + existing = [x for x in existing if str(x.get("user") or "") != user_email] + existing.append({"user": user_email, "validated_at": now_iso}) + else: + # Remove + existing = [x for x in existing if str(x.get("user") or "") != user_email] + + normalized = _dedupe_validations(existing) + + u_list = await run_in_threadpool( + cits_dp_service.update_jsonb_column, + citation_id, + validations_col, + normalized, + table_name, + ) + + # Keep legacy summary fields in sync for existing UI/components: + # - if list empty => NULL out by/at + # - else => most recent validation + if not normalized: + await run_in_threadpool(cits_dp_service.clear_columns, citation_id, [validated_by_col, validated_at_col], table_name) + summary_by = None + summary_at = None + else: + summary_by = normalized[0].get("user") + summary_at = normalized[0].get("validated_at") + await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_by_col, str(summary_by or ""), table_name) + await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_at_col, str(summary_at or ""), table_name) + + except HTTPException: + raise except RuntimeError as rexc: raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) except Exception as e: raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to update validation fields: {e}") - if not (u1 and u2): + if not u_list: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found to update") return { @@ -712,8 +867,12 @@ async def validate_screening_step( "sr_id": sr_id, "citation_id": 
citation_id, "step": step, - "validated_by": validated_by, - "validated_at": now_iso, + "checked": checked, + "user": user_email, + "validated_at": now_iso if checked else None, + "validations": normalized, + "summary_validated_by": summary_by, + "summary_validated_at": summary_at, } @@ -1061,6 +1220,241 @@ async def get_latest_agent_runs( return AgentRunsQueryResponse(sr_id=sr_id, pipeline=pipeline_norm, citation_ids=parsed_ids, runs=rows) + +@router.get("/metrics", response_model=ScreeningMetricsResponse) +async def get_screening_metrics( + sr_id: str, + step: str = "l1", + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Return per-criterion metrics + validation summaries for a screening step. + + - Each criterion uses its own threshold (from SR.screening_thresholds[step][criterion_key]). + - Needs-human-review logic: + 1) If ANY criterion is a confident exclude => no human review needed for the citation. + 2) Else if ANY criterion has critical disagreement => needs review. + 3) Else if ANY criterion is low confidence (below its threshold) => needs review. 
+ """ + + step_norm = str(step or "l1").lower().strip() + if step_norm not in {"l1", "l2"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be l1 or l2") + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + # Criteria questions for step + cp = sr.get("criteria_parsed") or {} + crit_block = cp.get(step_norm) if isinstance(cp, dict) else None + questions = (crit_block or {}).get("questions") if isinstance(crit_block, dict) else [] + questions = questions if isinstance(questions, list) else [] + + # Threshold map + sr_thresholds = sr.get("screening_thresholds") or {} + step_thresholds = sr_thresholds.get(step_norm) if isinstance(sr_thresholds, dict) else None + step_thresholds = step_thresholds if isinstance(step_thresholds, dict) else {} + + # Build criterion list (key + label + threshold) + criteria: List[Dict[str, Any]] = [] + for q in questions: + if not isinstance(q, str) or not q.strip(): + continue + ck = _criterion_key_from_question(q) + thr_raw = step_thresholds.get(ck) + try: + thr = float(thr_raw) + thr = max(0.0, min(1.0, thr)) + except Exception: + thr = 0.9 + criteria.append({"criterion_key": ck, "label": q, "threshold": thr}) + + # Pull all citation ids for this step (L2 list is filtered by human_l1_decision include) + filter_step = "" + if step_norm == "l2": + filter_step = "l1" + try: + ids = await run_in_threadpool(cits_dp_service.list_citation_ids, filter_step, table_name) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to list citations: {e}") + + # Load only columns we need + needed_cols: List[str] = ["id"] + validations_col = f"{step_norm}_validations" + legacy_validated_by = 
f"{step_norm}_validated_by" + + needed_cols.extend([validations_col, legacy_validated_by]) + + # We'll compute per-citation needs-review based on agent runs only. + # Fetch latest runs for all citations (bulk query using service helper) + pipeline_norm = "title_abstract" if step_norm == "l1" else "fulltext" + try: + runs = await run_in_threadpool( + cits_dp_service.list_latest_agent_runs, + sr_id=sr_id, + table_name=table_name, + citation_ids=ids, + pipeline=pipeline_norm, + ) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load agent runs: {e}") + + # Group runs by citation then criterion + runs_by_cit: Dict[int, Dict[str, Dict[str, Dict[str, Any]]]] = {} + for r in runs or []: + try: + cid = int(r.get("citation_id")) + except Exception: + continue + ck = str(r.get("criterion_key") or "") + stg = str(r.get("stage") or "") + if not ck or stg not in {"screening", "critical"}: + continue + if cid not in runs_by_cit: + runs_by_cit[cid] = {} + if ck not in runs_by_cit[cid]: + runs_by_cit[cid][ck] = {} + runs_by_cit[cid][ck][stg] = r + + # Load citation rows for validations (and to know total citations count) + # If ids huge, this could be heavy; acceptable for now, can paginate later. + try: + rows = await run_in_threadpool(cits_dp_service.get_citations_by_ids, ids, table_name, needed_cols) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load citation rows: {e}") + + # Helper: is validated? 
+ def _is_validated(row: Dict[str, Any]) -> bool: + v = row.get(validations_col) + if v: + try: + parsed = v + if isinstance(v, str): + parsed = json.loads(v) + if isinstance(parsed, list) and len(parsed) > 0: + return True + except Exception: + pass + return bool(row.get(legacy_validated_by)) + + # Per-criterion aggregates + agg: Dict[str, Dict[str, int]] = {} + for c in criteria: + ck = c["criterion_key"] + agg[ck] = { + "total_citations": 0, + "has_run_count": 0, + "low_confidence_count": 0, + "critical_disagreement_count": 0, + "confident_exclude_count": 0, + "needs_human_review_count": 0, + } + + total_citations = 0 + validated_all = 0 + needs_review_total = 0 + validated_needs_review = 0 + + # Iterate citations and compute needs-review + per-criterion counts + for row in rows or []: + try: + cid = int(row.get("id")) + except Exception: + continue + total_citations += 1 + validated = _is_validated(row) + if validated: + validated_all += 1 + + per_crit = runs_by_cit.get(cid, {}) + + # Evaluate confident exclude override + has_confident_exclude = False + has_critical_disagreement = False + has_low_confidence = False + + for c in criteria: + ck = c["criterion_key"] + thr = float(c["threshold"]) + a = agg.get(ck) + if a is None: + continue + a["total_citations"] += 1 + + rpair = per_crit.get(ck) or {} + scr = rpair.get("screening") + crit = rpair.get("critical") + + if scr: + a["has_run_count"] += 1 + conf = scr.get("confidence") + try: + conf_f = float(conf) + except Exception: + conf_f = None + ans = scr.get("answer") + + if conf_f is not None and conf_f < thr: + a["low_confidence_count"] += 1 + has_low_confidence = True + + if conf_f is not None and conf_f >= thr and _is_exclude_answer(ans): + a["confident_exclude_count"] += 1 + has_confident_exclude = True + + if crit and _is_disagreeing_critical_answer(crit.get("answer")): + a["critical_disagreement_count"] += 1 + has_critical_disagreement = True + + needs_review = (not has_confident_exclude) and 
(has_critical_disagreement or has_low_confidence) + if needs_review: + needs_review_total += 1 + if validated: + validated_needs_review += 1 + # increment per-criterion needs-review count for all criteria + for c in criteria: + agg[c["criterion_key"]]["needs_human_review_count"] += 1 + + unvalidated_all = max(0, total_citations - validated_all) + unvalidated_needs_review = max(0, needs_review_total - validated_needs_review) + + # Build response + crit_out: List[Dict[str, Any]] = [] + for c in criteria: + ck = c["criterion_key"] + a = agg.get(ck) or {} + crit_out.append( + { + "criterion_key": ck, + "label": c["label"], + "threshold": float(c["threshold"]), + **a, + } + ) + + return ScreeningMetricsResponse( + sr_id=sr_id, + steps={ + step_norm: { + "summary": { + "step": step_norm, + "total_citations": total_citations, + "validated_all": validated_all, + "unvalidated_all": unvalidated_all, + "needs_review_total": needs_review_total, + "validated_needs_review": validated_needs_review, + "unvalidated_needs_review": unvalidated_needs_review, + }, + "criteria": crit_out, + } + }, + ) + async def update_inclusion_decision( sr: Dict[str, Any], citation_id: int, diff --git a/backend/api/services/sr_db_service.py b/backend/api/services/sr_db_service.py index f936ee2a..013a4c32 100644 --- a/backend/api/services/sr_db_service.py +++ b/backend/api/services/sr_db_service.py @@ -50,12 +50,28 @@ def ensure_table_exists(self) -> None: criteria JSONB, criteria_yaml TEXT, criteria_parsed JSONB, + screening_thresholds JSONB, screening_db JSONB, created_at TIMESTAMP WITH TIME ZONE DEFAULT now(), updated_at TIMESTAMP WITH TIME ZONE DEFAULT now() ) """ cur.execute(create_table_sql) + + # Runtime schema evolution for existing deployments. + # (No migrations philosophy: add columns if missing.) + try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN IF NOT EXISTS screening_thresholds JSONB" + ) + except Exception: + # Older PG versions might not support IF NOT EXISTS. 
+ try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN screening_thresholds JSONB" + ) + except Exception: + pass conn.commit() logger.info("Ensured systematic_reviews table exists") @@ -186,8 +202,8 @@ def create_systematic_review( insert_sql = """ INSERT INTO systematic_reviews (id, name, description, owner_id, owner_email, users, visible, - criteria, criteria_yaml, criteria_parsed, created_at, updated_at) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + criteria, criteria_yaml, criteria_parsed, screening_thresholds, created_at, updated_at) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ cur.execute(insert_sql, ( @@ -201,6 +217,7 @@ def create_systematic_review( json.dumps(criteria_obj) if criteria_obj else None, criteria_str, json.dumps(criteria_parsed), + json.dumps({"l1": {}, "l2": {}, "parameters": {}}), now, now )) @@ -220,6 +237,8 @@ def create_systematic_review( sr_doc['criteria'] = json.loads(sr_doc['criteria']) if sr_doc.get('criteria_parsed') and isinstance(sr_doc['criteria_parsed'], str): sr_doc['criteria_parsed'] = json.loads(sr_doc['criteria_parsed']) + if sr_doc.get('screening_thresholds') and isinstance(sr_doc['screening_thresholds'], str): + sr_doc['screening_thresholds'] = json.loads(sr_doc['screening_thresholds']) # Convert datetime objects to ISO strings from datetime import datetime as dt if sr_doc.get('created_at') and isinstance(sr_doc['created_at'], dt): @@ -501,6 +520,8 @@ def list_systematic_reviews_for_user(self, user_email: str) -> List[Dict[str, An doc['criteria'] = json.loads(doc['criteria']) if doc.get('criteria_parsed') and isinstance(doc['criteria_parsed'], str): doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) + if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): + doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) # Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and 
isinstance(doc['created_at'], dt): @@ -559,6 +580,8 @@ def get_systematic_review(self, sr_id: str, ignore_visibility: bool = False) -> doc['criteria'] = json.loads(doc['criteria']) if doc.get('criteria_parsed') and isinstance(doc['criteria_parsed'], str): doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) + if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): + doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) # Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and isinstance(doc['created_at'], dt): @@ -709,6 +732,40 @@ def update_screening_db_info(self, sr_id: str, screening_db: Dict[str, Any]) -> if conn: pass + + def update_screening_thresholds(self, sr_id: str, screening_thresholds: Dict[str, Any]) -> None: + """Persist per-criterion screening thresholds on the SR record. + + This is SR-scoped shared state. Permission checks are expected to be + enforced by callers (routers) before calling this helper. + """ + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + + updated_at = datetime.utcnow().isoformat() + cur.execute( + "UPDATE systematic_reviews SET screening_thresholds = %s, updated_at = %s WHERE id = %s", + (json.dumps(screening_thresholds), updated_at, sr_id), + ) + conn.commit() + except Exception as e: + try: + if conn: + conn.rollback() + except Exception: + pass + logger.exception(f"Failed to update screening thresholds: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to update screening thresholds: {e}", + ) + finally: + if conn: + pass + def clear_screening_db_info(self, sr_id: str) -> None: """ Remove the screening_db field from the SR document. 
diff --git a/backend/api/sr/router.py b/backend/api/sr/router.py index e36b095d..c52416a2 100644 --- a/backend/api/sr/router.py +++ b/backend/api/sr/router.py @@ -52,6 +52,13 @@ class SystematicReviewRead(BaseModel): # convenience structured metadata extracted from criteria (l1, l2, parameters) criteria_parsed: Optional[Dict[str, Any]] = None + # Per-step, per-criterion thresholds (SR-scoped). Example: + # { + # "l1": {"population": 0.9, "intervention": 0.85}, + # "l2": {"outcome": 0.9} + # } + screening_thresholds: Optional[Dict[str, Any]] = None + @@ -136,6 +143,7 @@ async def create_systematic_review( criteria=sr_doc.get("criteria"), criteria_yaml=sr_doc.get("criteria_yaml"), criteria_parsed=sr_doc.get("criteria_parsed"), + screening_thresholds=sr_doc.get("screening_thresholds"), ) @@ -261,6 +269,7 @@ async def list_systematic_reviews_for_user( criteria=doc.get("criteria"), criteria_yaml=doc.get("criteria_yaml"), criteria_parsed=doc.get("criteria_parsed"), + screening_thresholds=doc.get("screening_thresholds"), ) ) @@ -293,6 +302,7 @@ async def get_systematic_review(sr_id: str, current_user: Dict[str, Any] = Depen criteria=doc.get("criteria"), criteria_yaml=doc.get("criteria_yaml"), criteria_parsed=doc.get("criteria_parsed"), + screening_thresholds=doc.get("screening_thresholds"), ) @@ -390,9 +400,76 @@ async def update_systematic_review_criteria( criteria=doc.get("criteria"), criteria_yaml=doc.get("criteria_yaml"), criteria_parsed=doc.get("criteria_parsed"), + screening_thresholds=doc.get("screening_thresholds"), ) +class ThresholdsUpdateRequest(BaseModel): + screening_thresholds: Dict[str, Any] = {} + + +@router.get("/{sr_id}/screening_thresholds") +async def get_screening_thresholds(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): + """Get SR-scoped per-step per-criterion thresholds.""" + + try: + doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + 
except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + thresholds = doc.get("screening_thresholds") or {} + if not isinstance(thresholds, dict): + thresholds = {} + return {"sr_id": sr_id, "screening_thresholds": thresholds} + + +@router.put("/{sr_id}/screening_thresholds") +async def update_screening_thresholds( + sr_id: str, + payload: ThresholdsUpdateRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Update SR-scoped per-step per-criterion thresholds. + + Any SR member may update thresholds (per product requirement). + """ + + try: + _doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + thresholds = payload.screening_thresholds or {} + if not isinstance(thresholds, dict): + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="screening_thresholds must be an object") + + # Normalize: only allow known steps keys, but keep it permissive. 
+ normalized: Dict[str, Any] = {} + for step in ("l1", "l2"): + block = thresholds.get(step) + if isinstance(block, dict): + out: Dict[str, float] = {} + for k, v in block.items(): + if not isinstance(k, str) or not k.strip(): + continue + try: + f = float(v) + except Exception: + continue + f = max(0.0, min(1.0, f)) + out[k] = f + normalized[step] = out + else: + normalized[step] = {} + + await run_in_threadpool(srdb_service.update_screening_thresholds, sr_id, normalized) + return {"status": "success", "sr_id": sr_id, "screening_thresholds": normalized} + + @router.delete("/{sr_id}") async def delete_systematic_review(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): """ diff --git a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx index bb4dd22f..57891cb5 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx @@ -54,6 +54,32 @@ function humanScreenColumn(name: string) { return base.replace(/^llm_/, 'human_') } +type ValidationEntry = { user: string; validated_at: string } + +function parseValidations(v: any): ValidationEntry[] { + if (!v) return [] + try { + const parsed = typeof v === 'string' ? JSON.parse(v) : v + if (!Array.isArray(parsed)) return [] + return parsed + .filter((x: any) => x && typeof x === 'object') + .map((x: any) => ({ + user: String(x.user ?? x.email ?? x.validated_by ?? ''), + validated_at: String(x.validated_at ?? x.timestamp ?? 
''), + })) + .filter((x: any) => x.user) + } catch { + return [] + } +} + +function formatValidationDate(v: string): string { + if (!v) return '' + const d = new Date(v) + if (Number.isNaN(d.getTime())) return v + return d.toLocaleString() +} + /* Types for local clarity */ type CriteriaData = { questions: string[] @@ -76,6 +102,11 @@ export default function CanSrL1ScreenPage() { const searchParams = useSearchParams() const srId = searchParams?.get('sr_id') const citationId = searchParams?.get('citation_id') + const thresholdParam = searchParams?.get('threshold') + const threshold = useMemo(() => { + const v = Number(thresholdParam) + return Number.isFinite(v) ? Math.max(0, Math.min(1, v)) : 0.9 + }, [thresholdParam]) // Get current language to keep language when navigating (must be unconditional hook call) const { lang } = useParams<{ lang: string }>() const [selectedModel, setSelectedModel] = useState('gpt-5-mini') @@ -106,6 +137,17 @@ export default function CanSrL1ScreenPage() { const [loadingRuns, setLoadingRuns] = useState(false) const [validating, setValidating] = useState(false) + const [userEmail, setUserEmail] = useState(null) + + const l1Validations = useMemo(() => parseValidations((citation as any)?.l1_validations), [citation]) + const l1Checked = useMemo(() => { + const me = String(userEmail || '') + if (!me) return false + return l1Validations.some((v) => v.user === me) + }, [l1Validations, userEmail]) + const l1ValidationsSorted = useMemo(() => { + return [...l1Validations].sort((a, b) => String(b.validated_at || '').localeCompare(String(a.validated_at || ''))) + }, [l1Validations]) useEffect(() => { if (!srId || !citationId) { @@ -137,6 +179,23 @@ export default function CanSrL1ScreenPage() { loadIds() }, [srId]) + // Fetch current user email for the "Validated by [UserEmail]" checkbox label. 
+ useEffect(() => { + const loadMe = async () => { + try { + const headers = { ...getAuthHeaders() } + const res = await fetch('/api/auth/me', { method: 'GET', headers }) + const data = await res.json().catch(() => ({})) + if (res.ok) { + setUserEmail(String(data?.user?.email || data?.email || '')) + } + } catch { + // ignore + } + } + loadMe() + }, []) + // Load citation row // Extracted fetch function so we can re-use it when navigating between citations async function fetchCitationById(id: string) { @@ -486,8 +545,7 @@ export default function CanSrL1ScreenPage() { -
    - {/* Agentic summary + Validate */} -
    -
    -
    -

    Agentic results

    -

    - Latest screening + critical runs per criterion. -

    -
    -
    - - - {citation?.l1_validated_by ? ( - - Validated by {String(citation.l1_validated_by)} - - ) : ( - Not validated - )} -
    -
    - - {loadingRuns ? ( -
    Loading agent runs…
    - ) : criteriaData?.questions?.length ? ( -
    - {criteriaData.questions.map((q, idx) => { - const criterionKey = q - ? q - .trim() - .toLowerCase() - .replace(/[^\w]+/g, '_') - .replace(/_+/g, '_') - .replace(/^_+|_+$/g, '') - .slice(0, 56) - : '' - - const r = runsByCriterion[criterionKey] || {} - const scr = r.screening - const crit = r.critical - - const critDisagrees = - crit && String((crit as any)?.answer || '').trim() !== '' && - String((crit as any)?.answer || '').trim() !== 'None of the above' - - return ( -
    -
    {q}
    -
    -
    -
    Screening
    -
    Answer: {String((scr as any)?.answer ?? '—')}
    -
    Confidence: {String((scr as any)?.confidence ?? '—')}
    -
    -
    -
    Critical
    -
    Answer: {String((crit as any)?.answer ?? '—')}
    -
    Confidence: {String((crit as any)?.confidence ?? '—')}
    - {critDisagrees ? ( -
    Disagrees
    - ) : null} -
    -
    -
    - ) - })} -
    - ) : ( -
    No criteria loaded yet.
    - )} -
    - +
    {/* Workspace (left) */} -
    +
    {workspace}
    {/* Selection sidebar (right) */} -
    @@ -752,7 +780,7 @@ export default function CanSrL1ScreenPage() { router.push( `/${lang}/can-sr/l1-screen/view?sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( target, - )}`, + )}&threshold=${encodeURIComponent(String(threshold))}`, ) }} className="rounded-md border bg-white px-4 py-2 text-sm shadow-sm hover:bg-gray-50" @@ -775,7 +803,7 @@ export default function CanSrL1ScreenPage() { router.push( `/${lang}/can-sr/l1-screen/view?sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( target, - )}`, + )}&threshold=${encodeURIComponent(String(threshold))}`, ) }} className="rounded-md bg-emerald-600 px-4 py-2 text-sm font-medium text-white hover:bg-emerald-700" diff --git a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx index fef5b4f6..9af2199e 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx @@ -9,6 +9,32 @@ import { Wand2 } from 'lucide-react' import { getAuthToken, getTokenType } from '@/lib/auth' import { useDictionary } from '@/app/[lang]/DictionaryProvider' +type ValidationEntry = { user: string; validated_at: string } + +function parseValidations(v: any): ValidationEntry[] { + if (!v) return [] + try { + const parsed = typeof v === 'string' ? JSON.parse(v) : v + if (!Array.isArray(parsed)) return [] + return parsed + .filter((x: any) => x && typeof x === 'object') + .map((x: any) => ({ + user: String(x.user ?? x.email ?? x.validated_by ?? ''), + validated_at: String(x.validated_at ?? x.timestamp ?? ''), + })) + .filter((x: any) => x.user) + } catch { + return [] + } +} + +function formatValidationDate(v: string): string { + if (!v) return '' + const d = new Date(v) + if (Number.isNaN(d.getTime())) return v + return d.toLocaleString() +} + /* Full-text single-citation viewer for L2 screening. 
@@ -112,6 +138,17 @@ export default function CanSrL2ScreenViewPage() { const [agentRuns, setAgentRuns] = useState([]) const [loadingRuns, setLoadingRuns] = useState(false) const [validating, setValidating] = useState(false) + const [userEmail, setUserEmail] = useState(null) + + const l2Validations = useMemo(() => parseValidations((citation as any)?.l2_validations), [citation]) + const l2Checked = useMemo(() => { + const me = String(userEmail || '') + if (!me) return false + return l2Validations.some((v) => v.user === me) + }, [l2Validations, userEmail]) + const l2ValidationsSorted = useMemo(() => { + return [...l2Validations].sort((a, b) => String(b.validated_at || '').localeCompare(String(a.validated_at || ''))) + }, [l2Validations]) // Fulltext PDF viewer linkage const [fulltextCoords, setFulltextCoords] = useState(null) @@ -149,6 +186,23 @@ export default function CanSrL2ScreenViewPage() { loadIds() }, [srId]) + // Fetch current user email for validation toggling. + useEffect(() => { + const loadMe = async () => { + try { + const headers = { ...getAuthHeaders() } + const res = await fetch('/api/auth/me', { method: 'GET', headers }) + const data = await res.json().catch(() => ({})) + if (res.ok) { + setUserEmail(String(data?.user?.email || data?.email || '')) + } + } catch { + // ignore + } + } + loadMe() + }, []) + // Load citation row (and ensure fulltext is extracted if missing) async function fetchCitationById(id: string) { if (!srId || !id) return @@ -684,46 +738,54 @@ export default function CanSrL2ScreenViewPage() {

    - - - {citation?.l2_validated_by ? ( - - Validated by {String(citation.l2_validated_by)} + }} + /> + + Validated by {String(userEmail || '—')} - ) : ( - Not validated - )} +
    + {l2ValidationsSorted.length ? ( +
    + {l2ValidationsSorted.map((v, idx) => ( +
    + Validated on {formatValidationDate(v.validated_at)} by {v.user} +
    + ))} +
    + ) : ( +
    Not validated
    + )} + {loadingRuns ? (
    Loading agent runs…
    ) : criteriaData?.questions?.length ? ( diff --git a/frontend/app/api/can-sr/reviews/thresholds/route.ts b/frontend/app/api/can-sr/reviews/thresholds/route.ts new file mode 100644 index 00000000..2e439af8 --- /dev/null +++ b/frontend/app/api/can-sr/reviews/thresholds/route.ts @@ -0,0 +1,57 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Frontend proxy for SR screening thresholds. + * + * Routes handled: + * - GET /api/can-sr/reviews/thresholds?sr_id=... -> BACKEND_URL/api/sr/{sr_id}/screening_thresholds + * - PUT /api/can-sr/reviews/thresholds?sr_id=... -> BACKEND_URL/api/sr/{sr_id}/screening_thresholds + */ + +async function forward(request: NextRequest, method: 'GET' | 'PUT') { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + if (!srId) { + return NextResponse.json({ error: 'sr_id query parameter is required' }, { status: 400 }) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json({ error: 'Authorization header is required' }, { status: 401 }) + } + + const url = `${BACKEND_URL}/api/sr/${encodeURIComponent(srId)}/screening_thresholds` + + const body = method === 'PUT' ? JSON.stringify(await request.json()) : undefined + + const res = await fetch(url, { + method, + headers: { + Authorization: authHeader, + ...(method === 'PUT' ? 
{ 'Content-Type': 'application/json' } : {}), + }, + body, + }) + + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) +} + +export async function GET(request: NextRequest) { + try { + return await forward(request, 'GET') + } catch (error) { + console.error('thresholds GET API error:', error) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} + +export async function PUT(request: NextRequest) { + try { + return await forward(request, 'PUT') + } catch (error) { + console.error('thresholds PUT API error:', error) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/metrics/route.ts b/frontend/app/api/can-sr/screen/metrics/route.ts new file mode 100644 index 00000000..710dfc2e --- /dev/null +++ b/frontend/app/api/can-sr/screen/metrics/route.ts @@ -0,0 +1,59 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: GET /api/can-sr/screen/metrics?sr_id=&step=l1|l2 + * -> GET {BACKEND_URL}/api/screen/metrics?sr_id=...&step=... 
+ */ + +export async function OPTIONS() { + return new Response(null, { + status: 204, + headers: { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET,OPTIONS', + 'Access-Control-Allow-Headers': 'Authorization, Content-Type', + }, + }) +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + const step = params.get('step') || 'l1' + + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json({ error: 'Authorization header is required' }, { status: 401 }) + } + + const url = new URL(`${BACKEND_URL}/api/screen/metrics`) + url.searchParams.set('sr_id', srId) + url.searchParams.set('step', step) + + const res = await fetch(url.toString(), { + method: 'GET', + headers: { + Authorization: authHeader, + }, + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('screen metrics proxy GET error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/components/can-sr/CitationListPage.tsx b/frontend/components/can-sr/CitationListPage.tsx index df823536..3f99aa12 100644 --- a/frontend/components/can-sr/CitationListPage.tsx +++ b/frontend/components/can-sr/CitationListPage.tsx @@ -9,6 +9,11 @@ import { Bot, Check, Wand2 } from 'lucide-react' import { useDictionary } from '@/app/[lang]/DictionaryProvider' import { ModelSelector } from '@/components/chat' import { toast } from 'react-hot-toast' +import ScreeningMetricsPanel, { + type ScreeningMetricsStats, + type ScreeningMetricsSummary, + type ScreeningCriterionMetrics, +} from '@/components/can-sr/ScreeningMetricsPanel' import { Dialog, DialogContent, @@ -74,6 +79,17 @@ export default function CitationsListPage({ const [error, setError] = useState(null) const [criteriaData, setCriteriaData] = useState() + // Phase 1 list control surface is now hosted by the left-side metrics module. 
+ const [threshold, setThreshold] = useState(0.9) + const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + const [pageStats, setPageStats] = useState(undefined) + + // Phase 2 metrics (SR-wide) + const [srMetricsSummary, setSrMetricsSummary] = useState(undefined) + const [srCriterionMetrics, setSrCriterionMetrics] = useState(undefined) + const [srThresholds, setSrThresholds] = useState | null>(null) + const [metricsRefreshKey, setMetricsRefreshKey] = useState(0) + // Run-all job tracking (persist across modal close / refresh) const [runAllForce, setRunAllForce] = useState(false) const [runAllJobId, setRunAllJobId] = useState(null) @@ -168,6 +184,80 @@ export default function CitationsListPage({ loadCitations() }, [srId, router, screeningStep]) + // Load SR thresholds + metrics (L1/L2 only) + useEffect(() => { + if (!srId) return + if (!(screeningStep === 'l1' || screeningStep === 'l2')) { + setSrMetricsSummary(undefined) + setSrCriterionMetrics(undefined) + setSrThresholds(null) + return + } + + const load = async () => { + try { + const headers = getAuthHeaders() + + // 1) thresholds + const tRes = await fetch( + `/api/can-sr/reviews/thresholds?sr_id=${encodeURIComponent(srId)}`, + { method: 'GET', headers }, + ) + const tJson = await tRes.json().catch(() => ({})) + const thresholds = (tRes.ok ? tJson?.screening_thresholds : null) || {} + setSrThresholds(typeof thresholds === 'object' && thresholds ? 
thresholds : {}) + + // 2) metrics + const mRes = await fetch( + `/api/can-sr/screen/metrics?sr_id=${encodeURIComponent(srId)}&step=${encodeURIComponent( + screeningStep, + )}`, + { method: 'GET', headers }, + ) + const mJson = await mRes.json().catch(() => ({})) + if (mRes.ok) { + const stepBlock = mJson?.steps?.[screeningStep] + setSrMetricsSummary(stepBlock?.summary) + setSrCriterionMetrics(stepBlock?.criteria) + } else { + setSrMetricsSummary(undefined) + setSrCriterionMetrics(undefined) + } + } catch { + setSrMetricsSummary(undefined) + setSrCriterionMetrics(undefined) + setSrThresholds(null) + } + } + load() + }, [srId, screeningStep, metricsRefreshKey]) + + const persistThresholds = useCallback( + async (nextThresholds: Record) => { + if (!srId) return + try { + const headers = { ...getAuthHeaders(), 'Content-Type': 'application/json' } + const res = await fetch( + `/api/can-sr/reviews/thresholds?sr_id=${encodeURIComponent(srId)}`, + { + method: 'PUT', + headers, + body: JSON.stringify({ screening_thresholds: nextThresholds }), + }, + ) + const j = await res.json().catch(() => ({})) + if (res.ok) { + setSrThresholds(j?.screening_thresholds || nextThresholds) + // Refresh metrics so counts reflect the new thresholds. + setMetricsRefreshKey((k) => k + 1) + } + } catch { + // ignore + } + }, + [srId], + ) + // Restore persisted run-all job id useEffect(() => { if (!runAllStorageKey) return @@ -323,7 +413,11 @@ export default function CitationsListPage({ } /> -
    + {/* + Layout: left floating/side metrics module + right list. + (A true fixed overlay can be added later; this keeps it responsive and simple.) + */} +
    setRunAllModalOpen(false)}> @@ -362,116 +456,163 @@ export default function CitationsListPage({ -
    -
    -
    -

    - {dict.screening.citationsList} -

    -

    - {dict.screening.citationsListDesc} -

    -
    - -
    - - - - -
    -
    - - - {dict.screening.llmClassified} - -
    -
    - - - {dict.screening.humanVerified} - -
    -
    + />
    -
    - - {/* Run-all status/controls are shown in the bottom-right floating panel. */} + + +
    +
    +
    +
    +

    + {dict.screening.citationsList} +

    +

    + {dict.screening.citationsListDesc} +

    +
    -
    - {loading ? ( -
    - {dict.screening.loadingCitations} -
    - ) : error ? ( -
    {error}
    - ) : citationIds && citationIds.length === 0 ? ( -
    - {dict.screening.noCitations} -
    - ) : ( -
    -
    - {dict.screening.totalCitations}{' '} - {citationIds ? citationIds.length : 0} +
    + + + + +
    +
    + + + {dict.screening.llmClassified} + +
    +
    + + + {dict.screening.humanVerified} + +
    +
    +
    - + {/* Run-all status/controls are shown in the bottom-right floating panel. */} + +
    + {loading ? ( +
    + {dict.screening.loadingCitations} +
    + ) : error ? ( +
    {error}
    + ) : citationIds && citationIds.length === 0 ? ( +
    + {dict.screening.noCitations} +
    + ) : ( +
    +
    + {dict.screening.totalCitations}{' '} + {citationIds ? citationIds.length : 0} +
    + + + setPageStats({ + scopeLabel: 'this page', + total: s.total, + needsValidation: s.needsValidation, + validated: s.validated, + unvalidated: s.unvalidated, + }) + } + /> +
    + )}
    - )} +
    diff --git a/frontend/components/can-sr/PagedList.tsx b/frontend/components/can-sr/PagedList.tsx index b0417b50..dced77a1 100644 --- a/frontend/components/can-sr/PagedList.tsx +++ b/frontend/components/can-sr/PagedList.tsx @@ -12,6 +12,17 @@ type CitationInfo = { include: string[] screeningStep: string pageview: string + threshold?: number + thresholdByCriterionKey?: Record + filterMode?: 'needs' | 'validated' | 'unvalidated' | 'all' + onThresholdChange?: (v: number) => void + onFilterModeChange?: (v: 'needs' | 'validated' | 'unvalidated' | 'all') => void + onStatsChange?: (stats: { + total: number + needsValidation: number + validated: number + unvalidated: number + }) => void } type LatestAgentRun = { @@ -44,6 +55,12 @@ export default function PagedList({ possible_answers, screeningStep, pageview, + threshold: thresholdProp, + thresholdByCriterionKey, + filterMode: filterModeProp, + onThresholdChange, + onFilterModeChange, + onStatsChange, }: CitationInfo) { const router = useRouter() const { lang } = useParams<{ lang: string }>() @@ -62,9 +79,12 @@ export default function PagedList({ ) const [showClassify, setShowClassify] = useState>({}) - // TA list controls - const [threshold, setThreshold] = useState(0.9) - const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + // List controls (controlled by parent when provided; otherwise local state) + const [thresholdLocal, setThresholdLocal] = useState(0.9) + const [filterModeLocal, setFilterModeLocal] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + + const threshold = typeof thresholdProp === 'number' ? 
thresholdProp : thresholdLocal + const filterMode = filterModeProp || filterModeLocal const [latestRunsByCitation, setLatestRunsByCitation] = useState>({}) @@ -154,7 +174,7 @@ export default function PagedList({ setLatestRunsByCitation((prev: Record) => ({ ...prev, ...grouped })) } } - } catch (e) { + } catch { // best-effort } } @@ -168,9 +188,25 @@ export default function PagedList({ const isValidatedForStep = (row: any): boolean => { if (!row) return false - if (screeningStep === 'l1') return Boolean(row?.l1_validated_by) - if (screeningStep === 'l2') return Boolean(row?.l2_validated_by) - if (screeningStep === 'extract') return Boolean(row?.parameters_validated_by) + const hasValidationsList = (v: any): boolean => { + if (!v) return false + if (Array.isArray(v)) return v.length > 0 + if (typeof v === 'string') { + try { + const parsed = JSON.parse(v) + return Array.isArray(parsed) && parsed.length > 0 + } catch { + return false + } + } + return false + } + + // Prefer new per-step validations list; fall back to legacy single fields. + if (screeningStep === 'l1') return hasValidationsList(row?.l1_validations) || Boolean(row?.l1_validated_by) + if (screeningStep === 'l2') return hasValidationsList(row?.l2_validations) || Boolean(row?.l2_validated_by) + if (screeningStep === 'extract') + return hasValidationsList(row?.parameters_validations) || Boolean(row?.parameters_validated_by) return false } @@ -194,21 +230,39 @@ export default function PagedList({ byKey[key].push(r) } - // Needs validation if ANY criterion is low confidence OR critical disagrees + // Rule: + // - If ANY criterion is a confident exclude (screening answer contains "(exclude)" AND conf >= threshold) => no review needed. + // - Else needs review if ANY criterion is low confidence OR critical disagrees. 
+ + let hasConfidentExclude = false + let hasLowConfidence = false + let hasCriticalDisagree = false + for (const key of Object.keys(byKey)) { const items = byKey[key] const screening = items.find((x) => String((x as any)?.stage) === 'screening') const critical = items.find((x) => String((x as any)?.stage) === 'critical') const conf = Number((screening as any)?.confidence) - if (Number.isFinite(conf) && conf < threshold) return true + const perThrRaw = thresholdByCriterionKey ? Number((thresholdByCriterionKey as any)[key]) : NaN + const thr = Number.isFinite(perThrRaw) ? Math.max(0, Math.min(1, perThrRaw)) : threshold + + if (Number.isFinite(conf) && conf < thr) hasLowConfidence = true + + const ans = String((screening as any)?.answer || '') + if (Number.isFinite(conf) && conf >= thr && ans.toLowerCase().includes('(exclude)')) { + hasConfidentExclude = true + } const criticalAns = String((critical as any)?.answer || '') // In our critical prompt contract, agreement is "None of the above". 
- if (critical && criticalAns.trim() !== '' && criticalAns.trim() !== 'None of the above') return true + if (critical && criticalAns.trim() !== '' && criticalAns.trim() !== 'None of the above') { + hasCriticalDisagree = true + } } - return false + if (hasConfidentExclude) return false + return hasLowConfidence || hasCriticalDisagree } const filteredCitationData = citationData.filter((row: any) => { @@ -224,6 +278,23 @@ export default function PagedList({ return true }) + // Emit list stats upward (so CitationListPage can render a floating metrics module) + useEffect(() => { + if (!onStatsChange) return + const total = citationData.length + let validated = 0 + let needsValidation = 0 + for (const row of citationData) { + const id = Number(row?.id) + if (!Number.isFinite(id)) continue + if (isValidatedForStep(row)) validated += 1 + if (computeNeedsValidation(id, row)) needsValidation += 1 + } + const unvalidated = Math.max(0, total - validated) + onStatsChange({ total, needsValidation, validated, unvalidated }) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [citationData, threshold, screeningStep, latestRunsByCitation, thresholdByCriterionKey]) + // NOTE: Previously we fetched each citation via /citations/get. // This is now replaced by a single /citations/batch call per page. @@ -306,7 +377,10 @@ export default function PagedList({ return (
    - {screeningStep === 'l1' || screeningStep === 'l2' ? ( + {/* Controls moved to CitationListPage floating metrics module when provided. + Keep fallback local controls for other callers. */} + {((screeningStep === 'l1' || screeningStep === 'l2') && + (typeof thresholdProp !== 'number' || !filterModeProp)) ? (
    @@ -319,7 +393,9 @@ export default function PagedList({ onChange={(e: React.ChangeEvent) => { const v = Number(e.target.value) if (!Number.isFinite(v)) return - setThreshold(Math.max(0, Math.min(1, v))) + const nv = Math.max(0, Math.min(1, v)) + setThresholdLocal(nv) + onThresholdChange?.(nv) }} className="w-24 rounded-md border border-gray-200 px-2 py-1 text-sm" /> @@ -329,10 +405,14 @@ export default function PagedList({ ) => { + const v = Number(e.target.value) + if (!Number.isFinite(v)) return + onThresholdChange(Math.max(0, Math.min(1, v))) + }} + className="w-24 rounded-md border border-gray-200 px-2 py-1 text-sm" + /> +
    + ) : null} + +
    + + +
    + + {summary ? ( +
    +
    Validation summary
    +
    +
    +
    All citations
    +
    + {summary.validated_all} / {summary.total_citations} +
    +
    +
    +
    Needs human review
    +
    + {summary.validated_needs_review} / {summary.needs_review_total} +
    +
    +
    +
    + Unvalidated: {summary.unvalidated_all} (all), {summary.unvalidated_needs_review} (needs review) +
    +
    + ) : null} + +
    +
    + Workload summary{stats?.scopeLabel ? ` (${stats.scopeLabel})` : ''} +
    +
    +
    +
    Total
    +
    {stats ? stats.total : '—'}
    +
    +
    +
    Needs validation
    +
    {stats ? stats.needsValidation : '—'}
    +
    +
    +
    Validated
    +
    {stats ? stats.validated : '—'}
    +
    +
    +
    Unvalidated
    +
    {stats ? stats.unvalidated : '—'}
    +
    +
    +
    + + {criterionMetrics?.length ? ( +
    +
    Criteria thresholds & metrics
    +
    + {criterionMetrics.map((c) => ( +
    +
    +
    +
    {c.label}
    +
    +
    Low conf: {c.low_confidence_count}
    +
    Critical disagree: {c.critical_disagreement_count}
    +
    Confident exclude: {c.confident_exclude_count}
    +
    Has run: {c.has_run_count}/{c.total_citations}
    +
    +
    + +
    + + ) => { + const v = Number(e.target.value) + if (!Number.isFinite(v)) return + onCriterionThresholdChange?.( + c.criterion_key, + Math.max(0, Math.min(1, v)), + ) + }} + className="w-20 rounded-md border border-gray-200 px-2 py-1 text-sm" + /> +
    +
    +
    + ))} +
    +
    + ) : null} + +
    +
    Performance (validated set)
    +
    + Coming in Phase 2: agreement/accuracy, recommended thresholds, workload reduction curves. +
    +
    +
    +
    + ) +} From 814a7707ea4255d3b0fe5518c4dd3a152c6d36fa Mon Sep 17 00:00:00 2001 From: bing1100 Date: Tue, 14 Apr 2026 15:08:42 -0400 Subject: [PATCH 3/3] working metrics and critical agent --- backend/api/jobs/router.py | 28 +- backend/api/jobs/run_all_tasks.py | 239 ++++- backend/api/screen/agentic_utils.py | 15 +- backend/api/screen/prompts.py | 6 + backend/api/screen/router.py | 836 +++++++++++++++++- backend/api/services/cit_db_service.py | 116 ++- backend/api/services/sr_db_service.py | 55 +- backend/api/sr/router.py | 72 ++ backend/docker-compose.yml | 7 +- frontend/app/[lang]/can-sr/l1-screen/page.tsx | 54 +- .../app/[lang]/can-sr/l1-screen/view/page.tsx | 80 +- frontend/app/[lang]/can-sr/l2-screen/page.tsx | 57 +- .../app/[lang]/can-sr/l2-screen/view/page.tsx | 120 +-- .../critical-prompt-additions/route.ts | 59 ++ .../api/can-sr/screen/calibration/route.ts | 66 ++ .../api/can-sr/screen/fulltext/run/route.ts | 36 + .../can-sr/screen/title-abstract/run/route.ts | 36 + .../components/can-sr/CitationListPage.tsx | 141 ++- frontend/components/can-sr/PagedList.tsx | 134 ++- .../can-sr/ScreeningMetricsModal.tsx | 326 +++++++ .../can-sr/ScreeningMetricsPanel.tsx | 357 ++++++-- 21 files changed, 2488 insertions(+), 352 deletions(-) create mode 100644 frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts create mode 100644 frontend/app/api/can-sr/screen/calibration/route.ts create mode 100644 frontend/app/api/can-sr/screen/fulltext/run/route.ts create mode 100644 frontend/app/api/can-sr/screen/title-abstract/run/route.ts create mode 100644 frontend/components/can-sr/ScreeningMetricsModal.tsx diff --git a/backend/api/jobs/router.py b/backend/api/jobs/router.py index a49b8f6a..4d9d4517 100644 --- a/backend/api/jobs/router.py +++ b/backend/api/jobs/router.py @@ -11,6 +11,7 @@ from ..core.cit_utils import load_sr_and_check from ..services.sr_db_service import srdb_service from ..services.azure_openai_client import azure_openai_client +from 
..services.cit_db_service import cits_dp_service from .run_all_repo import run_all_repo from .procrastinate_app import cancel_enqueued_jobs_for_run_all, jobs_enabled, worker_concurrency @@ -126,12 +127,34 @@ async def start_run_all( # Authz: ensure user can access SR try: - _sr, _screening = await load_sr_and_check(sr_id, current_user, srdb_service) + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to load SR: {e}") + # Legacy safety: + # If legacy llm_* outputs exist but normalized agent runs are missing, we must + # regenerate results to populate screening_agent_runs. + # We enforce this by auto-enabling force overwrite. + force = bool(payload.force) + try: + table_name = (screening or {}).get("table_name") or "citations" + cp = (sr or {}).get("criteria_parsed") or {} + if step in {"l1", "l2"}: + legacy_needs = await run_in_threadpool( + cits_dp_service.legacy_needs_rerun, + sr_id=sr_id, + table_name=table_name, + criteria_parsed=cp, + step=step, + ) + if legacy_needs: + force = True + except Exception: + # best-effort, do not block + pass + # Ensure our job tables exist await run_in_threadpool(run_all_repo.ensure_tables) @@ -179,9 +202,10 @@ async def start_run_all( created_by=str(current_user.get("id") or ""), model=normalized_model, meta={ - "force": bool(payload.force), + "force": force, "chunk_size": int(payload.chunk_size), "explicit_ids": bool(sanitized_ids is not None), + "legacy_auto_force": (force and (not bool(payload.force))), }, total=len(sanitized_ids) if sanitized_ids is not None else 0, ) diff --git a/backend/api/jobs/run_all_tasks.py b/backend/api/jobs/run_all_tasks.py index c8c43c48..b469cd0a 100644 --- a/backend/api/jobs/run_all_tasks.py +++ b/backend/api/jobs/run_all_tasks.py @@ -14,8 +14,16 @@ from ..services.azure_openai_client import azure_openai_client from ..services.storage import storage_service from 
..extract.router import extract_fulltext_from_storage -from ..screen.router import update_inclusion_decision -from ..screen.prompts import PROMPT_JSON_TEMPLATE, PROMPT_JSON_TEMPLATE_FULLTEXT +from ..screen.router import update_inclusion_decision, _build_guardrails +from ..screen.prompts import ( + PROMPT_JSON_TEMPLATE, + PROMPT_JSON_TEMPLATE_FULLTEXT, + PROMPT_XML_TEMPLATE_TA, + PROMPT_XML_TEMPLATE_TA_CRITICAL, + PROMPT_XML_TEMPLATE_FULLTEXT, + PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL, +) +from ..screen.agentic_utils import build_critical_options, parse_agent_xml, resolve_option from ..extract.prompts import PARAMETER_PROMPT_JSON from ..core.config import settings @@ -159,34 +167,103 @@ async def _run_l1_for_citation( if not azure_openai_client.is_configured(): raise RuntimeError("Azure OpenAI client not configured") - options_listed = "\n".join([f"{j}. {opt}" for j, opt in enumerate(opts)]) - prompt = PROMPT_JSON_TEMPLATE.format(question=q, cit=citation_text, options=options_listed, xtra=xtra) - llm_response = await azure_openai_client.simple_chat( - user_message=prompt, + # --- Agentic (screening + critical) --- + # We persist normalized runs to screening_agent_runs so /screen/metrics can compute SR-wide progress. + # We ALSO persist llm_* JSONB columns for backwards compatibility with the existing UI. 
+ options_listed = "\n".join([str(opt) for opt in opts]) + criterion_key = snake_case(q, max_len=56) + + screening_prompt = PROMPT_XML_TEMPLATE_TA.format( + question=q, + cit=citation_text, + options=options_listed, + xtra=xtra or "", + ) + screening_raw = await azure_openai_client.simple_chat( + user_message=screening_prompt, system_prompt=None, model=model, max_tokens=2000, temperature=0.0, ) + screening_parsed = parse_agent_xml(str(screening_raw)) + screening_answer = resolve_option(screening_parsed.answer, opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": str(screening_raw), + "guardrails": _build_guardrails(screening_parsed, raw_text=str(screening_raw), stage="screening"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) - import json - - parsed = json.loads(llm_response) - selected_value = str(parsed.get("selected", "")).strip() - resolved_selected = f"None of the above - {selected_value}" - for opt in opts: - if opt.lower() in selected_value.lower(): - resolved_selected = opt - break + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = "\n".join([str(o) for o in critical_opts]) + critical_prompt = PROMPT_XML_TEMPLATE_TA_CRITICAL.format( + question=q, + cit=citation_text, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + # run-all does not currently inject SR-scoped critical prompt additions (done in /screen/*/run) + critical_additions="(none)", + ) + critical_raw = await azure_openai_client.simple_chat( + user_message=critical_prompt, + 
system_prompt=None, + model=model, + max_tokens=2000, + temperature=0.0, + ) + critical_parsed = parse_agent_xml(str(critical_raw)) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": str(critical_raw), + "guardrails": _build_guardrails(critical_parsed, raw_text=str(critical_raw), stage="critical"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) classification_json = { - "selected": resolved_selected, - "explanation": parsed.get("explanation") or parsed.get("reason") or parsed.get("explain") or "", - "confidence": float(parsed.get("confidence") or 0.0) if str(parsed.get("confidence") or "").strip() else 0.0, - "evidence_sentences": parsed.get("evidence_sentences") or [], - "evidence_tables": parsed.get("evidence_tables") or [], - "evidence_figures": parsed.get("evidence_figures") or [], - "llm_raw": llm_response, + "selected": screening_answer, + "explanation": screening_parsed.rationale or "", + "confidence": screening_parsed.confidence if screening_parsed.confidence is not None else 0.0, + "evidence_sentences": [], + "evidence_tables": [], + "evidence_figures": [], + "llm_raw": str(screening_raw), + "critical": { + "selected": critical_answer, + "explanation": critical_parsed.rationale or "", + "confidence": critical_parsed.confidence, + "llm_raw": str(critical_raw), + }, } await run_in_threadpool(cits_dp_service.update_jsonb_column, citation_id, col, classification_json, table_name) @@ -354,19 +431,21 @@ async def _run_l2_for_citation( if _should_skip_ai_output(existing, force=force): 
continue - options_listed = "\n".join([f"{j}. {opt}" for j, opt in enumerate(opts)]) - prompt = PROMPT_JSON_TEMPLATE_FULLTEXT.format( + # --- Agentic (screening + critical) --- + options_listed = "\n".join([str(opt) for opt in opts]) + criterion_key = snake_case(q, max_len=56) + + screening_prompt = PROMPT_XML_TEMPLATE_FULLTEXT.format( question=q, options=options_listed, - xtra=xtra, + xtra=xtra or "", fulltext=fulltext, tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", figures="\n".join(figures_lines) if figures_lines else "(none)", ) - if images: - llm_response = await azure_openai_client.multimodal_chat( - user_text=prompt, + screening_raw = await azure_openai_client.multimodal_chat( + user_text=screening_prompt, images=images, system_prompt=None, model=model, @@ -374,30 +453,102 @@ async def _run_l2_for_citation( temperature=0.0, ) else: - llm_response = await azure_openai_client.simple_chat( - user_message=prompt, + screening_raw = await azure_openai_client.simple_chat( + user_message=screening_prompt, system_prompt=None, model=model, max_tokens=2000, temperature=0.0, ) + screening_parsed = parse_agent_xml(str(screening_raw)) + screening_answer = resolve_option(screening_parsed.answer, opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": str(screening_raw), + "guardrails": _build_guardrails(screening_parsed, raw_text=str(screening_raw), stage="screening"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) - parsed = json.loads(llm_response) - selected_value = str(parsed.get("selected", "")).strip() - resolved_selected = f"None of the 
above - {selected_value}" - for opt in opts: - if opt.lower() in selected_value.lower(): - resolved_selected = opt - break + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = "\n".join([str(o) for o in critical_opts]) + critical_prompt = PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL.format( + question=q, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + critical_additions="(none)", + fulltext=fulltext, + tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", + figures="\n".join(figures_lines) if figures_lines else "(none)", + ) + if images: + critical_raw = await azure_openai_client.multimodal_chat( + user_text=critical_prompt, + images=images, + system_prompt=None, + model=model, + max_tokens=2000, + temperature=0.0, + ) + else: + critical_raw = await azure_openai_client.simple_chat( + user_message=critical_prompt, + system_prompt=None, + model=model, + max_tokens=2000, + temperature=0.0, + ) + critical_parsed = parse_agent_xml(str(critical_raw)) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": str(critical_raw), + "guardrails": _build_guardrails(critical_parsed, raw_text=str(critical_raw), stage="critical"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) classification_json = { - "selected": resolved_selected, - "explanation": parsed.get("explanation") or parsed.get("reason") or parsed.get("explain") or "", - "confidence": float(parsed.get("confidence") or 0.0) if 
str(parsed.get("confidence") or "").strip() else 0.0, - "evidence_sentences": parsed.get("evidence_sentences") or [], - "evidence_tables": parsed.get("evidence_tables") or [], - "evidence_figures": parsed.get("evidence_figures") or [], - "llm_raw": llm_response, + "selected": screening_answer, + "explanation": screening_parsed.rationale or "", + "confidence": screening_parsed.confidence if screening_parsed.confidence is not None else 0.0, + "evidence_sentences": [], + "evidence_tables": [], + "evidence_figures": [], + "llm_raw": str(screening_raw), + "critical": { + "selected": critical_answer, + "explanation": critical_parsed.rationale or "", + "confidence": critical_parsed.confidence, + "llm_raw": str(critical_raw), + }, } await run_in_threadpool(cits_dp_service.update_jsonb_column, citation_id, col, classification_json, table_name) diff --git a/backend/api/screen/agentic_utils.py b/backend/api/screen/agentic_utils.py index 1250287b..ec4efc36 100644 --- a/backend/api/screen/agentic_utils.py +++ b/backend/api/screen/agentic_utils.py @@ -19,6 +19,8 @@ class ParsedAgentXML: confidence: float rationale: str parse_ok: bool + missing_answer: bool + missing_confidence: bool _TAG_RE_CACHE: dict[str, re.Pattern[str]] = {} @@ -49,8 +51,17 @@ def parse_agent_xml(text: str) -> ParsedAgentXML: conf_val = 0.0 conf_val = max(0.0, min(1.0, conf_val)) - parse_ok = bool(ans_m and conf_m) - return ParsedAgentXML(answer=answer, confidence=conf_val, rationale=rationale, parse_ok=parse_ok) + missing_answer = not bool(ans_m and answer.strip()) + missing_confidence = not bool(conf_m) + parse_ok = (not missing_answer) and (not missing_confidence) + return ParsedAgentXML( + answer=answer, + confidence=conf_val, + rationale=rationale, + parse_ok=parse_ok, + missing_answer=missing_answer, + missing_confidence=missing_confidence, + ) def resolve_option(raw_answer: str, options: list[str]) -> str: diff --git a/backend/api/screen/prompts.py b/backend/api/screen/prompts.py index 
97861767..b7208f87 100644 --- a/backend/api/screen/prompts.py +++ b/backend/api/screen/prompts.py @@ -134,6 +134,9 @@ Additional guidance: {xtra} +CRITICAL PROMPT ADDITIONS (SR-scoped): +{critical_additions} + Output requirement: Return ONLY the following XML tags (no Markdown, no extra prose): ... @@ -199,6 +202,9 @@ Additional guidance: {xtra} +CRITICAL PROMPT ADDITIONS (SR-scoped): +{critical_additions} + Full text (numbered sentences): {fulltext} diff --git a/backend/api/screen/router.py b/backend/api/screen/router.py index 3e7f27da..6da5bf1e 100644 --- a/backend/api/screen/router.py +++ b/backend/api/screen/router.py @@ -1,10 +1,12 @@ from typing import Any, Dict, List, Optional, Tuple +import math import json import re from datetime import datetime import logging from fastapi import APIRouter, Depends, HTTPException, status from fastapi.concurrency import run_in_threadpool +from fastapi.responses import Response from pydantic import BaseModel, Field from ..services.sr_db_service import srdb_service @@ -51,6 +53,7 @@ class ScreeningMetricsCriterion(BaseModel): critical_disagreement_count: int confident_exclude_count: int needs_human_review_count: int + accuracy: Optional[float] = None class ScreeningMetricsSummary(BaseModel): @@ -61,11 +64,77 @@ class ScreeningMetricsSummary(BaseModel): validated_needs_review: int unvalidated_needs_review: int needs_review_total: int + not_screened_yet: int + auto_excluded: int class ScreeningMetricsResponse(BaseModel): sr_id: str steps: Dict[str, Any] + warnings: Optional[List[Dict[str, Any]]] = None + + +class CalibrationPoint(BaseModel): + threshold: float + tp: int + fp: int + fn: int + tn: int + precision: Optional[float] = None + recall: Optional[float] = None + fpr: Optional[float] = None + tpr: Optional[float] = None + workload_reduction: Optional[float] = None + + +class CalibrationHistogramBin(BaseModel): + bin_start: float + bin_end: float + agree: int + disagree: int + + +class 
CalibrationCriterionResponse(BaseModel): + criterion_key: str + label: str + validated_n: int + recommended_threshold: Optional[float] = None + recommended_reason: Optional[str] = None + curve: List[CalibrationPoint] + histogram: List[CalibrationHistogramBin] + + +class CalibrationResponse(BaseModel): + sr_id: str + step: str + criteria: List[CalibrationCriterionResponse] + + +class CalibrationSampleRow(BaseModel): + citation_id: int + criterion_key: str + label: str + validated: bool + confidence: Optional[float] = None + ai_answer: Optional[str] = None + human_selected: Optional[str] = None + agrees: Optional[bool] = None + bucket: Optional[str] = None # tp/fp/fn/tn given a threshold + + +class CalibrationSamplesResponse(BaseModel): + sr_id: str + step: str + threshold: float + rows: List[CalibrationSampleRow] + + +def _csv_escape(v: Any) -> str: + s = "" if v is None else str(v) + # RFC 4180 basic escaping + if any(ch in s for ch in [",", "\n", "\r", '"']): + s = '"' + s.replace('"', '""') + '"' + return s def _normalize_int_list(v: Any) -> List[int]: @@ -213,6 +282,34 @@ def _is_exclude_answer(ans: Any) -> bool: return "(exclude)" in s.lower() +def _parse_selected_from_human_payload(v: Any) -> Optional[str]: + """Extract the human label (selected option) from a human_{criterion_key} cell. + + Stored value is usually JSONB like: + {"selected": "...", "confidence": ..., ...} + but some deployments might store a plain string. 
+ """ + if v is None: + return None + if isinstance(v, str): + s = v.strip() + if not s: + return None + # Try JSON first + try: + obj = json.loads(s) + if isinstance(obj, dict): + sel = obj.get("selected") + return str(sel).strip() if isinstance(sel, str) else None + except Exception: + return s + return None + if isinstance(v, dict): + sel = v.get("selected") + return str(sel).strip() if isinstance(sel, str) else None + return None + + def _criterion_key_from_question(question: str) -> str: # Keep in sync with the frontend derivation in l2-screen view. q = str(question or "") @@ -685,6 +782,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": screening_parsed.confidence, "rationale": screening_parsed.rationale, "raw_response": screening_raw, + "guardrails": _build_guardrails(screening_parsed, raw_text=screening_raw, stage="screening"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -699,6 +797,18 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") # 2) critical + critical_additions = "" + try: + cpa = sr.get("critical_prompt_additions") or {} + if isinstance(cpa, dict): + block = cpa.get("l1") + if isinstance(block, dict): + critical_additions = str(block.get(criterion_key) or "") + except Exception: + critical_additions = "" + if not critical_additions.strip(): + critical_additions = "(none)" + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) critical_listed = "\n".join(critical_opts) critical_prompt = PROMPT_XML_TEMPLATE_TA_CRITICAL.format( @@ -707,6 +817,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: screening_answer=screening_answer, options=critical_listed, xtra=xtra or "", + critical_additions=critical_additions, ) critical_raw, critical_usage, critical_latency 
= await _call_llm(critical_prompt) critical_parsed = parse_agent_xml(critical_raw) @@ -728,6 +839,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": critical_parsed.confidence, "rationale": critical_parsed.rationale, "raw_response": critical_raw, + "guardrails": _build_guardrails(critical_parsed, raw_text=critical_raw, stage="critical"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -1075,6 +1187,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": screening_parsed.confidence, "rationale": screening_parsed.rationale, "raw_response": screening_raw, + "guardrails": _build_guardrails(screening_parsed, raw_text=screening_raw, stage="screening"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -1089,6 +1202,18 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") # 2) critical + critical_additions = "" + try: + cpa = sr.get("critical_prompt_additions") or {} + if isinstance(cpa, dict): + block = cpa.get("l2") + if isinstance(block, dict): + critical_additions = str(block.get(criterion_key) or "") + except Exception: + critical_additions = "" + if not critical_additions.strip(): + critical_additions = "(none)" + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) critical_listed = "\n".join(critical_opts) critical_prompt = PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL.format( @@ -1096,6 +1221,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: screening_answer=screening_answer, options=critical_listed, xtra=xtra or "", + critical_additions=critical_additions, fulltext=fulltext, tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", figures="\n".join(figures_lines) if figures_lines else 
"(none)", @@ -1119,6 +1245,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": critical_parsed.confidence, "rationale": critical_parsed.rationale, "raw_response": critical_raw, + "guardrails": _build_guardrails(critical_parsed, raw_text=critical_raw, stage="critical"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -1231,9 +1358,11 @@ async def get_screening_metrics( - Each criterion uses its own threshold (from SR.screening_thresholds[step][criterion_key]). - Needs-human-review logic: - 1) If ANY criterion is a confident exclude => no human review needed for the citation. - 2) Else if ANY criterion has critical disagreement => needs review. - 3) Else if ANY criterion is low confidence (below its threshold) => needs review. + 0) Not screened yet: if no agent runs exist for this step/pipeline. + 1) Auto-excluded if ANY criterion is a confident exclude AND critical agrees: + screening answer contains '(exclude)' AND screening_conf >= threshold AND critical answer == 'None of the above'. + 2) Else needs review if ANY criterion has critical disagreement (critical answer != 'None of the above'). + 3) Else needs review if ANY criterion is low confidence (below its threshold). """ step_norm = str(step or "l1").lower().strip() @@ -1290,6 +1419,40 @@ async def get_screening_metrics( needed_cols.extend([validations_col, legacy_validated_by]) + # Phase 2: canonical human labels (per criterion) live in human_{criterion_key} JSONB. + # We only need these to compute validated-set agreement metrics. + human_cols: Dict[str, str] = {} + for c in criteria: + ck = c["criterion_key"] + col = f"human_{ck}" if ck else "human_col" + human_cols[ck] = col + needed_cols.append(col) + + warnings: List[Dict[str, Any]] = [] + # Legacy safety: do NOT attempt to fabricate agent runs. 
+ # If legacy llm_* outputs exist but normalized runs are missing, we warn the UI + # so the user can run run-all (which will force overwrite and create real runs). + try: + legacy_needs = await run_in_threadpool( + cits_dp_service.legacy_needs_rerun, + sr_id=sr_id, + table_name=table_name, + criteria_parsed=cp, + step=step_norm, + ) + if legacy_needs: + warnings.append( + { + "code": "LEGACY_DATA_NEEDS_RUN_ALL", + "severity": "warning", + "message": "Legacy screening results detected (llm_* columns) but agentic runs are missing. Please run Run-all to regenerate results.", + "sr_id": sr_id, + "step": step_norm, + } + ) + except Exception: + pass + # We'll compute per-citation needs-review based on agent runs only. # Fetch latest runs for all citations (bulk query using service helper) pipeline_norm = "title_abstract" if step_norm == "l1" else "fulltext" @@ -1352,13 +1515,24 @@ def _is_validated(row: Dict[str, Any]) -> bool: "low_confidence_count": 0, "critical_disagreement_count": 0, "confident_exclude_count": 0, + # Count of citations where THIS criterion triggered needs-review. "needs_human_review_count": 0, + # Validated-set agreement counts (AI screening vs canonical human label). + "human_agree_count": 0, + "human_total_count": 0, + + # Fallback proxy when human labels are not available: + # count how often critical agrees with screening. 
+ "crit_agree_count": 0, + "crit_total_count": 0, } total_citations = 0 validated_all = 0 needs_review_total = 0 validated_needs_review = 0 + not_screened_yet = 0 + auto_excluded = 0 # Iterate citations and compute needs-review + per-criterion counts for row in rows or []: @@ -1373,10 +1547,16 @@ def _is_validated(row: Dict[str, Any]) -> bool: per_crit = runs_by_cit.get(cid, {}) + # Bucket 1: Not screened yet (no runs at all) + if not per_crit: + not_screened_yet += 1 + continue + # Evaluate confident exclude override has_confident_exclude = False has_critical_disagreement = False has_low_confidence = False + has_guardrail_issue = False for c in criteria: ck = c["criterion_key"] @@ -1386,6 +1566,8 @@ def _is_validated(row: Dict[str, Any]) -> bool: continue a["total_citations"] += 1 + triggered_this_criterion = False + rpair = per_crit.get(ck) or {} scr = rpair.get("screening") crit = rpair.get("critical") @@ -1399,26 +1581,83 @@ def _is_validated(row: Dict[str, Any]) -> bool: conf_f = None ans = scr.get("answer") + # If citation is validated and a canonical human label exists, compute agreement. + if validated: + hcol = human_cols.get(ck) or f"human_{ck}" + human_sel = _parse_selected_from_human_payload(row.get(hcol)) + if human_sel is not None: + a["human_total_count"] += 1 + # Agreement definition: exact string match after stripping. + if str(human_sel).strip() == str(ans or "").strip(): + a["human_agree_count"] += 1 + if conf_f is not None and conf_f < thr: a["low_confidence_count"] += 1 has_low_confidence = True + # This criterion triggers review for this citation. + triggered_this_criterion = True - if conf_f is not None and conf_f >= thr and _is_exclude_answer(ans): + # Guardrails: missing/failed parse should be treated as needs review. 
+ try: + g = scr.get("guardrails") + if isinstance(g, str): + g = json.loads(g) + if isinstance(g, dict): + if g.get("parse_ok") is False or g.get("missing_answer") or g.get("missing_confidence"): + has_guardrail_issue = True + triggered_this_criterion = True + except Exception: + # If guardrails column exists but is unparsable, treat as issue. + if scr.get("guardrails") is not None: + has_guardrail_issue = True + triggered_this_criterion = True + + # Confident exclude requires critical agreement + crit_has = bool(crit) and str(crit.get("answer") or "").strip() != "" + crit_agrees = crit_has and (not _is_disagreeing_critical_answer(crit.get("answer"))) + if crit_has: + a["crit_total_count"] += 1 + if crit_agrees: + a["crit_agree_count"] += 1 + if conf_f is not None and conf_f >= thr and _is_exclude_answer(ans) and crit_agrees: a["confident_exclude_count"] += 1 has_confident_exclude = True - if crit and _is_disagreeing_critical_answer(crit.get("answer")): + # Treat missing/empty critical as disagreement/parse issue (conservative). 
+ if not crit or str(crit.get("answer") or "").strip() == "": a["critical_disagreement_count"] += 1 has_critical_disagreement = True + triggered_this_criterion = True + elif _is_disagreeing_critical_answer(crit.get("answer")): + a["critical_disagreement_count"] += 1 + has_critical_disagreement = True + triggered_this_criterion = True + + # Guardrails on critical stage + try: + if crit: + g2 = crit.get("guardrails") + if isinstance(g2, str): + g2 = json.loads(g2) + if isinstance(g2, dict): + if g2.get("parse_ok") is False or g2.get("missing_answer") or g2.get("missing_confidence"): + has_guardrail_issue = True + triggered_this_criterion = True + except Exception: + if crit and crit.get("guardrails") is not None: + has_guardrail_issue = True + triggered_this_criterion = True - needs_review = (not has_confident_exclude) and (has_critical_disagreement or has_low_confidence) + if triggered_this_criterion: + a["needs_human_review_count"] += 1 + + if has_confident_exclude: + auto_excluded += 1 + needs_review = (not has_confident_exclude) and (has_critical_disagreement or has_low_confidence or has_guardrail_issue) if needs_review: needs_review_total += 1 if validated: validated_needs_review += 1 - # increment per-criterion needs-review count for all criteria - for c in criteria: - agg[c["criterion_key"]]["needs_human_review_count"] += 1 unvalidated_all = max(0, total_citations - validated_all) unvalidated_needs_review = max(0, needs_review_total - validated_needs_review) @@ -1428,11 +1667,25 @@ def _is_validated(row: Dict[str, Any]) -> bool: for c in criteria: ck = c["criterion_key"] a = agg.get(ck) or {} + # Prefer human-vs-AI agreement on the validated set when available. + # Fallback to critical-agreement proxy when no human labels exist yet. 
+ try: + h_total = int(a.get("human_total_count") or 0) + h_agree = int(a.get("human_agree_count") or 0) + if h_total > 0: + accuracy = (h_agree / h_total) + else: + crit_total = int(a.get("crit_total_count") or 0) + crit_agree = int(a.get("crit_agree_count") or 0) + accuracy = (crit_agree / crit_total) if crit_total > 0 else None + except Exception: + accuracy = None crit_out.append( { "criterion_key": ck, "label": c["label"], "threshold": float(c["threshold"]), + "accuracy": accuracy, **a, } ) @@ -1449,12 +1702,577 @@ def _is_validated(row: Dict[str, Any]) -> bool: "needs_review_total": needs_review_total, "validated_needs_review": validated_needs_review, "unvalidated_needs_review": unvalidated_needs_review, + "not_screened_yet": not_screened_yet, + "auto_excluded": auto_excluded, }, "criteria": crit_out, } }, + warnings=warnings or None, ) + +def _safe_div(n: float, d: float) -> Optional[float]: + try: + if d == 0: + return None + return n / d + except Exception: + return None + + +def _clip01(v: Any, default: float = 0.0) -> float: + try: + x = float(v) + if math.isnan(x) or math.isinf(x): + return float(default) + return max(0.0, min(1.0, x)) + except Exception: + return float(default) + + +def _parse_confidence(v: Any) -> Optional[float]: + if v is None: + return None + try: + x = float(v) + if math.isnan(x) or math.isinf(x): + return None + return max(0.0, min(1.0, x)) + except Exception: + return None + + +def _build_guardrails(parsed: Any, *, raw_text: str, stage: str) -> Dict[str, Any]: + """Build a compact guardrails payload for persisting with screening_agent_runs.""" + raw = str(raw_text or "") + out: Dict[str, Any] = { + "schema_version": "v1", + "stage": str(stage or ""), + "parse_ok": bool(getattr(parsed, "parse_ok", False)), + "missing_answer": bool(getattr(parsed, "missing_answer", False)), + "missing_confidence": bool(getattr(parsed, "missing_confidence", False)), + "missing_rationale": not bool(str(getattr(parsed, "rationale", "") or 
"").strip()), + "raw_len": len(raw), + "has_answer_tag": " bool: + v = row.get(validations_col) + if v: + try: + parsed = v + if isinstance(v, str): + parsed = json.loads(v) + if isinstance(parsed, list) and len(parsed) > 0: + return True + except Exception: + pass + return bool(row.get(legacy_validated_by)) + + # Build validated examples per criterion: (confidence, agree_bool) + examples: Dict[str, List[Tuple[float, bool]]] = {c["criterion_key"]: [] for c in criteria} + for row in rows or []: + try: + cid = int(row.get("id")) + except Exception: + continue + if not _is_validated_row(row): + continue + scr_map = screening_by_cit.get(cid) or {} + for c in criteria: + ck = c["criterion_key"] + scr = scr_map.get(ck) + if not scr: + continue + conf = _parse_confidence(scr.get("confidence")) + if conf is None: + continue + ai_ans = str(scr.get("answer") or "").strip() + human_sel = _parse_selected_from_human_payload(row.get(human_cols.get(ck) or f"human_{ck}")) + if human_sel is None: + continue + agree = str(human_sel).strip() == ai_ans + examples[ck].append((conf, agree)) + + # Compute curve + histogram per criterion + out_criteria: List[CalibrationCriterionResponse] = [] + for c in criteria: + ck = c["criterion_key"] + label = c["label"] + ex = examples.get(ck) or [] + validated_n = len(ex) + + # Histogram bins + hist: List[CalibrationHistogramBin] = [] + if validated_n > 0: + for b in range(bins_n): + start = b / bins_n + end = (b + 1) / bins_n + agree_ct = 0 + disagree_ct = 0 + for conf, agree in ex: + # include 1.0 in last bin + in_bin = (conf >= start and conf < end) or (b == bins_n - 1 and conf == 1.0) + if not in_bin: + continue + if agree: + agree_ct += 1 + else: + disagree_ct += 1 + hist.append( + CalibrationHistogramBin( + bin_start=round(start, 6), + bin_end=round(end, 6), + agree=agree_ct, + disagree=disagree_ct, + ) + ) + else: + for b in range(bins_n): + start = b / bins_n + end = (b + 1) / bins_n + 
hist.append(CalibrationHistogramBin(bin_start=round(start, 6), bin_end=round(end, 6), agree=0, disagree=0)) + + curve: List[CalibrationPoint] = [] + best_thr: Optional[float] = None + best_score: Optional[float] = None + best_recall: Optional[float] = None + + for thr in thr_list: + tp = fp = fn = tn = 0 + # Review queue size for this criterion at this threshold = count(conf < thr) among validated examples. + # Workload reduction proxy: 1 - queue/total. + queue = 0 + + for conf, agree in ex: + pred_pos = conf >= thr + if conf < thr: + queue += 1 + + if pred_pos and agree: + tp += 1 + elif pred_pos and not agree: + fp += 1 + elif (not pred_pos) and agree: + fn += 1 + else: + tn += 1 + + precision = _safe_div(tp, tp + fp) + recall = _safe_div(tp, tp + fn) + fpr = _safe_div(fp, fp + tn) + tpr = recall + workload_reduction = None + if validated_n > 0: + workload_reduction = 1.0 - (queue / validated_n) + + curve.append( + CalibrationPoint( + threshold=float(thr), + tp=tp, + fp=fp, + fn=fn, + tn=tn, + precision=precision, + recall=recall, + fpr=fpr, + tpr=tpr, + workload_reduction=workload_reduction, + ) + ) + + # Choose recommended threshold by maximizing Youden's J; tie-break by higher recall. 
+ if recall is None or fpr is None: + continue + score = recall - fpr + if best_score is None or score > best_score + 1e-9: + best_score = score + best_thr = thr + best_recall = recall + elif best_score is not None and abs(score - best_score) <= 1e-9: + # tie-break: higher recall + if best_recall is None or recall > best_recall + 1e-9: + best_thr = thr + best_recall = recall + + reason = None + if best_thr is not None: + reason = "max_youden_j (tpr-fpr), tie-break: max recall" + + out_criteria.append( + CalibrationCriterionResponse( + criterion_key=ck, + label=label, + validated_n=validated_n, + recommended_threshold=float(best_thr) if best_thr is not None else None, + recommended_reason=reason, + curve=curve, + histogram=hist, + ) + ) + + return CalibrationResponse(sr_id=sr_id, step=step_norm, criteria=out_criteria) + + +@router.get("/calibration/samples") +async def get_calibration_samples( + sr_id: str, + step: str = "l1", + threshold: float = 0.9, + criterion_key: Optional[str] = None, + limit: int = 200, + format: str = "json", + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Return calibration sample rows (validated citations only) for auditing. + + This endpoint is meant for exporting / debugging calibration behavior. 
+ + Definitions: + - human label: `human_{criterion_key}.selected` + - AI label: latest `stage=screening` answer for this pipeline + - agrees: AI answer == human selected + - bucket at given threshold (positive == agreement, predicted positive == confidence >= threshold): + tp: pred_pos and agrees + fp: pred_pos and not agrees + fn: not pred_pos and agrees + tn: not pred_pos and not agrees + + Query params: + - sr_id: SR id + - step: l1|l2 + - threshold: float [0,1] + - criterion_key: optional filter for a single criterion + - limit: max rows returned (default 200, max 2000) + - format: json|csv + """ + + step_norm = str(step or "l1").lower().strip() + if step_norm not in {"l1", "l2"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be l1 or l2") + + thr = _clip01(threshold, default=0.9) + lim = max(1, min(2000, int(limit or 200))) + fmt = str(format or "json").lower().strip() + if fmt not in {"json", "csv"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="format must be json or csv") + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + # Criteria questions for step + cp = sr.get("criteria_parsed") or {} + crit_block = cp.get(step_norm) if isinstance(cp, dict) else None + questions = (crit_block or {}).get("questions") if isinstance(crit_block, dict) else [] + questions = questions if isinstance(questions, list) else [] + + criteria: List[Dict[str, str]] = [] + for q in questions: + if not isinstance(q, str) or not q.strip(): + continue + ck = _criterion_key_from_question(q) + if criterion_key and str(criterion_key).strip() != ck: + continue + criteria.append({"criterion_key": ck, "label": q}) + + if criterion_key and not criteria: + raise 
HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Unknown criterion_key for this step") + + # Determine SR scope ids for step + filter_step = "" + if step_norm == "l2": + filter_step = "l1" + try: + ids = await run_in_threadpool(cits_dp_service.list_citation_ids, filter_step, table_name) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to list citations: {e}") + + validations_col = f"{step_norm}_validations" + legacy_validated_by = f"{step_norm}_validated_by" + + # Build columns for row fetch + needed_cols: List[str] = ["id", validations_col, legacy_validated_by] + human_cols: Dict[str, str] = {} + for c in criteria: + ck = c["criterion_key"] + hcol = f"human_{ck}" if ck else "human_col" + human_cols[ck] = hcol + needed_cols.append(hcol) + + # Load latest screening runs for all ids + pipeline_norm = "title_abstract" if step_norm == "l1" else "fulltext" + try: + runs = await run_in_threadpool( + cits_dp_service.list_latest_agent_runs, + sr_id=sr_id, + table_name=table_name, + citation_ids=ids, + pipeline=pipeline_norm, + ) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load agent runs: {e}") + + # Group screening runs by citation then criterion + screening_by_cit: Dict[int, Dict[str, Dict[str, Any]]] = {} + for r in runs or []: + try: + cid = int(r.get("citation_id")) + except Exception: + continue + if str(r.get("stage") or "") != "screening": + continue + ck = str(r.get("criterion_key") or "") + if not ck: + continue + if criterion_key and ck != str(criterion_key).strip(): + continue + if cid not in screening_by_cit: + screening_by_cit[cid] = {} + screening_by_cit[cid][ck] = r + + # Load citation rows + try: + rows = await run_in_threadpool(cits_dp_service.get_citations_by_ids, ids, table_name, needed_cols) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed 
to load citation rows: {e}") + + def _is_validated_row(row: Dict[str, Any]) -> bool: + v = row.get(validations_col) + if v: + try: + parsed = v + if isinstance(v, str): + parsed = json.loads(v) + if isinstance(parsed, list) and len(parsed) > 0: + return True + except Exception: + pass + return bool(row.get(legacy_validated_by)) + + out_rows: List[CalibrationSampleRow] = [] + for row in rows or []: + if len(out_rows) >= lim: + break + try: + cid = int(row.get("id")) + except Exception: + continue + if not _is_validated_row(row): + continue + scr_map = screening_by_cit.get(cid) or {} + for c in criteria: + if len(out_rows) >= lim: + break + ck = c["criterion_key"] + scr = scr_map.get(ck) + if not scr: + continue + conf = _parse_confidence(scr.get("confidence")) + ai_ans = str(scr.get("answer") or "").strip() if scr.get("answer") is not None else None + human_sel = _parse_selected_from_human_payload(row.get(human_cols.get(ck) or f"human_{ck}")) + if human_sel is None: + continue + agrees = (str(human_sel).strip() == str(ai_ans or "").strip()) + pred_pos = (conf is not None) and (conf >= thr) + if pred_pos and agrees: + bucket = "tp" + elif pred_pos and (not agrees): + bucket = "fp" + elif (not pred_pos) and agrees: + bucket = "fn" + else: + bucket = "tn" + + out_rows.append( + CalibrationSampleRow( + citation_id=cid, + criterion_key=ck, + label=c["label"], + validated=True, + confidence=conf, + ai_answer=ai_ans, + human_selected=human_sel, + agrees=agrees, + bucket=bucket, + ) + ) + + if fmt == "json": + return CalibrationSamplesResponse(sr_id=sr_id, step=step_norm, threshold=thr, rows=out_rows) + + # CSV format + header = [ + "citation_id", + "criterion_key", + "label", + "confidence", + "ai_answer", + "human_selected", + "agrees", + "bucket", + ] + lines = [",".join(header)] + for r in out_rows: + lines.append( + ",".join( + [ + _csv_escape(r.citation_id), + _csv_escape(r.criterion_key), + _csv_escape(r.label), + _csv_escape(r.confidence), + 
_csv_escape(r.ai_answer), + _csv_escape(r.human_selected), + _csv_escape(r.agrees), + _csv_escape(r.bucket), + ] + ) + ) + csv_bytes = ("\n".join(lines) + "\n").encode("utf-8") + return Response(content=csv_bytes, media_type="text/csv") + async def update_inclusion_decision( sr: Dict[str, Any], citation_id: int, diff --git a/backend/api/services/cit_db_service.py b/backend/api/services/cit_db_service.py index 1e3f28e4..d928285f 100644 --- a/backend/api/services/cit_db_service.py +++ b/backend/api/services/cit_db_service.py @@ -254,11 +254,21 @@ def ensure_screening_agent_runs_table(self) -> None: input_tokens INT, output_tokens INT, cost_usd DOUBLE PRECISION, + guardrails JSONB, created_at TIMESTAMPTZ DEFAULT now() ) """ ) + # Runtime schema evolution for existing deployments + try: + cur.execute("ALTER TABLE screening_agent_runs ADD COLUMN IF NOT EXISTS guardrails JSONB") + except Exception: + try: + cur.execute("ALTER TABLE screening_agent_runs ADD COLUMN guardrails JSONB") + except Exception: + pass + # A couple of pragmatic indexes for common lookups. 
cur.execute( """ @@ -327,12 +337,12 @@ def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: id, sr_id, table_name, citation_id, pipeline, criterion_key, stage, answer, confidence, rationale, raw_response, model, prompt_version, temperature, top_p, seed, - latency_ms, input_tokens, output_tokens, cost_usd, created_at + latency_ms, input_tokens, output_tokens, cost_usd, guardrails, created_at ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, - %s, %s, %s, %s, %s + %s, %s, %s, %s, %s, %s ) """, ( @@ -356,6 +366,7 @@ def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: run.get("input_tokens"), run.get("output_tokens"), run.get("cost_usd"), + json.dumps(run.get("guardrails")) if run.get("guardrails") is not None else None, run.get("created_at") or datetime.utcnow().isoformat() + "Z", ), ) @@ -368,6 +379,106 @@ def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: if conn: pass + def agent_runs_exist(self, *, sr_id: str, table_name: str, pipeline: str) -> bool: + """Return True if we have any normalized agent runs for this SR+table+pipeline.""" + + self._require_psycopg2() + self.ensure_screening_agent_runs_table() + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute( + """ + SELECT 1 + FROM screening_agent_runs + WHERE sr_id=%s AND table_name=%s AND pipeline=%s + LIMIT 1 + """, + (str(sr_id), str(table_name), str(pipeline)), + ) + return cur.fetchone() is not None + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def legacy_llm_outputs_exist_for_step( + self, + *, + table_name: str, + criteria_parsed: Dict[str, Any], + step: str, + ) -> bool: + """Return True if any legacy llm_* JSONB columns for this step contain data.""" + + table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() + if not self.table_exists(table_name): + return False + + step_norm = str(step or "").lower().strip() + if step_norm not in {"l1", 
"l2"}: + return False + + qs = (((criteria_parsed or {}).get(step_norm) or {}).get("questions") or []) + if not isinstance(qs, list) or not qs: + return False + + # Determine which llm_* columns exist + cols_meta = self.get_table_columns(table_name) + existing_cols = {c.get("column_name") for c in cols_meta if c and c.get("column_name")} + llm_cols = [] + for q in qs: + if not isinstance(q, str) or not q.strip(): + continue + col = snake_case_column(q) + if col in existing_cols: + llm_cols.append(col) + if not llm_cols: + return False + + # Any non-null legacy output? + or_sql = " OR ".join([f'"{c}" IS NOT NULL' for c in llm_cols]) + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute(f'SELECT 1 FROM "{table_name}" WHERE {or_sql} LIMIT 1') + return cur.fetchone() is not None + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def legacy_needs_rerun( + self, + *, + sr_id: str, + table_name: str, + criteria_parsed: Dict[str, Any], + step: str, + ) -> bool: + """Return True when legacy llm_* outputs exist but normalized runs do not. 
+ + This is the signal to: + - warn the user that they must run run-all + - auto-enable force overwrite for run-all + """ + + step_norm = str(step or "").lower().strip() + if step_norm not in {"l1", "l2"}: + return False + pipeline = "title_abstract" if step_norm == "l1" else "fulltext" + legacy = self.legacy_llm_outputs_exist_for_step(table_name=table_name, criteria_parsed=criteria_parsed, step=step_norm) + if not legacy: + return False + return not self.agent_runs_exist(sr_id=sr_id, table_name=table_name, pipeline=pipeline) + def list_latest_agent_runs( self, *, @@ -416,6 +527,7 @@ def list_latest_agent_runs( answer, confidence, rationale, + guardrails, model, prompt_version, temperature, diff --git a/backend/api/services/sr_db_service.py b/backend/api/services/sr_db_service.py index 013a4c32..b1e3a2d5 100644 --- a/backend/api/services/sr_db_service.py +++ b/backend/api/services/sr_db_service.py @@ -51,6 +51,7 @@ def ensure_table_exists(self) -> None: criteria_yaml TEXT, criteria_parsed JSONB, screening_thresholds JSONB, + critical_prompt_additions JSONB, screening_db JSONB, created_at TIMESTAMP WITH TIME ZONE DEFAULT now(), updated_at TIMESTAMP WITH TIME ZONE DEFAULT now() @@ -72,6 +73,18 @@ def ensure_table_exists(self) -> None: ) except Exception: pass + + try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN IF NOT EXISTS critical_prompt_additions JSONB" + ) + except Exception: + try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN critical_prompt_additions JSONB" + ) + except Exception: + pass conn.commit() logger.info("Ensured systematic_reviews table exists") @@ -202,7 +215,7 @@ def create_systematic_review( insert_sql = """ INSERT INTO systematic_reviews (id, name, description, owner_id, owner_email, users, visible, - criteria, criteria_yaml, criteria_parsed, screening_thresholds, created_at, updated_at) + criteria, criteria_yaml, criteria_parsed, screening_thresholds, critical_prompt_additions, created_at, updated_at) VALUES (%s, 
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ @@ -218,6 +231,7 @@ def create_systematic_review( criteria_str, json.dumps(criteria_parsed), json.dumps({"l1": {}, "l2": {}, "parameters": {}}), + json.dumps({"l1": {}, "l2": {}}), now, now )) @@ -239,6 +253,8 @@ def create_systematic_review( sr_doc['criteria_parsed'] = json.loads(sr_doc['criteria_parsed']) if sr_doc.get('screening_thresholds') and isinstance(sr_doc['screening_thresholds'], str): sr_doc['screening_thresholds'] = json.loads(sr_doc['screening_thresholds']) + if sr_doc.get('critical_prompt_additions') and isinstance(sr_doc['critical_prompt_additions'], str): + sr_doc['critical_prompt_additions'] = json.loads(sr_doc['critical_prompt_additions']) # Convert datetime objects to ISO strings from datetime import datetime as dt if sr_doc.get('created_at') and isinstance(sr_doc['created_at'], dt): @@ -522,6 +538,8 @@ def list_systematic_reviews_for_user(self, user_email: str) -> List[Dict[str, An doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) + if doc.get('critical_prompt_additions') and isinstance(doc['critical_prompt_additions'], str): + doc['critical_prompt_additions'] = json.loads(doc['critical_prompt_additions']) # Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and isinstance(doc['created_at'], dt): @@ -582,6 +600,8 @@ def get_systematic_review(self, sr_id: str, ignore_visibility: bool = False) -> doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) + if doc.get('critical_prompt_additions') and isinstance(doc['critical_prompt_additions'], str): + doc['critical_prompt_additions'] = json.loads(doc['critical_prompt_additions']) # 
Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and isinstance(doc['created_at'], dt): @@ -766,6 +786,39 @@ def update_screening_thresholds(self, sr_id: str, screening_thresholds: Dict[str if conn: pass + def update_critical_prompt_additions(self, sr_id: str, critical_prompt_additions: Dict[str, Any]) -> None: + """Persist SR-scoped critical prompt additions. + + Shape: + {"l1": {"criterion_key": "..."}, "l2": {"criterion_key": "..."}} + """ + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + + updated_at = datetime.utcnow().isoformat() + cur.execute( + "UPDATE systematic_reviews SET critical_prompt_additions = %s, updated_at = %s WHERE id = %s", + (json.dumps(critical_prompt_additions), updated_at, sr_id), + ) + conn.commit() + except Exception as e: + try: + if conn: + conn.rollback() + except Exception: + pass + logger.exception(f"Failed to update critical prompt additions: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to update critical prompt additions: {e}", + ) + finally: + if conn: + pass + def clear_screening_db_info(self, sr_id: str) -> None: """ Remove the screening_db field from the SR document. diff --git a/backend/api/sr/router.py b/backend/api/sr/router.py index c52416a2..05851733 100644 --- a/backend/api/sr/router.py +++ b/backend/api/sr/router.py @@ -59,6 +59,14 @@ class SystematicReviewRead(BaseModel): # } screening_thresholds: Optional[Dict[str, Any]] = None + # SR-scoped per-step per-criterion additions injected into CRITICAL prompts. 
+ # Shape: + # { + # "l1": {"criterion_key": "..."}, + # "l2": {"criterion_key": "..."} + # } + critical_prompt_additions: Optional[Dict[str, Any]] = None + @@ -144,6 +152,7 @@ async def create_systematic_review( criteria_yaml=sr_doc.get("criteria_yaml"), criteria_parsed=sr_doc.get("criteria_parsed"), screening_thresholds=sr_doc.get("screening_thresholds"), + critical_prompt_additions=sr_doc.get("critical_prompt_additions"), ) @@ -408,6 +417,10 @@ class ThresholdsUpdateRequest(BaseModel): screening_thresholds: Dict[str, Any] = {} +class CriticalPromptAdditionsUpdateRequest(BaseModel): + critical_prompt_additions: Dict[str, Any] = {} + + @router.get("/{sr_id}/screening_thresholds") async def get_screening_thresholds(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): """Get SR-scoped per-step per-criterion thresholds.""" @@ -470,6 +483,65 @@ async def update_screening_thresholds( return {"status": "success", "sr_id": sr_id, "screening_thresholds": normalized} +@router.get("/{sr_id}/critical_prompt_additions") +async def get_critical_prompt_additions(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): + """Get SR-scoped per-step per-criterion critical prompt additions.""" + + try: + doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + cpa = doc.get("critical_prompt_additions") or {} + if not isinstance(cpa, dict): + cpa = {} + return {"sr_id": sr_id, "critical_prompt_additions": cpa} + + +@router.put("/{sr_id}/critical_prompt_additions") +async def update_critical_prompt_additions( + sr_id: str, + payload: CriticalPromptAdditionsUpdateRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Update SR-scoped per-step per-criterion critical prompt additions. 
+ + Any SR member may update these (mirrors thresholds permissions). + """ + + try: + _doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + cpa = payload.critical_prompt_additions or {} + if not isinstance(cpa, dict): + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="critical_prompt_additions must be an object") + + normalized: Dict[str, Any] = {} + for step in ("l1", "l2"): + block = cpa.get(step) + if isinstance(block, dict): + out: Dict[str, str] = {} + for k, v in block.items(): + if not isinstance(k, str) or not k.strip(): + continue + if v is None: + out[k] = "" + else: + out[k] = str(v) + normalized[step] = out + else: + normalized[step] = {} + + await run_in_threadpool(srdb_service.update_critical_prompt_additions, sr_id, normalized) + return {"status": "success", "sr_id": sr_id, "critical_prompt_additions": normalized} + + @router.delete("/{sr_id}") async def delete_systematic_review(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): """ diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml index 1a8567a7..9479be17 100644 --- a/backend/docker-compose.yml +++ b/backend/docker-compose.yml @@ -60,7 +60,9 @@ services: ports: - "5432:5432" volumes: - - ./volumes/postgres:/var/lib/postgresql + # Use a named volume so we can reset with `docker compose down -v` + # without filesystem permission issues from bind-mounts. 
+ - backend_pgdata:/var/lib/postgresql/data healthcheck: test: ["CMD-SHELL", "pg_isready -U admin -d postgres -h localhost"] interval: 30s @@ -71,3 +73,6 @@ services: networks: default: driver: bridge + +volumes: + backend_pgdata: diff --git a/frontend/app/[lang]/can-sr/l1-screen/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/page.tsx index e5f5d7de..1c129b8c 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/page.tsx @@ -14,49 +14,39 @@ const buildCitationAiCalls: BuildCitationAiCalls = ({ criteria, getAuthHeaders, }) => { - const calls: AiCall[] = [] - - for (let i = 0; i < (criteria?.questions || []).length; i++) { - const question = criteria.questions[i] - const options = criteria.possible_answers?.[i] || [] - - calls.push({ - key: `l1_classify_${i}`, - label: `L1: ${question}`, + // Phase 2 wiring: L1 run-all uses the agentic orchestrator endpoint. + // We keep the existing “Run all AI” modal behavior, but instead of running per-question + // classify calls, we run a single orchestrated run per citation. 
+ return [ + { + key: `l1_agentic_run`, + label: `L1 agentic (screening + critical)`, run: async () => { const headers = { ...getAuthHeaders(), 'Content-Type': 'application/json', } - const res = await fetch( - `/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent( - srId, - )}&citation_id=${encodeURIComponent(String(citationId))}`, - { - method: 'POST', - headers, - body: JSON.stringify({ - question, - options, - include_columns: ['title', 'abstract'], - screening_step: 'l1', - model, - temperature: 0.0, - max_tokens: 2000, - }), - }, - ) + const res = await fetch('/api/can-sr/screen/title-abstract/run', { + method: 'POST', + headers, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model, + temperature: 0.0, + max_tokens: 1200, + prompt_version: 'v1', + }), + }) if (!res.ok) { const text = await res.text().catch(() => '') - throw new Error(text || `L1 classify failed (${res.status})`) + throw new Error(text || `L1 agentic run failed (${res.status})`) } }, - }) - } - - return calls + }, + ] } export default function L1ScreenPage() { diff --git a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx index 57891cb5..34111fad 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx @@ -450,57 +450,47 @@ export default function CanSrL1ScreenPage() { // Handler: call backend classify endpoint for a single question async function classifyQuestion(questionIndex: number) { if (!srId || !citationId || !criteriaData) return - const question = criteriaData.questions[questionIndex] - const options = criteriaData.possible_answers[questionIndex] || [] try { const headers = { 'Content-Type': 'application/json', ...getAuthHeaders(), } - const bodyPayload = { - question, - options, - include_columns: ['title', 'abstract'], - screening_step: 'l1', - } - const res = await fetch( - 
`/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( - citationId, - )}`, - { - method: 'POST', - headers, - body: JSON.stringify(bodyPayload), - }, - ) - const data = await res.json().catch(() => ({})) - // Expect the backend to return the classification_json or similar structure - // Try flexible extraction: - const classification = - data?.classification_json || - data?.result || - data?.classification || - data?.llm_classification || - data - if (classification && typeof classification === 'object') { - // Always show AI panel. - // IMPORTANT: do NOT overwrite an existing human selection in the UI. - if ((classification as any).selected !== undefined) { - setSelections((prev) => { - const already = prev?.[questionIndex] - if (already !== undefined && String(already).trim() !== '') return prev - return { ...prev, [questionIndex]: (classification as any).selected } - }) - } - setAiPanels((prev) => ({ ...prev, [questionIndex]: classification })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) - } else { - // If server returned a simple string, set it as selection - if (typeof data === 'string') { - setSelections((prev) => ({ ...prev, [questionIndex]: data })) + + // Phase 1->2 wiring: reuse the existing per-question “AI” button, but call the + // agentic orchestrator endpoint which runs BOTH screening + critical and persists + // them to screening_agent_runs. + const res = await fetch('/api/can-sr/screen/title-abstract/run', { + method: 'POST', + headers, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model: selectedModel, + temperature: 0.0, + max_tokens: 1200, + prompt_version: 'v1', + }), + }) + await res.json().catch(() => ({})) + + // Refresh latest runs + citation row so the UI shows critical + validations immediately. 
+ await fetchCitationById(String(citationId)) + + try { + const r2 = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('title_abstract')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers: getAuthHeaders() }, + ) + const j2 = await r2.json().catch(() => ({})) + if (r2.ok && Array.isArray(j2?.runs)) { + setAgentRuns(j2.runs as LatestAgentRun[]) } - setAiPanels((prev) => ({ ...prev, [questionIndex]: data || null })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) + } catch { + // ignore } } catch (err) { console.error('Classify API error', err) diff --git a/frontend/app/[lang]/can-sr/l2-screen/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/page.tsx index 81c8e5a8..1697fb0e 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/page.tsx @@ -56,39 +56,30 @@ const buildCitationAiCalls: BuildCitationAiCalls = async ({ }, }) - for (let i = 0; i < (criteria?.questions || []).length; i++) { - const question = criteria.questions[i] - const options = criteria.possible_answers?.[i] || [] - calls.push({ - key: `l2_classify_${i}`, - label: `L2: ${question}`, - run: async () => { - const res = await fetch( - `/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent( - srId, - )}&citation_id=${encodeURIComponent(String(citationId))}`, - { - method: 'POST', - headers: { ...headers, 'Content-Type': 'application/json' }, - body: JSON.stringify({ - question, - options, - include_columns: ['title', 'abstract'], - screening_step: 'l2', - model, - temperature: 0.0, - max_tokens: 2000, - }), - }, - ) - - if (!res.ok) { - const text = await res.text().catch(() => '') - throw new Error(text || `L2 classify failed (${res.status})`) - } - }, - }) - } + // Phase 2 wiring: run a single orchestrated fulltext screening+critical per citation. 
+ // (The backend reads SR criteria, so we do not need to fan out per-question calls.) + calls.push({ + key: `l2_agentic_run`, + label: `L2 agentic (screening + critical)`, + run: async () => { + const res = await fetch('/api/can-sr/screen/fulltext/run', { + method: 'POST', + headers: { ...headers, 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model, + temperature: 0.0, + max_tokens: 2000, + prompt_version: 'v1', + }), + }) + if (!res.ok) { + const text = await res.text().catch(() => '') + throw new Error(text || `L2 agentic run failed (${res.status})`) + } + }, + }) return calls } diff --git a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx index 9af2199e..b7bfde89 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx @@ -290,32 +290,35 @@ export default function CanSrL2ScreenViewPage() { // Load latest agent runs for this citation (screening + critical per criterion) useEffect(() => { if (!srId || !citationId) return - const loadRuns = async () => { - setLoadingRuns(true) - try { - const headers = getAuthHeaders() - const res = await fetch( - `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( - srId, - )}&pipeline=${encodeURIComponent('fulltext')}&citation_ids=${encodeURIComponent( - String(citationId), - )}`, - { method: 'GET', headers }, - ) - const data = await res.json().catch(() => ({})) - if (res.ok && Array.isArray(data?.runs)) { - setAgentRuns(data.runs as LatestAgentRun[]) - } else { - setAgentRuns([]) - } - } catch { + loadRuns() + }, [srId, citationId]) + + // Re-usable loader so we can refresh after triggering an agentic run. 
+ async function loadRuns() { + if (!srId || !citationId) return + setLoadingRuns(true) + try { + const headers = getAuthHeaders() + const res = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('fulltext')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers }, + ) + const data = await res.json().catch(() => ({})) + if (res.ok && Array.isArray(data?.runs)) { + setAgentRuns(data.runs as LatestAgentRun[]) + } else { setAgentRuns([]) - } finally { - setLoadingRuns(false) } + } catch { + setAgentRuns([]) + } finally { + setLoadingRuns(false) } - loadRuns() - }, [srId, citationId]) + } const runsByCriterion = useMemo(() => { const by: Record = {} @@ -567,65 +570,32 @@ export default function CanSrL2ScreenViewPage() { // Call backend classify for a single question using fulltext template (screening_step='l2') async function classifyQuestion(questionIndex: number) { if (!srId || !citationId || !criteriaData) return - const question = criteriaData.questions[questionIndex] - const options = criteriaData.possible_answers[questionIndex] || [] - const xtra = criteriaData.additional_infos?.[questionIndex] || '' try { const headers = { 'Content-Type': 'application/json', ...getAuthHeaders(), } - const bodyPayload: any = { - question, - options, - screening_step: 'l2', - xtra, - model: selectedModel, - temperature: 0.0, - max_tokens: 1200, - } - // Provide full text directly to backend to prevent include_columns=None error. - // If fulltext is not yet available, fall back to title/abstract to avoid backend crash. - bodyPayload.citation_text = fulltextStr - bodyPayload.include_columns = ['title', 'abstract'] + // Phase 2 wiring: reuse existing per-question “AI” button, but call the + // agentic orchestrator endpoint which runs BOTH screening + critical and persists + // them to screening_agent_runs. 
+ const res = await fetch('/api/can-sr/screen/fulltext/run', { + method: 'POST', + headers, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model: selectedModel, + temperature: 0.0, + max_tokens: 2000, + prompt_version: 'v1', + }), + }) + await res.json().catch(() => ({})) - const res = await fetch( - `/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( - String(citationId), - )}`, - { - method: 'POST', - headers, - body: JSON.stringify(bodyPayload), - }, - ) - const data = await res.json().catch(() => ({})) - const classification = - data?.classification_json || - data?.result || - data?.classification || - data?.llm_classification || - data - if (classification && typeof classification === 'object') { - // Always show AI panel. - // IMPORTANT: do NOT overwrite an existing human selection in the UI. - if ((classification as any).selected !== undefined) { - setSelections((prev) => { - const already = prev?.[questionIndex] - if (already !== undefined && String(already).trim() !== '') return prev - return { ...prev, [questionIndex]: (classification as any).selected } - }) - } - setAiPanels((prev) => ({ ...prev, [questionIndex]: classification })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) - } else { - if (typeof data === 'string') { - setSelections((prev) => ({ ...prev, [questionIndex]: data })) - } - setAiPanels((prev) => ({ ...prev, [questionIndex]: data || null })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) - } + // Refresh latest runs + citation row so the UI shows critical results immediately. 
+ await fetchCitationById(String(citationId)) + await loadRuns() } catch (err) { console.error('Classify API error', err) } diff --git a/frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts b/frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts new file mode 100644 index 00000000..835045cd --- /dev/null +++ b/frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts @@ -0,0 +1,59 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: + * GET /api/can-sr/reviews/critical-prompt-additions?sr_id= + * -> GET {BACKEND_URL}/api/sr//critical_prompt_additions + * PUT /api/can-sr/reviews/critical-prompt-additions?sr_id= + * body: { critical_prompt_additions: {...} } + * -> PUT {BACKEND_URL}/api/sr//critical_prompt_additions + */ + +function authHeaders(request: NextRequest): Record { + const auth = request.headers.get('authorization') + return auth ? { Authorization: auth } : {} +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + const url = `${BACKEND_URL}/api/sr/${encodeURIComponent(srId)}/critical_prompt_additions` + const res = await fetch(url, { method: 'GET', headers: authHeaders(request) }) + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (e) { + console.error('critical-prompt-additions GET error:', e) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} + +export async function PUT(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + + const body = await request.json().catch(() => ({})) + const url = 
`${BACKEND_URL}/api/sr/${encodeURIComponent(srId)}/critical_prompt_additions` + const res = await fetch(url, { + method: 'PUT', + headers: { + ...authHeaders(request), + 'Content-Type': 'application/json', + }, + body: JSON.stringify(body), + }) + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (e) { + console.error('critical-prompt-additions PUT error:', e) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/calibration/route.ts b/frontend/app/api/can-sr/screen/calibration/route.ts new file mode 100644 index 00000000..b29e3633 --- /dev/null +++ b/frontend/app/api/can-sr/screen/calibration/route.ts @@ -0,0 +1,66 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: GET /api/can-sr/screen/calibration?sr_id=&step=l1|l2&thresholds=...&bins=... + * -> GET {BACKEND_URL}/api/screen/calibration?sr_id=...&step=...&thresholds=...&bins=... 
+ */ + +export async function OPTIONS() { + return new Response(null, { + status: 204, + headers: { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET,OPTIONS', + 'Access-Control-Allow-Headers': 'Authorization, Content-Type', + }, + }) +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + const step = params.get('step') || 'l1' + const thresholds = params.get('thresholds') + const bins = params.get('bins') + + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const url = new URL(`${BACKEND_URL}/api/screen/calibration`) + url.searchParams.set('sr_id', srId) + url.searchParams.set('step', step) + if (thresholds) url.searchParams.set('thresholds', thresholds) + if (bins) url.searchParams.set('bins', bins) + + const res = await fetch(url.toString(), { + method: 'GET', + headers: { + Authorization: authHeader, + }, + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('screen calibration proxy GET error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/fulltext/run/route.ts b/frontend/app/api/can-sr/screen/fulltext/run/route.ts new file mode 100644 index 00000000..1743fd56 --- /dev/null +++ b/frontend/app/api/can-sr/screen/fulltext/run/route.ts @@ -0,0 +1,36 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: POST /api/can-sr/screen/fulltext/run + * -> POST {BACKEND_URL}/api/screen/fulltext/run + */ +export async function POST(request: NextRequest) { + try { + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const body = await request.json().catch(() => ({})) + + const url = `${BACKEND_URL}/api/screen/fulltext/run` + const res = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: authHeader, + }, + body: JSON.stringify(body), + }) + + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (err) { + console.error('fulltext/run proxy POST error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/title-abstract/run/route.ts b/frontend/app/api/can-sr/screen/title-abstract/run/route.ts new file mode 100644 index 00000000..a628192f --- /dev/null +++ b/frontend/app/api/can-sr/screen/title-abstract/run/route.ts @@ -0,0 +1,36 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: POST 
/api/can-sr/screen/title-abstract/run + * -> POST {BACKEND_URL}/api/screen/title-abstract/run + */ +export async function POST(request: NextRequest) { + try { + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const body = await request.json().catch(() => ({})) + + const url = `${BACKEND_URL}/api/screen/title-abstract/run` + const res = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: authHeader, + }, + body: JSON.stringify(body), + }) + + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (err) { + console.error('title-abstract/run proxy POST error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/components/can-sr/CitationListPage.tsx b/frontend/components/can-sr/CitationListPage.tsx index 3f99aa12..7b1cafb0 100644 --- a/frontend/components/can-sr/CitationListPage.tsx +++ b/frontend/components/can-sr/CitationListPage.tsx @@ -13,7 +13,9 @@ import ScreeningMetricsPanel, { type ScreeningMetricsStats, type ScreeningMetricsSummary, type ScreeningCriterionMetrics, + type CalibrationCriterion, } from '@/components/can-sr/ScreeningMetricsPanel' +import ScreeningMetricsModal from '@/components/can-sr/ScreeningMetricsModal' import { Dialog, DialogContent, @@ -79,17 +81,41 @@ export default function CitationsListPage({ const [error, setError] = useState(null) const [criteriaData, setCriteriaData] = useState() - // Phase 1 list control surface is now hosted by the left-side metrics module. + // Phase 1 single-threshold is deprecated; kept for backward compatibility. 
const [threshold, setThreshold] = useState(0.9) - const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') - const [pageStats, setPageStats] = useState(undefined) + const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'not_screened' | 'all'>('needs') + // page-local stats no longer shown (SR-wide progress bar is in metrics panel) + const [_pageStats, setPageStats] = useState(undefined) // Phase 2 metrics (SR-wide) const [srMetricsSummary, setSrMetricsSummary] = useState(undefined) const [srCriterionMetrics, setSrCriterionMetrics] = useState(undefined) - const [srThresholds, setSrThresholds] = useState | null>(null) + const [srCalibration, setSrCalibration] = useState(undefined) + const [_srThresholds, setSrThresholds] = useState | null>(null) + + // Backend warnings (e.g., legacy data needs run-all) + const [srWarnings, setSrWarnings] = useState(null) + + const legacyWarning = useMemo(() => { + const ws = Array.isArray(srWarnings) ? srWarnings : [] + return ( + ws.find((w) => String(w?.code || '').toUpperCase() === 'LEGACY_DATA_NEEDS_RUN_ALL') || + null + ) + }, [srWarnings]) + + // Silence eslint unused warnings for state that is intentionally retained for backwards-compatibility. + void _pageStats + void _srThresholds const [metricsRefreshKey, setMetricsRefreshKey] = useState(0) + const [metricsDrawerOpen, setMetricsDrawerOpen] = useState(false) + + // Draft editing: user can adjust thresholds locally, then click Save. 
+ const [draftThresholds, setDraftThresholds] = useState | null>(null) + const [thresholdsDirty, setThresholdsDirty] = useState(false) + const [savingThresholds, setSavingThresholds] = useState(false) + // Run-all job tracking (persist across modal close / refresh) const [runAllForce, setRunAllForce] = useState(false) const [runAllJobId, setRunAllJobId] = useState(null) @@ -206,6 +232,8 @@ export default function CitationsListPage({ const tJson = await tRes.json().catch(() => ({})) const thresholds = (tRes.ok ? tJson?.screening_thresholds : null) || {} setSrThresholds(typeof thresholds === 'object' && thresholds ? thresholds : {}) + setDraftThresholds(typeof thresholds === 'object' && thresholds ? thresholds : {}) + setThresholdsDirty(false) // 2) metrics const mRes = await fetch( @@ -219,14 +247,32 @@ export default function CitationsListPage({ const stepBlock = mJson?.steps?.[screeningStep] setSrMetricsSummary(stepBlock?.summary) setSrCriterionMetrics(stepBlock?.criteria) + setSrWarnings(Array.isArray(mJson?.warnings) ? 
mJson.warnings : null) } else { setSrMetricsSummary(undefined) setSrCriterionMetrics(undefined) + setSrWarnings(null) + } + + // 3) calibration (validated set) + const cRes = await fetch( + `/api/can-sr/screen/calibration?sr_id=${encodeURIComponent(srId)}&step=${encodeURIComponent( + screeningStep, + )}`, + { method: 'GET', headers }, + ) + const cJson = await cRes.json().catch(() => ({})) + if (cRes.ok && Array.isArray(cJson?.criteria)) { + setSrCalibration(cJson.criteria as CalibrationCriterion[]) + } else { + setSrCalibration(undefined) } } catch { setSrMetricsSummary(undefined) setSrCriterionMetrics(undefined) + setSrCalibration(undefined) setSrThresholds(null) + setSrWarnings(null) } } load() @@ -236,6 +282,7 @@ export default function CitationsListPage({ async (nextThresholds: Record) => { if (!srId) return try { + setSavingThresholds(true) const headers = { ...getAuthHeaders(), 'Content-Type': 'application/json' } const res = await fetch( `/api/can-sr/reviews/thresholds?sr_id=${encodeURIComponent(srId)}`, @@ -248,11 +295,15 @@ export default function CitationsListPage({ const j = await res.json().catch(() => ({})) if (res.ok) { setSrThresholds(j?.screening_thresholds || nextThresholds) + setDraftThresholds(j?.screening_thresholds || nextThresholds) + setThresholdsDirty(false) // Refresh metrics so counts reflect the new thresholds. setMetricsRefreshKey((k) => k + 1) } } catch { // ignore + } finally { + setSavingThresholds(false) } }, [srId], @@ -417,7 +468,32 @@ export default function CitationsListPage({ Layout: left floating/side metrics module + right list. (A true fixed overlay can be added later; this keeps it responsive and simple.) */} -
    +
    + {legacyWarning ? ( +
    +
    Legacy screening data detected
    +
    + {String(legacyWarning?.message || + 'This SR has legacy llm_* outputs but no agentic runs. Please run Run-all to regenerate results.')} +
    +
    + Tip: when legacy data is detected, Run-all will automatically force overwrite to generate real agent runs. +
    +
    + ) : null} + + + setRunAllModalOpen(false)}> @@ -457,32 +533,66 @@ export default function CitationsListPage({
    -