From 5d93ea74f02470180f645dc611edb57246ba6543 Mon Sep 17 00:00:00 2001 From: Bing1100 Date: Tue, 31 Mar 2026 23:01:30 -0400 Subject: [PATCH 1/3] feat(agents:AI agents for title/abstract screening and workload reduction --- backend/api/core/cit_utils.py | 11 + backend/api/screen/agentic_utils.py | 95 +++ backend/api/screen/prompts.py | 145 ++++ backend/api/screen/router.py | 666 +++++++++++++++++- backend/api/services/cit_db_service.py | 330 ++++++++- backend/docker-compose.yml | 5 +- backend/main.py | 10 + .../app/[lang]/can-sr/l1-screen/view/page.tsx | 160 +++++ .../app/[lang]/can-sr/l2-screen/view/page.tsx | 167 +++++ .../can-sr/screen/agent-runs/latest/route.ts | 67 ++ .../app/api/can-sr/screen/validate/route.ts | 44 ++ frontend/components/can-sr/PagedList.tsx | 165 ++++- frontend/package-lock.json | 129 ++-- 13 files changed, 1908 insertions(+), 86 deletions(-) create mode 100644 backend/api/screen/agentic_utils.py create mode 100644 frontend/app/api/can-sr/screen/agent-runs/latest/route.ts create mode 100644 frontend/app/api/can-sr/screen/validate/route.ts diff --git a/backend/api/core/cit_utils.py b/backend/api/core/cit_utils.py index 1e33ce13..5e0461ae 100644 --- a/backend/api/core/cit_utils.py +++ b/backend/api/core/cit_utils.py @@ -14,6 +14,7 @@ from fastapi.concurrency import run_in_threadpool from .config import settings +from ..services.cit_db_service import cits_dp_service def _is_postgres_configured() -> bool: @@ -94,5 +95,15 @@ async def load_sr_and_check( if not screening: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No screening database configured for this systematic review") + # Best-effort runtime schema evolution for agentic screening. + # CAN-SR uses per-upload screening tables, so we may need to add the + # validation columns to the specific table referenced by the SR. 
+    try:
+        table_name = (screening or {}).get("table_name") or "citations"
+        await run_in_threadpool(cits_dp_service.ensure_step_validation_columns, table_name)
+    except Exception:
+        # Don't block requests if the DB isn't ready/configured.
+        pass
+
     return sr, screening
diff --git a/backend/api/screen/agentic_utils.py b/backend/api/screen/agentic_utils.py
new file mode 100644
index 00000000..1250287b
--- /dev/null
+++ b/backend/api/screen/agentic_utils.py
@@ -0,0 +1,95 @@
+"""backend.api.screen.agentic_utils
+
+Utilities for the GREP-Agent style "screening + critical" workflow.
+
+We keep this module small and dependency-free so routers can reuse the helpers
+for title/abstract and fulltext pipelines.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class ParsedAgentXML:
+    answer: str
+    confidence: float
+    rationale: str
+    parse_ok: bool
+
+
+_TAG_RE_CACHE: dict[str, re.Pattern[str]] = {}
+
+
+def _tag_re(tag: str) -> re.Pattern[str]:
+    if tag not in _TAG_RE_CACHE:
+        _TAG_RE_CACHE[tag] = re.compile(rf"<{tag}>(.*?)</{tag}>", re.IGNORECASE | re.DOTALL)
+    return _TAG_RE_CACHE[tag]
+
+
+def parse_agent_xml(text: str) -> ParsedAgentXML:
+    """Parse <answer>, <confidence>, <rationale> tags from model output."""
+
+    raw = (text or "").strip()
+    ans_m = _tag_re("answer").search(raw)
+    conf_m = _tag_re("confidence").search(raw)
+    rat_m = _tag_re("rationale").search(raw)
+
+    answer = (ans_m.group(1).strip() if ans_m else "")
+    rationale = (rat_m.group(1).strip() if rat_m else "")
+
+    conf_val = 0.0
+    if conf_m:
+        try:
+            conf_val = float(conf_m.group(1).strip())
+        except Exception:
+            conf_val = 0.0
+    conf_val = max(0.0, min(1.0, conf_val))
+
+    parse_ok = bool(ans_m and conf_m)
+    return ParsedAgentXML(answer=answer, confidence=conf_val, rationale=rationale, parse_ok=parse_ok)
+
+
+def resolve_option(raw_answer: str, options: list[str]) -> str:
+    """Resolve a model answer to one of the provided options (best-effort)."""
+    ans = 
(raw_answer or "").strip()
+    if not ans:
+        return ans
+
+    # Exact match first
+    for opt in options or []:
+        if ans == opt:
+            return opt
+
+    # Case-insensitive exact
+    ans_l = ans.lower()
+    for opt in options or []:
+        if ans_l == (opt or "").lower():
+            return opt
+
+    # Substring containment (mirrors existing CAN-SR JSON screening logic)
+    for opt in options or []:
+        if (opt or "").lower() in ans_l:
+            return opt
+
+    return ans
+
+
+def build_critical_options(*, all_options: list[str], screening_answer: str) -> list[str]:
+    """Forced alternatives: (all_options - {screening_answer}) + ["None of the above"]."""
+    base = [o for o in (all_options or []) if (o or "").strip()]
+    sa = (screening_answer or "").strip()
+    if sa:
+        base = [o for o in base if o.strip() != sa]
+    base.append("None of the above")
+    # stable unique
+    seen = set()
+    out = []
+    for o in base:
+        if o not in seen:
+            seen.add(o)
+            out.append(o)
+    return out
diff --git a/backend/api/screen/prompts.py b/backend/api/screen/prompts.py
index ba7ac9d5..97861767 100644
--- a/backend/api/screen/prompts.py
+++ b/backend/api/screen/prompts.py
@@ -72,4 +72,149 @@
 - Use sentence indices from the numbered full text for "evidence_sentences"
 - Use table numbers from the Tables section for "evidence_tables"
 - Use figure numbers from the Figures section for "evidence_figures"
+"""
+
+
+# ---------------------------------------------------------------------------
+# Agentic screening (GREP-Agent style) prompt contracts
+# ---------------------------------------------------------------------------
+
+# NOTE:
+# CAN-SR historically used JSON output for screening. The agentic plan expects
+# XML-tag parsing (<answer>, <confidence>, <rationale>) so we can reuse a stable
+# parsing contract across screening + critical steps.
+
+PROMPT_XML_TEMPLATE_TA = """
+You are a highly critical, helpful scientific evaluator completing an academic review.
+
+Task:
+Answer the question "{question}" for the following citation. 
+
+Citation:
+{cit}
+
+Choose EXACTLY ONE of these options (exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
+"""
+
+
+PROMPT_XML_TEMPLATE_TA_CRITICAL = """
+You are a critical reviewer double-checking another model's screening answer.
+
+Original question:
+"{question}"
+
+Citation:
+{cit}
+
+The first model answered:
+"{screening_answer}"
+
+Now, you MUST choose from the following forced alternatives.
+Rules:
+- You are NOT allowed to choose the original answer.
+- If you agree with the original answer, choose "None of the above".
+
+Forced alternatives (choose exactly one; exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
+"""
+
+
+PROMPT_XML_TEMPLATE_FULLTEXT = """
+You are assisting with a scientific full-text screening task.
+
+Task:
+Evaluate the question "{question}" against the paper content provided as numbered sentences (e.g., "[0] ...", "[1] ...").
+
+Choose EXACTLY ONE of these options (exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Full text (numbered sentences):
+{fulltext}
+
+Tables (numbered):
+{tables}
+
+Figures (numbered; captions correspond to images provided alongside this message):
+{figures}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
+"""
+
+
+PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL = """
+You are a critical reviewer double-checking another model's full-text screening answer. 
+
+Original question:
+"{question}"
+
+The first model answered:
+"{screening_answer}"
+
+Now, you MUST choose from the following forced alternatives.
+Rules:
+- You are NOT allowed to choose the original answer.
+- If you agree with the original answer, choose "None of the above".
+
+Forced alternatives (choose exactly one; exact text):
+{options}
+
+Additional guidance:
+{xtra}
+
+Full text (numbered sentences):
+{fulltext}
+
+Tables (numbered):
+{tables}
+
+Figures (numbered; captions correspond to images provided alongside this message):
+{figures}
+
+Output requirement:
+Return ONLY the following XML tags (no Markdown, no extra prose):
+<answer>...</answer>
+<confidence>...</confidence>
+<rationale>...</rationale>
+
+Confidence requirements:
+- confidence is a float between 0 and 1
+- be conservative; do not overestimate confidence
 """
\ No newline at end of file
diff --git a/backend/api/screen/router.py b/backend/api/screen/router.py
index 10174b82..400ea9b0 100644
--- a/backend/api/screen/router.py
+++ b/backend/api/screen/router.py
@@ -19,13 +19,28 @@
 # Import consolidated Postgres helpers if available (optional)
 from ..services.cit_db_service import cits_dp_service, snake_case_column, snake_case
-from .prompts import PROMPT_JSON_TEMPLATE, PROMPT_JSON_TEMPLATE_FULLTEXT
+from .prompts import (
+    PROMPT_JSON_TEMPLATE,
+    PROMPT_JSON_TEMPLATE_FULLTEXT,
+    PROMPT_XML_TEMPLATE_TA,
+    PROMPT_XML_TEMPLATE_TA_CRITICAL,
+    PROMPT_XML_TEMPLATE_FULLTEXT,
+    PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL,
+)
+from .agentic_utils import build_critical_options, parse_agent_xml, resolve_option
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter()
 
 
+class AgentRunsQueryResponse(BaseModel):
+    sr_id: str
+    pipeline: str
+    citation_ids: List[int]
+    runs: List[Dict[str, Any]]
+
+
 def _normalize_int_list(v: Any) -> List[int]:
     if v is None:
         return []
@@ -85,6 +100,31 @@ class HumanClassifyRequest(BaseModel):
     explanation: Optional[str] = Field("", description="Optional free-text explanation from the human reviewer")
     confidence: Optional[float] = 
Field(None, ge=0.0, le=1.0, description="Optional confidence (0.0 - 1.0)") reviewer: Optional[str] = Field(None, description="Optional reviewer id or name") + + +class TitleAbstractRunRequest(BaseModel): + sr_id: str = Field(..., description="Systematic review id") + citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") + model: Optional[str] = Field(None, description="Model key/deployment to use") + temperature: float = Field(0.0, ge=0.0, le=1.0) + max_tokens: int = Field(1200, ge=64, le=4000) + prompt_version: Optional[str] = Field("v1", description="Prompt version tag for auditing") + + +class ValidateStepRequest(BaseModel): + sr_id: str = Field(..., description="Systematic review id") + citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") + step: str = Field("l1", description="Validation step: l1|l2|parameters") + + +class FulltextRunRequest(BaseModel): + sr_id: str = Field(..., description="Systematic review id") + citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") + model: Optional[str] = Field(None, description="Model key/deployment to use") + temperature: float = Field(0.0, ge=0.0, le=1.0) + max_tokens: int = Field(2000, ge=64, le=4000) + prompt_version: Optional[str] = Field("v1", description="Prompt version tag for auditing") + # _update_sync moved to backend.api.core.postgres.update_jsonb_column # Use run_in_threadpool(update_jsonb_column, ...) where needed. @@ -397,6 +437,630 @@ async def human_classify_citation( return {"status": "success", "sr_id": sr_id, "citation_id": citation_id, "column": col_name, "classification": classification_json} + +@router.post("/title-abstract/run") +async def run_title_abstract_agentic( + payload: TitleAbstractRunRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Run orchestrated Title/Abstract screening + critical for one citation. 
+ + Implements Phase 1 MVP endpoint from planning/agentic_implementation_plan. + """ + + sr_id = str(payload.sr_id) + citation_id = int(payload.citation_id) + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to load systematic review or screening: {e}", + ) + + table_name = (screening or {}).get("table_name") or "citations" + + # Ensure LLM client is available + if not azure_openai_client.is_configured(): + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Azure OpenAI client is not configured on the server", + ) + + # Load citation row + try: + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to query screening DB: {e}") + + if not row: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found") + + # Build combined citation text (use SR include columns or fallback to title+abstract) + include_cols = [] + try: + include_cols = cits_dp_service.load_include_columns_from_criteria(sr) or [] + except Exception: + include_cols = [] + if not include_cols: + include_cols = ["title", "abstract"] + + citation_text = citations_router._build_combined_citation_from_row(row, include_cols) + + # Load L1 criteria + cp = sr.get("criteria_parsed") or sr.get("criteria") or {} + l1 = cp.get("l1") if isinstance(cp, dict) else None + questions = (l1 or {}).get("questions") if isinstance(l1, dict) else [] + possible = (l1 or {}).get("possible_answers") if isinstance(l1, dict) else [] + addinfos = (l1 or {}).get("additional_infos") if isinstance(l1, dict) else [] + 
questions = questions if isinstance(questions, list) else [] + possible = possible if isinstance(possible, list) else [] + addinfos = addinfos if isinstance(addinfos, list) else [] + + if not questions: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="SR has no L1 criteria questions configured") + + async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: + """Return (content, usage, latency_ms).""" + import time + + t0 = time.time() + messages = [{"role": "user", "content": prompt}] + resp = await azure_openai_client.chat_completion( + messages=messages, + model=payload.model, + max_tokens=payload.max_tokens, + temperature=payload.temperature, + stream=False, + ) + latency_ms = int((time.time() - t0) * 1000) + content = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content") or "" + usage = resp.get("usage") or {} + return str(content), dict(usage), latency_ms + + results: List[Dict[str, Any]] = [] + user_email = str(current_user.get("email") or current_user.get("id") or "") + + for i, q in enumerate(questions): + if not isinstance(q, str) or not q.strip(): + continue + + opts = possible[i] if i < len(possible) and isinstance(possible[i], list) else [] + opts = [str(o) for o in opts if o is not None and str(o).strip()] + xtra = addinfos[i] if i < len(addinfos) and isinstance(addinfos[i], str) else "" + + if not opts: + # still return shape to UI + results.append( + { + "question": q, + "criterion_key": snake_case(q, max_len=56), + "error": "No options configured", + } + ) + continue + + options_listed = "\n".join(opts) + criterion_key = snake_case(q, max_len=56) + + # 1) screening + screening_prompt = PROMPT_XML_TEMPLATE_TA.format( + question=q, + cit=citation_text, + options=options_listed, + xtra=xtra or "", + ) + screening_raw, screening_usage, screening_latency = await _call_llm(screening_prompt) + screening_parsed = parse_agent_xml(screening_raw) + screening_answer = resolve_option(screening_parsed.answer, 
opts) + + try: + screening_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": screening_raw, + "model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": screening_latency, + "input_tokens": screening_usage.get("prompt_tokens"), + "output_tokens": screening_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") + + # 2) critical + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = "\n".join(critical_opts) + critical_prompt = PROMPT_XML_TEMPLATE_TA_CRITICAL.format( + question=q, + cit=citation_text, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + ) + critical_raw, critical_usage, critical_latency = await _call_llm(critical_prompt) + critical_parsed = parse_agent_xml(critical_raw) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + + disagrees = str(critical_answer).strip() != "None of the above" + + try: + critical_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": critical_raw, + 
"model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": critical_latency, + "input_tokens": critical_usage.get("prompt_tokens"), + "output_tokens": critical_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist critical run: {e}") + + results.append( + { + "question": q, + "criterion_key": criterion_key, + "screening": { + "run_id": screening_run_id, + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "parse_ok": screening_parsed.parse_ok, + }, + "critical": { + "run_id": critical_run_id, + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "parse_ok": critical_parsed.parse_ok, + "disagrees": disagrees, + }, + } + ) + + return { + "status": "success", + "sr_id": sr_id, + "citation_id": citation_id, + "pipeline": "title_abstract", + "criteria": results, + } + + +@router.post("/validate") +async def validate_screening_step( + payload: ValidateStepRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Mark a citation as validated for a given step. + + Phase 1 MVP uses step=l1 (Title/Abstract). This endpoint is written to be + forward-compatible with l2/parameters. 
+ """ + + sr_id = str(payload.sr_id) + citation_id = int(payload.citation_id) + step = (payload.step or "l1").lower().strip() + + if step not in {"l1", "l2", "parameters"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be one of: l1, l2, parameters") + + try: + _sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + validated_by_col = f"{step}_validated_by" + validated_at_col = f"{step}_validated_at" + validated_by = str(current_user.get("email") or current_user.get("id") or "") + now_iso = datetime.utcnow().isoformat() + "Z" + + try: + # Ensure columns exist (best-effort; no-migrations philosophy) + await run_in_threadpool(cits_dp_service.create_column, validated_by_col, "TEXT", table_name) + await run_in_threadpool(cits_dp_service.create_column, validated_at_col, "TIMESTAMPTZ", table_name) + + u1 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_by_col, validated_by, table_name) + u2 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_at_col, now_iso, table_name) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to update validation fields: {e}") + + if not (u1 and u2): + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found to update") + + return { + "status": "success", + "sr_id": sr_id, + "citation_id": citation_id, + "step": step, + "validated_by": validated_by, + "validated_at": now_iso, + } + + +@router.post("/fulltext/run") +async def run_fulltext_agentic( + payload: FulltextRunRequest, + 
current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Run orchestrated Fulltext screening + critical for one citation (L2).""" + + sr_id = str(payload.sr_id) + citation_id = int(payload.citation_id) + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + if not azure_openai_client.is_configured(): + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Azure OpenAI client is not configured on the server", + ) + + # Load citation row + try: + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to query screening DB: {e}") + + if not row: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found") + + # Ensure fulltext exists (CAN-SR source of truth: extracted DI/Grobid artifacts) + if not row.get("fulltext"): + # We don't have a direct SR id in the extract endpoint signature; it expects sr_id. + # We'll try best-effort to trigger extraction if fulltext_url exists. 
+ try: + from ..extract.router import extract_fulltext_from_storage + + await extract_fulltext_from_storage(sr_id, citation_id, current_user=current_user) # type: ignore + except Exception: + pass + + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + + include_cols = [] + try: + include_cols = cits_dp_service.load_include_columns_from_criteria(sr) or [] + except Exception: + include_cols = [] + if not include_cols: + include_cols = ["title", "abstract"] + + citation_text = citations_router._build_combined_citation_from_row(row or {}, include_cols) + fulltext = (row or {}).get("fulltext") or citation_text + + # Tables/Figures context from row + tables_md_lines: List[str] = [] + figures_lines: List[str] = [] + images: List[Tuple[bytes, str]] = [] + + ft_tables = (row or {}).get("fulltext_tables") + if isinstance(ft_tables, str): + try: + ft_tables = json.loads(ft_tables) + except Exception: + ft_tables = None + if isinstance(ft_tables, list): + for item in ft_tables: + if not isinstance(item, dict): + continue + idx = item.get("index") + blob_addr = item.get("blob_address") + caption = item.get("caption") + if not idx or not blob_addr: + continue + try: + md_bytes, _ = await storage_service.get_bytes_by_path(blob_addr) + md_txt = md_bytes.decode("utf-8", errors="replace") + header = f"Table [T{idx}]" + (f" caption: {caption}" if caption else "") + tables_md_lines.extend([header, md_txt, ""]) + except Exception: + continue + + ft_figs = (row or {}).get("fulltext_figures") + if isinstance(ft_figs, str): + try: + ft_figs = json.loads(ft_figs) + except Exception: + ft_figs = None + if isinstance(ft_figs, list): + for item in ft_figs: + if not isinstance(item, dict): + continue + idx = item.get("index") + blob_addr = item.get("blob_address") + caption = item.get("caption") + if not idx or not blob_addr: + continue + figures_lines.append(f"Figure [F{idx}] caption: {caption or '(no caption)'} (see attached image F{idx})") + try: + 
img_bytes, _ = await storage_service.get_bytes_by_path(blob_addr) + if img_bytes: + images.append((img_bytes, "image/png")) + except Exception: + continue + + # Load L2 criteria + cp = sr.get("criteria_parsed") or sr.get("criteria") or {} + l2 = cp.get("l2") if isinstance(cp, dict) else None + questions = (l2 or {}).get("questions") if isinstance(l2, dict) else [] + possible = (l2 or {}).get("possible_answers") if isinstance(l2, dict) else [] + addinfos = (l2 or {}).get("additional_infos") if isinstance(l2, dict) else [] + questions = questions if isinstance(questions, list) else [] + possible = possible if isinstance(possible, list) else [] + addinfos = addinfos if isinstance(addinfos, list) else [] + + if not questions: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="SR has no L2 criteria questions configured") + + async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: + import time + + t0 = time.time() + # Use multimodal API when we have figure images + if images: + content = await azure_openai_client.multimodal_chat( + user_text=prompt, + images=images, + system_prompt=None, + model=payload.model, + max_tokens=payload.max_tokens, + temperature=payload.temperature, + ) + latency_ms = int((time.time() - t0) * 1000) + # multimodal_chat does not expose usage + return str(content), {}, latency_ms + + messages = [{"role": "user", "content": prompt}] + resp = await azure_openai_client.chat_completion( + messages=messages, + model=payload.model, + max_tokens=payload.max_tokens, + temperature=payload.temperature, + stream=False, + ) + latency_ms = int((time.time() - t0) * 1000) + content = ((resp.get("choices") or [{}])[0].get("message") or {}).get("content") or "" + usage = resp.get("usage") or {} + return str(content), dict(usage), latency_ms + + results: List[Dict[str, Any]] = [] + + for i, q in enumerate(questions): + if not isinstance(q, str) or not q.strip(): + continue + + opts = possible[i] if i < len(possible) and 
isinstance(possible[i], list) else [] + opts = [str(o) for o in opts if o is not None and str(o).strip()] + xtra = addinfos[i] if i < len(addinfos) and isinstance(addinfos[i], str) else "" + + if not opts: + results.append({"question": q, "criterion_key": snake_case(q, max_len=56), "error": "No options configured"}) + continue + + criterion_key = snake_case(q, max_len=56) + options_listed = "\n".join(opts) + + # 1) screening + screening_prompt = PROMPT_XML_TEMPLATE_FULLTEXT.format( + question=q, + options=options_listed, + xtra=xtra or "", + fulltext=fulltext, + tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", + figures="\n".join(figures_lines) if figures_lines else "(none)", + ) + screening_raw, screening_usage, screening_latency = await _call_llm(screening_prompt) + screening_parsed = parse_agent_xml(screening_raw) + screening_answer = resolve_option(screening_parsed.answer, opts) + + try: + screening_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": screening_raw, + "model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": screening_latency, + "input_tokens": screening_usage.get("prompt_tokens"), + "output_tokens": screening_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") + + # 2) critical + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = 
"\n".join(critical_opts) + critical_prompt = PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL.format( + question=q, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + fulltext=fulltext, + tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", + figures="\n".join(figures_lines) if figures_lines else "(none)", + ) + critical_raw, critical_usage, critical_latency = await _call_llm(critical_prompt) + critical_parsed = parse_agent_xml(critical_raw) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + disagrees = str(critical_answer).strip() != "None of the above" + + try: + critical_run_id = await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr_id, + "table_name": table_name, + "citation_id": citation_id, + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": critical_raw, + "model": payload.model, + "prompt_version": payload.prompt_version, + "temperature": payload.temperature, + "latency_ms": critical_latency, + "input_tokens": critical_usage.get("prompt_tokens"), + "output_tokens": critical_usage.get("completion_tokens"), + }, + ) + except RuntimeError as rexc: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist critical run: {e}") + + results.append( + { + "question": q, + "criterion_key": criterion_key, + "screening": { + "run_id": screening_run_id, + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "parse_ok": screening_parsed.parse_ok, + }, + "critical": { + "run_id": critical_run_id, + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": 
critical_parsed.rationale, + "parse_ok": critical_parsed.parse_ok, + "disagrees": disagrees, + }, + } + ) + + return { + "status": "success", + "sr_id": sr_id, + "citation_id": citation_id, + "pipeline": "fulltext", + "criteria": results, + } + + +@router.get("/agent-runs/latest", response_model=AgentRunsQueryResponse) +async def get_latest_agent_runs( + sr_id: str, + pipeline: str, + citation_ids: str, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Fetch latest screening_agent_runs for a set of citations. + + Query params: + - sr_id: SR id + - pipeline: title_abstract | fulltext + - citation_ids: comma-separated citation ids + """ + + pipeline_norm = (pipeline or "").strip().lower() + if pipeline_norm in {"ta", "titleabstract", "title-abstract"}: + pipeline_norm = "title_abstract" + if pipeline_norm not in {"title_abstract", "fulltext"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="pipeline must be 'title_abstract' or 'fulltext'") + + raw_ids = [p.strip() for p in (citation_ids or "").split(",") if p.strip()] + if not raw_ids: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="citation_ids is required") + parsed_ids: List[int] = [] + for p in raw_ids: + try: + parsed_ids.append(int(p)) + except Exception: + continue + if not parsed_ids: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="citation_ids must be a comma-separated list of integers") + + try: + _sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + try: + rows = await run_in_threadpool( + cits_dp_service.list_latest_agent_runs, + sr_id=sr_id, + table_name=table_name, + citation_ids=parsed_ids, + pipeline=pipeline_norm, + ) + except RuntimeError as rexc: + 
raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to query screening_agent_runs: {e}") + + return AgentRunsQueryResponse(sr_id=sr_id, pipeline=pipeline_norm, citation_ids=parsed_ids, runs=rows) + async def update_inclusion_decision( sr: Dict[str, Any], citation_id: int, diff --git a/backend/api/services/cit_db_service.py b/backend/api/services/cit_db_service.py index 85371835..1e3f28e4 100644 --- a/backend/api/services/cit_db_service.py +++ b/backend/api/services/cit_db_service.py @@ -13,8 +13,15 @@ can surface a 503 with an actionable message. """ from typing import Any, Dict, List, Optional, Tuple -import psycopg2 -import psycopg2.extras + +# psycopg2 is optional in some deploy/test contexts. +# Per module docstring contract: methods should raise RuntimeError when psycopg2 +# is unavailable so routers can surface a 503. +try: + import psycopg2 # type: ignore + import psycopg2.extras # type: ignore +except Exception: # pragma: no cover + psycopg2 = None import json import re import os @@ -22,6 +29,8 @@ import csv import urllib.parse as up import hashlib +from datetime import datetime +import uuid # Local settings import (for POSTGRES_ADMIN_DSN / DATABASE_URL usage) try: @@ -145,6 +154,297 @@ def __init__(self): # nothing stateful for now; keep class for ergonomics and easier testing pass + def _require_psycopg2(self) -> None: + if psycopg2 is None: + raise RuntimeError( + "psycopg2 is not installed. Install backend dependencies (requirements.txt) " + "or run with the docker backend image." + ) + + # ----------------------- + # Schema helpers + # ----------------------- + def table_exists(self, table_name: str = "citations") -> bool: + """Return True if a public table exists. + + NOTE: We intentionally use runtime schema evolution (ALTER TABLE ...) 
+ throughout CAN-SR, so callers need a safe way to check existence before + attempting to add columns. + """ + table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute( + """ + SELECT 1 + FROM information_schema.tables + WHERE table_schema = 'public' AND table_name = %s + LIMIT 1 + """, + (table_name,), + ) + return cur.fetchone() is not None + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def ensure_step_validation_columns(self, table_name: str = "citations") -> None: + """Ensure step-level validation columns exist for a screening table. + + CAN-SR uses per-upload screening tables, so we create these columns on + those tables (not just a single shared citations table). + + This is intentionally NOT backwards-compatible: it will eagerly add the + columns to whatever table is passed. + """ + if not self.table_exists(table_name): + return + + # L1 (Title/Abstract) + self.create_column("l1_validated_by", "TEXT", table_name=table_name) + self.create_column("l1_validated_at", "TIMESTAMPTZ", table_name=table_name) + + # L2 (Full Text) + self.create_column("l2_validated_by", "TEXT", table_name=table_name) + self.create_column("l2_validated_at", "TIMESTAMPTZ", table_name=table_name) + + # Parameters / extraction + self.create_column("parameters_validated_by", "TEXT", table_name=table_name) + self.create_column("parameters_validated_at", "TIMESTAMPTZ", table_name=table_name) + + def ensure_screening_agent_runs_table(self) -> None: + """Ensure the normalized agent-run storage table exists. + + We keep it in the shared Postgres DB (public schema). Because CAN-SR uses + per-upload screening tables (each with its own id sequence), we store + both the `sr_id` and the screening `table_name` alongside `citation_id`. 
+ """ + conn = None + try: + self._require_psycopg2() + conn = postgres_server.conn + cur = conn.cursor() + + cur.execute( + """ + CREATE TABLE IF NOT EXISTS screening_agent_runs ( + id TEXT PRIMARY KEY, + sr_id TEXT NOT NULL, + table_name TEXT NOT NULL, + citation_id INT NOT NULL, + pipeline TEXT NOT NULL, + criterion_key TEXT NOT NULL, + stage TEXT NOT NULL, + answer TEXT, + confidence DOUBLE PRECISION, + rationale TEXT, + raw_response TEXT, + model TEXT, + prompt_version TEXT, + temperature DOUBLE PRECISION, + top_p DOUBLE PRECISION, + seed INT, + latency_ms INT, + input_tokens INT, + output_tokens INT, + cost_usd DOUBLE PRECISION, + created_at TIMESTAMPTZ DEFAULT now() + ) + """ + ) + + # A couple of pragmatic indexes for common lookups. + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_screening_agent_runs_citation + ON screening_agent_runs (sr_id, table_name, citation_id, pipeline) + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_screening_agent_runs_criterion + ON screening_agent_runs (sr_id, pipeline, criterion_key, stage) + """ + ) + + conn.commit() + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def ensure_agentic_screening_schema(self) -> None: + """One-call bootstrap for agentic screening. + + This is safe to call at startup (creates only global tables), and can + also be called by endpoints before use. + """ + self.ensure_screening_agent_runs_table() + + # ----------------------- + # Agent-run persistence + # ----------------------- + def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: + """Insert a single screening_agent_runs row. + + Expected keys (most optional): + - sr_id, table_name, citation_id, pipeline, criterion_key, stage + - answer, confidence, rationale, raw_response + - model, prompt_version, temperature, top_p, seed + - latency_ms, input_tokens, output_tokens, cost_usd + + Returns the generated run id. 
+ """ + self._require_psycopg2() + self.ensure_screening_agent_runs_table() + + run_id = str(run.get("id") or uuid.uuid4()) + sr_id = str(run.get("sr_id") or "") + table_name = str(run.get("table_name") or "") + citation_id = int(run.get("citation_id") or 0) + pipeline = str(run.get("pipeline") or "") + criterion_key = str(run.get("criterion_key") or "") + stage = str(run.get("stage") or "") + + if not (sr_id and table_name and citation_id and pipeline and criterion_key and stage): + raise ValueError("insert_screening_agent_run missing required fields") + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute( + """ + INSERT INTO screening_agent_runs ( + id, sr_id, table_name, citation_id, pipeline, criterion_key, stage, + answer, confidence, rationale, raw_response, + model, prompt_version, temperature, top_p, seed, + latency_ms, input_tokens, output_tokens, cost_usd, created_at + ) VALUES ( + %s, %s, %s, %s, %s, %s, %s, + %s, %s, %s, %s, + %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s + ) + """, + ( + run_id, + sr_id, + table_name, + citation_id, + pipeline, + criterion_key, + stage, + run.get("answer"), + run.get("confidence"), + run.get("rationale"), + run.get("raw_response"), + run.get("model"), + run.get("prompt_version"), + run.get("temperature"), + run.get("top_p"), + run.get("seed"), + run.get("latency_ms"), + run.get("input_tokens"), + run.get("output_tokens"), + run.get("cost_usd"), + run.get("created_at") or datetime.utcnow().isoformat() + "Z", + ), + ) + conn.commit() + return run_id + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def list_latest_agent_runs( + self, + *, + sr_id: str, + table_name: str, + citation_ids: List[int], + pipeline: str, + ) -> List[Dict[str, Any]]: + """Return latest agent runs per (citation_id, criterion_key, stage) for a set of citations. + + This is designed for list pages where we need to compute "needs validation" + without loading full raw responses. 
+ """ + self._require_psycopg2() + self.ensure_screening_agent_runs_table() + + sr_id = str(sr_id or "") + table_name = str(table_name or "") + pipeline = str(pipeline or "") + + ids: List[int] = [] + for i in citation_ids or []: + try: + ids.append(int(i)) + except Exception: + continue + if not (sr_id and table_name and pipeline and ids): + return [] + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + # DISTINCT ON picks the first row per group according to ORDER BY. + cur.execute( + """ + SELECT DISTINCT ON (citation_id, criterion_key, stage) + id, + sr_id, + table_name, + citation_id, + pipeline, + criterion_key, + stage, + answer, + confidence, + rationale, + model, + prompt_version, + temperature, + top_p, + seed, + latency_ms, + input_tokens, + output_tokens, + cost_usd, + created_at + FROM screening_agent_runs + WHERE sr_id = %s + AND table_name = %s + AND pipeline = %s + AND citation_id = ANY(%s) + ORDER BY citation_id, criterion_key, stage, created_at DESC + """, + (sr_id, table_name, pipeline, ids), + ) + + rows = cur.fetchall() or [] + return [dict(r) for r in rows if r] + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + # ----------------------- # Low level connection helpers # ----------------------- @@ -160,6 +460,7 @@ def create_column(self, col: str, col_type: str, table_name: str = "citations") col_type is the SQL type (e.g. TEXT, JSONB). """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -193,6 +494,7 @@ def update_jsonb_column( Update a JSONB column for a citation. Creates the column if needed. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -227,6 +529,7 @@ def update_text_column( Update a TEXT column for a citation. Creates the column if needed. 
""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -259,6 +562,7 @@ def update_bool_column( ) -> int: """Update a BOOLEAN column for a citation. Creates the column if needed.""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -284,6 +588,7 @@ def update_bool_column( def get_table_columns(self, table_name: str = "citations") -> List[Dict[str, str]]: """Return [{name, data_type, udt_name}] for table columns ordered by ordinal_position.""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -368,6 +673,7 @@ def copy_jsonb_if_empty( Intended for auto-filling human_* from llm_* while never overwriting. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -407,6 +713,7 @@ def dump_citations_csv(self, table_name: str = "citations") -> bytes: Uses Postgres COPY for correctness and performance. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -439,6 +746,7 @@ def dump_citations_csv_filtered(self, table_name: str = "citations") -> bytes: explicit scalar columns (selected/explanation/confidence/found/value/...). """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() # 1) Determine columns to export cols_meta = self.get_table_columns(table_name) @@ -596,6 +904,7 @@ def get_citation_by_id(self, citation_id: int, table_name: str = "citations") -> Return a dict mapping column -> value for the citation row, or None. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -638,6 +947,7 @@ def get_citations_by_ids( List[dict] rows. 
Missing ids are omitted. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() ids: List[int] = [] for i in citation_ids or []: try: @@ -691,6 +1001,7 @@ def backfill_human_decisions(self, criteria_parsed: Dict[str, Any], table_name: - undecided: any question missing/unanswered """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() cp = criteria_parsed or {} l1_qs = (cp.get("l1") or {}).get("questions") if isinstance(cp.get("l1"), dict) else None @@ -815,6 +1126,7 @@ def list_citation_ids(self, filter_step=None, table_name: str = "citations") -> Return list of integer primary keys (id) from citations table ordered by id. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -862,6 +1174,7 @@ def list_fulltext_urls(self, table_name: str = "citations") -> List[str]: Return list of fulltext_url values (non-null) from citations table. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -898,6 +1211,7 @@ def attach_fulltext( Creates columns if necessary. Returns rows modified (0/1). """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() # create columns if missing self.create_column("fulltext_url", "TEXT", table_name=table_name) # compute md5 @@ -929,6 +1243,7 @@ def get_column_value(self, citation_id: int, column: str, table_name: str = "cit Return the value stored in `column` for the citation row (or None). 
""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -968,6 +1283,7 @@ def set_column_value(self, citation_id: int, column: str, value: Any, table_name def drop_table(self, table_name: str, cascade: bool = True) -> None: """Drop a screening table in the shared database.""" table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -994,6 +1310,7 @@ def create_table_and_insert_sync( is per-upload (e.g. sr___citations) inside the shared DB. """ table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() conn = None try: conn = postgres_server.conn @@ -1009,6 +1326,15 @@ def create_table_and_insert_sync( col_defs.append('"fulltext_url" TEXT') col_defs.append('"fulltext" TEXT') col_defs.append('"fulltext_md5" TEXT') + + # Step-level validation fields (agentic screening plan) + col_defs.append('"l1_validated_by" TEXT') + col_defs.append('"l1_validated_at" TIMESTAMP WITH TIME ZONE') + col_defs.append('"l2_validated_by" TEXT') + col_defs.append('"l2_validated_at" TIMESTAMP WITH TIME ZONE') + col_defs.append('"parameters_validated_by" TEXT') + col_defs.append('"parameters_validated_at" TIMESTAMP WITH TIME ZONE') + col_defs.append('"created_at" TIMESTAMP WITH TIME ZONE DEFAULT now()') cols_sql = ", ".join(col_defs) diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml index 3cca1cbd..1a8567a7 100644 --- a/backend/docker-compose.yml +++ b/backend/docker-compose.yml @@ -47,6 +47,9 @@ services: # POSTGRESQL - Database (Citations & Systematic Reviews) # ============================================================================= pgdb-service: + # IMPORTANT: pin to a major version. + # Using `postgres` (latest) can auto-upgrade across major versions (e.g., 16 -> 18) + # and break existing on-disk data without a pg_upgrade/backup-restore. 
image: postgres container_name: pgdb-service restart: unless-stopped @@ -57,7 +60,7 @@ services: ports: - "5432:5432" volumes: - - ./volumes/postgres:/var/lib/postgresql/data + - ./volumes/postgres:/var/lib/postgresql healthcheck: test: ["CMD-SHELL", "pg_isready -U admin -d postgres -h localhost"] interval: 30s diff --git a/backend/main.py b/backend/main.py index 606dc55f..379553c5 100644 --- a/backend/main.py +++ b/backend/main.py @@ -14,6 +14,7 @@ from api.core.config import settings from api.services.sr_db_service import srdb_service from api.services.user_db import user_db_service +from api.services.cit_db_service import cits_dp_service app = FastAPI( @@ -45,6 +46,15 @@ async def startup_event(): except Exception as e: print(f"⚠️ Failed to ensure SR table exists: {e}", flush=True) + # Agentic screening schema bootstrap (no migrations; runtime schema evolution) + try: + print("🤖 Ensuring agentic screening tables...", flush=True) + await run_in_threadpool(cits_dp_service.ensure_agentic_screening_schema) + print("✓ Agentic screening tables initialized", flush=True) + except Exception as e: + # Do not fail startup; allow deployments without Postgres / in degraded mode. 
+ print(f"⚠️ Failed to ensure agentic screening tables: {e}", flush=True) + # Procrastinate schema + run-all job tables try: from api.jobs.procrastinate_app import ( diff --git a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx index 75721fb6..bb4dd22f 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx @@ -60,6 +60,16 @@ type CriteriaData = { possible_answers: string[][] } +type LatestAgentRun = { + citation_id: number + criterion_key: string + stage: 'screening' | 'critical' | string + answer?: string | null + confidence?: number | null + rationale?: string | null + created_at?: string +} + /* Main page component */ export default function CanSrL1ScreenPage() { const router = useRouter() @@ -91,6 +101,12 @@ export default function CanSrL1ScreenPage() { // Collapsible open state for LLM panels const [panelOpen, setPanelOpen] = useState>({}) + // Agentic runs (screening_agent_runs) for this citation + const [agentRuns, setAgentRuns] = useState([]) + const [loadingRuns, setLoadingRuns] = useState(false) + + const [validating, setValidating] = useState(false) + useEffect(() => { if (!srId || !citationId) { router.replace('/can-sr') @@ -159,6 +175,49 @@ export default function CanSrL1ScreenPage() { fetchCitationById(citationId) }, [srId, citationId]) + // Load latest agent runs for this citation (screening + critical per criterion) + useEffect(() => { + if (!srId || !citationId) return + const loadRuns = async () => { + setLoadingRuns(true) + try { + const headers = getAuthHeaders() + const res = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('title_abstract')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers }, + ) + const data = await res.json().catch(() => ({})) + if (res.ok && Array.isArray(data?.runs)) { + 
setAgentRuns(data.runs as LatestAgentRun[]) + } else { + setAgentRuns([]) + } + } catch { + setAgentRuns([]) + } finally { + setLoadingRuns(false) + } + } + loadRuns() + }, [srId, citationId]) + + const runsByCriterion = useMemo(() => { + const by: Record = {} + for (const r of agentRuns) { + const key = String((r as any)?.criterion_key || '') + if (!key) continue + if (!by[key]) by[key] = {} + const stage = String((r as any)?.stage || '') + if (stage === 'screening') by[key].screening = r + if (stage === 'critical') by[key].critical = r + } + return by + }, [agentRuns]) + // Load parsed criteria (L1) useEffect(() => { if (!srId) return @@ -438,6 +497,107 @@ export default function CanSrL1ScreenPage() { />
+ {/* Agentic summary + Validate */} +
+
+
+

Agentic results

+

+ Latest screening + critical runs per criterion. +

+
+
+ + + {citation?.l1_validated_by ? ( + + Validated by {String(citation.l1_validated_by)} + + ) : ( + Not validated + )} +
+
+ + {loadingRuns ? ( +
Loading agent runs…
+ ) : criteriaData?.questions?.length ? ( +
+ {criteriaData.questions.map((q, idx) => { + const criterionKey = q + ? q + .trim() + .toLowerCase() + .replace(/[^\w]+/g, '_') + .replace(/_+/g, '_') + .replace(/^_+|_+$/g, '') + .slice(0, 56) + : '' + + const r = runsByCriterion[criterionKey] || {} + const scr = r.screening + const crit = r.critical + + const critDisagrees = + crit && String((crit as any)?.answer || '').trim() !== '' && + String((crit as any)?.answer || '').trim() !== 'None of the above' + + return ( +
+
{q}
+
+
+
Screening
+
Answer: {String((scr as any)?.answer ?? '—')}
+
Confidence: {String((scr as any)?.confidence ?? '—')}
+
+
+
Critical
+
Answer: {String((crit as any)?.answer ?? '—')}
+
Confidence: {String((crit as any)?.confidence ?? '—')}
+ {critDisagrees ? ( +
Disagrees
+ ) : null} +
+
+
+ ) + })} +
+ ) : ( +
No criteria loaded yet.
+ )} +
+
{/* Workspace (left) */}
diff --git a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx index 914634da..fef5b4f6 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx @@ -63,6 +63,16 @@ type CriteriaData = { additional_infos?: (string | null)[] // optional per-question extra guidance when available } +type LatestAgentRun = { + citation_id: number + criterion_key: string + stage: 'screening' | 'critical' | string + answer?: string | null + confidence?: number | null + rationale?: string | null + created_at?: string +} + /* Main page component */ export default function CanSrL2ScreenViewPage() { const router = useRouter() @@ -98,6 +108,11 @@ export default function CanSrL2ScreenViewPage() { // Hint text from Title/Abstract screening for L1 questions const [hintByIndex, setHintByIndex] = useState>({}) + // Agentic runs (screening_agent_runs) for this citation + const [agentRuns, setAgentRuns] = useState([]) + const [loadingRuns, setLoadingRuns] = useState(false) + const [validating, setValidating] = useState(false) + // Fulltext PDF viewer linkage const [fulltextCoords, setFulltextCoords] = useState(null) const [fulltextPages, setFulltextPages] = useState<{ width: number; height: number }[] | null>(null) @@ -218,6 +233,49 @@ export default function CanSrL2ScreenViewPage() { fetchCitationById(citationId) }, [srId, citationId]) + // Load latest agent runs for this citation (screening + critical per criterion) + useEffect(() => { + if (!srId || !citationId) return + const loadRuns = async () => { + setLoadingRuns(true) + try { + const headers = getAuthHeaders() + const res = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('fulltext')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers }, + ) + const data = await res.json().catch(() => ({})) + if (res.ok && 
Array.isArray(data?.runs)) { + setAgentRuns(data.runs as LatestAgentRun[]) + } else { + setAgentRuns([]) + } + } catch { + setAgentRuns([]) + } finally { + setLoadingRuns(false) + } + } + loadRuns() + }, [srId, citationId]) + + const runsByCriterion = useMemo(() => { + const by: Record = {} + for (const r of agentRuns) { + const key = String((r as any)?.criterion_key || '') + if (!key) continue + if (!by[key]) by[key] = {} + const stage = String((r as any)?.stage || '') + if (stage === 'screening') by[key].screening = r + if (stage === 'critical') by[key].critical = r + } + return by + }, [agentRuns]) + // Load parsed criteria (L1 + L2 merged, L1 first) useEffect(() => { if (!srId) return @@ -616,6 +674,115 @@ export default function CanSrL2ScreenViewPage() { />
+ {/* Agentic summary + Validate */} +
+
+
+

Agentic results

+

+ Latest screening + critical runs for L2/fulltext per criterion. +

+
+
+ + + {citation?.l2_validated_by ? ( + + Validated by {String(citation.l2_validated_by)} + + ) : ( + Not validated + )} +
+
+ + {loadingRuns ? ( +
Loading agent runs…
+ ) : criteriaData?.questions?.length ? ( +
+ {criteriaData.questions + .map((q, idx) => ({ q, idx })) + .filter(({ idx }) => sourceFlags[idx] === 'l2') + .map(({ q, idx }) => { + const criterionKey = q + ? q + .trim() + .toLowerCase() + .replace(/[^\w]+/g, '_') + .replace(/_+/g, '_') + .replace(/^_+|_+$/g, '') + .slice(0, 56) + : '' + + const r = runsByCriterion[criterionKey] || {} + const scr = r.screening + const crit = r.critical + + const critDisagrees = + crit && + String((crit as any)?.answer || '').trim() !== '' && + String((crit as any)?.answer || '').trim() !== 'None of the above' + + return ( +
+
{q}
+
+
+
Screening
+
Answer: {String((scr as any)?.answer ?? '—')}
+
Confidence: {String((scr as any)?.confidence ?? '—')}
+
+
+
Critical
+
Answer: {String((crit as any)?.answer ?? '—')}
+
Confidence: {String((crit as any)?.confidence ?? '—')}
+ {critDisagrees ? ( +
Disagrees
+ ) : null} +
+
+
+ ) + })} +
+ ) : ( +
No criteria loaded yet.
+ )} +
+
{/* Workspace (left) */}
diff --git a/frontend/app/api/can-sr/screen/agent-runs/latest/route.ts b/frontend/app/api/can-sr/screen/agent-runs/latest/route.ts new file mode 100644 index 00000000..4fb1b1b9 --- /dev/null +++ b/frontend/app/api/can-sr/screen/agent-runs/latest/route.ts @@ -0,0 +1,67 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: GET /api/can-sr/screen/agent-runs/latest?sr_id=&pipeline=title_abstract&citation_ids=1,2,3 + * -> GET {BACKEND_URL}/api/screen/agent-runs/latest?sr_id=...&pipeline=...&citation_ids=... + */ + +export async function OPTIONS() { + return new Response(null, { + status: 204, + headers: { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET,OPTIONS', + 'Access-Control-Allow-Headers': 'Authorization, Content-Type', + }, + }) +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + const pipeline = params.get('pipeline') + const citationIds = params.get('citation_ids') + + if (!srId || !pipeline || !citationIds) { + return NextResponse.json( + { error: 'sr_id, pipeline, citation_ids are required' }, + { status: 400 }, + ) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const url = new URL(`${BACKEND_URL}/api/screen/agent-runs/latest`) + url.searchParams.set('sr_id', srId) + url.searchParams.set('pipeline', pipeline) + url.searchParams.set('citation_ids', citationIds) + + const res = await fetch(url.toString(), { + method: 'GET', + headers: { + Authorization: authHeader, + }, + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('Agent runs latest proxy GET error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/validate/route.ts b/frontend/app/api/can-sr/screen/validate/route.ts new file mode 100644 index 00000000..5ea0e153 --- /dev/null +++ b/frontend/app/api/can-sr/screen/validate/route.ts @@ -0,0 +1,44 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: POST /api/can-sr/screen/validate + * body: { sr_id, citation_id, step } + * -> POST {BACKEND_URL}/api/screen/validate + */ + +export async function POST(request: NextRequest) { + try { + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const body = await request.json().catch(() => ({})) + + const res = await fetch(`${BACKEND_URL}/api/screen/validate`, { + method: 'POST', + headers: { + Authorization: authHeader, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(body), + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('Validate proxy POST error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/components/can-sr/PagedList.tsx b/frontend/components/can-sr/PagedList.tsx index c0ab7a05..b0417b50 100644 --- a/frontend/components/can-sr/PagedList.tsx +++ b/frontend/components/can-sr/PagedList.tsx @@ -14,6 +14,15 @@ type CitationInfo = { pageview: string } +type LatestAgentRun = { + citation_id: number + criterion_key: string + stage: 'screening' | 'critical' | string + answer?: string | null + confidence?: number | null + created_at?: string +} + function getAuthHeaders(): Record { const token = getAuthToken() const tokenType = getTokenType() @@ -53,13 +62,19 @@ export default function PagedList({ ) const [showClassify, setShowClassify] = useState>({}) + // TA list controls + const [threshold, setThreshold] = useState(0.9) + const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + + const [latestRunsByCitation, setLatestRunsByCitation] = useState>({}) + const fileInputRefs = useRef>({}) // --- paging --- useEffect(() => { const lp = Math.max(1, Math.ceil((citationIds?.length || 0) / pageSize)) setLastpage(lp) - setpage((prev) => Math.min(Math.max(1, prev), lp)) + setpage((prev: number) => Math.min(Math.max(1, prev), lp)) }, [citationIds, pageSize]) useEffect(() => { @@ -112,12 +127,102 @@ export default function PagedList({ if (row?.fulltext_url) nextShow[id] = true } - setLlmClassified((prev) => ({ ...prev, ...nextLlm })) - setHumanVerified((prev) => ({ ...prev, ...nextHuman })) - setShowClassify((prev) => ({ ...prev, ...nextShow })) + setLlmClassified((prev: Record) => ({ ...prev, ...nextLlm })) + setHumanVerified((prev: Record) => ({ ...prev, ...nextHuman })) + setShowClassify((prev: Record) => ({ 
...prev, ...nextShow })) + + // Fetch latest agent runs for this page (L1=title_abstract, L2=fulltext) + try { + const shouldFetchRuns = (screeningStep === 'l1' || screeningStep === 'l2') && pageIds.length + if (shouldFetchRuns) { + const pipeline = screeningStep === 'l2' ? 'fulltext' : 'title_abstract' + const r2 = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent(srId)}&pipeline=${encodeURIComponent( + pipeline, + )}&citation_ids=${encodeURIComponent(pageIds.join(','))}`, + { method: 'GET', headers }, + ) + const j2 = await r2.json().catch(() => ({})) + if (r2.ok && Array.isArray(j2?.runs)) { + const grouped: Record = {} + for (const run of j2.runs as LatestAgentRun[]) { + const cid = Number((run as any)?.citation_id) + if (!Number.isFinite(cid)) continue + if (!grouped[cid]) grouped[cid] = [] + grouped[cid].push(run) + } + setLatestRunsByCitation((prev: Record) => ({ ...prev, ...grouped })) + } + } + } catch (e) { + // best-effort + } } fetchCitations() - }, [citationIds, page, pageSize, questions, srId]) + }, [citationIds, page, pageSize, questions, srId, screeningStep]) + + // Reset cached runs when switching steps (avoid mixing l1/l2 pipeline results) + useEffect(() => { + setLatestRunsByCitation({}) + }, [screeningStep]) + + const isValidatedForStep = (row: any): boolean => { + if (!row) return false + if (screeningStep === 'l1') return Boolean(row?.l1_validated_by) + if (screeningStep === 'l2') return Boolean(row?.l2_validated_by) + if (screeningStep === 'extract') return Boolean(row?.parameters_validated_by) + return false + } + + const computeNeedsValidation = (citationId: number, row: any): boolean => { + // If validated, it no longer “needs validation” + if (isValidatedForStep(row)) return false + + const runs = latestRunsByCitation[citationId] || [] + if (!runs.length) { + // No agent runs yet => should be in "unvalidated" but not necessarily "needs" + // We'll treat missing runs as "needs" so it's easy to find. 
+ return true + } + + // Group by criterion_key + const byKey: Record = {} + for (const r of runs) { + const key = String((r as any)?.criterion_key || '') + if (!key) continue + if (!byKey[key]) byKey[key] = [] + byKey[key].push(r) + } + + // Needs validation if ANY criterion is low confidence OR critical disagrees + for (const key of Object.keys(byKey)) { + const items = byKey[key] + const screening = items.find((x) => String((x as any)?.stage) === 'screening') + const critical = items.find((x) => String((x as any)?.stage) === 'critical') + + const conf = Number((screening as any)?.confidence) + if (Number.isFinite(conf) && conf < threshold) return true + + const criticalAns = String((critical as any)?.answer || '') + // In our critical prompt contract, agreement is "None of the above". + if (critical && criticalAns.trim() !== '' && criticalAns.trim() !== 'None of the above') return true + } + + return false + } + + const filteredCitationData = citationData.filter((row: any) => { + const id = Number(row?.id) + if (!Number.isFinite(id)) return false + const validated = isValidatedForStep(row) + const needs = computeNeedsValidation(id, row) + const unvalidated = !validated + if (filterMode === 'all') return true + if (filterMode === 'validated') return validated + if (filterMode === 'unvalidated') return unvalidated + if (filterMode === 'needs') return needs + return true + }) // NOTE: Previously we fetched each citation via /citations/get. // This is now replaced by a single /citations/batch call per page. 
@@ -156,7 +261,7 @@ export default function PagedList({ { method: 'POST', headers, body: JSON.stringify(bodyPayload) }, ) } - setLlmClassified((prev) => ({ ...prev, [id]: true })) + setLlmClassified((prev: Record) => ({ ...prev, [id]: true })) } const onChooseFile = (id: number) => { @@ -196,16 +301,56 @@ export default function PagedList({ { method: 'POST', headers, body: fd as any }, ) - setShowClassify((prev) => ({ ...prev, [id]: true })) + setShowClassify((prev: Record) => ({ ...prev, [id]: true })) } return (
+ {screeningStep === 'l1' || screeningStep === 'l2' ? ( +
+
+ + ) => { + const v = Number(e.target.value) + if (!Number.isFinite(v)) return + setThreshold(Math.max(0, Math.min(1, v))) + }} + className="w-24 rounded-md border border-gray-200 px-2 py-1 text-sm" + /> +
+ +
+ + +
+
+ ) : null} +
    - {citationData.map((data) => ( + {filteredCitationData.map((data: any) => (
  • Citation #{data.id}

    @@ -334,7 +479,7 @@ export default function PagedList({ setJumpPageInput(e.target.value)} + onChange={(e: React.ChangeEvent) => setJumpPageInput(e.target.value)} className="w-20 rounded-md border border-gray-200 px-2 py-1 text-sm" placeholder={String(page)} inputMode="numeric" diff --git a/frontend/package-lock.json b/frontend/package-lock.json index a8237aa6..ae2deb78 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1022,10 +1022,9 @@ } }, "node_modules/@next/env": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.6.tgz", - "integrity": "sha512-N1ySLuZjnAtN3kFnwhAwPvZah8RJxKasD7x1f8shFqhncnWZn4JMfg37diLNuoHsLAlrDfM3g4mawVdtAG8XLQ==", - "license": "MIT" + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/env/-/env-16.2.2.tgz", + "integrity": "sha512-LqSGz5+xGk9EL/iBDr2yo/CgNQV6cFsNhRR2xhSXYh7B/hb4nePCxlmDvGEKG30NMHDFf0raqSyOZiQrO7BkHQ==" }, "node_modules/@next/eslint-plugin-next": { "version": "15.5.9", @@ -1037,13 +1036,12 @@ } }, "node_modules/@next/swc-darwin-arm64": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.6.tgz", - "integrity": "sha512-wTzYulosJr/6nFnqGW7FrG3jfUUlEf8UjGA0/pyypJl42ExdVgC6xJgcXQ+V8QFn6niSG2Pb8+MIG1mZr2vczw==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.2.2.tgz", + "integrity": "sha512-B92G3ulrwmkDSEJEp9+XzGLex5wC1knrmCSIylyVeiAtCIfvEJYiN3v5kXPlYt5R4RFlsfO/v++aKV63Acrugg==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "darwin" @@ -1053,13 +1051,12 @@ } }, "node_modules/@next/swc-darwin-x64": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.6.tgz", - "integrity": "sha512-BLFPYPDO+MNJsiDWbeVzqvYd4NyuRrEYVB5k2N3JfWncuHAy2IVwMAOlVQDFjj+krkWzhY2apvmekMkfQR0CUQ==", + "version": "16.2.2", + "resolved": 
"https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.2.2.tgz", + "integrity": "sha512-7ZwSgNKJNQiwW0CKhNm9B1WS2L1Olc4B2XY0hPYCAL3epFnugMhuw5TMWzMilQ3QCZcCHoYm9NGWTHbr5REFxw==", "cpu": [ "x64" ], - "license": "MIT", "optional": true, "os": [ "darwin" @@ -1069,13 +1066,12 @@ } }, "node_modules/@next/swc-linux-arm64-gnu": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.6.tgz", - "integrity": "sha512-OJYkCd5pj/QloBvoEcJ2XiMnlJkRv9idWA/j0ugSuA34gMT6f5b7vOiCQHVRpvStoZUknhl6/UxOXL4OwtdaBw==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.2.2.tgz", + "integrity": "sha512-c3m8kBHMziMgo2fICOP/cd/5YlrxDU5YYjAJeQLyFsCqVF8xjOTH/QYG4a2u48CvvZZSj1eHQfBCbyh7kBr30Q==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1085,13 +1081,12 @@ } }, "node_modules/@next/swc-linux-arm64-musl": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.6.tgz", - "integrity": "sha512-S4J2v+8tT3NIO9u2q+S0G5KdvNDjXfAv06OhfOzNDaBn5rw84DGXWndOEB7d5/x852A20sW1M56vhC/tRVbccQ==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.2.2.tgz", + "integrity": "sha512-VKLuscm0P/mIfzt+SDdn2+8TNNJ7f0qfEkA+az7OqQbjzKdBxAHs0UvuiVoCtbwX+dqMEL9U54b5wQ/aN3dHeg==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1101,13 +1096,12 @@ } }, "node_modules/@next/swc-linux-x64-gnu": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.6.tgz", - "integrity": "sha512-2eEBDkFlMMNQnkTyPBhQOAyn2qMxyG2eE7GPH2WIDGEpEILcBPI/jdSv4t6xupSP+ot/jkfrCShLAa7+ZUPcJQ==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.2.2.tgz", + "integrity": 
"sha512-kU3OPHJq6sBUjOk7wc5zJ7/lipn8yGldMoAv4z67j6ov6Xo/JvzA7L7LCsyzzsXmgLEhk3Qkpwqaq/1+XpNR3g==", "cpu": [ "x64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1117,13 +1111,12 @@ } }, "node_modules/@next/swc-linux-x64-musl": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.6.tgz", - "integrity": "sha512-oicJwRlyOoZXVlxmIMaTq7f8pN9QNbdes0q2FXfRsPhfCi8n8JmOZJm5oo1pwDaFbnnD421rVU409M3evFbIqg==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.2.2.tgz", + "integrity": "sha512-CKXRILyErMtUftp+coGcZ38ZwE/Aqq45VMCcRLr2I4OXKrgxIBDXHnBgeX/UMil0S09i2JXaDL3Q+TN8D/cKmg==", "cpu": [ "x64" ], - "license": "MIT", "optional": true, "os": [ "linux" @@ -1133,13 +1126,12 @@ } }, "node_modules/@next/swc-win32-arm64-msvc": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.6.tgz", - "integrity": "sha512-gQmm8izDTPgs+DCWH22kcDmuUp7NyiJgEl18bcr8irXA5N2m2O+JQIr6f3ct42GOs9c0h8QF3L5SzIxcYAAXXw==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.2.2.tgz", + "integrity": "sha512-sS/jSk5VUoShUqINJFvNjVT7JfR5ORYj/+/ZpOYbbIohv/lQfduWnGAycq2wlknbOql2xOR0DoV0s6Xfcy49+g==", "cpu": [ "arm64" ], - "license": "MIT", "optional": true, "os": [ "win32" @@ -1149,13 +1141,12 @@ } }, "node_modules/@next/swc-win32-x64-msvc": { - "version": "16.1.6", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.6.tgz", - "integrity": "sha512-NRfO39AIrzBnixKbjuo2YiYhB6o9d8v/ymU9m/Xk8cyVk+k7XylniXkHwjs4s70wedVffc6bQNbufk5v0xEm0A==", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.2.2.tgz", + "integrity": "sha512-aHaKceJgdySReT7qeck5oShucxWRiiEuwCGK8HHALe6yZga8uyFpLkPgaRw3kkF04U7ROogL/suYCNt/+CuXGA==", "cpu": [ "x64" ], - 
"license": "MIT", "optional": true, "os": [ "win32" @@ -2823,11 +2814,10 @@ } }, "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", - "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.3.tgz", + "integrity": "sha512-MCV/fYJEbqx68aE58kv2cA/kiky1G8vux3OR6/jbS+jIMe/6fJWa0DTzJU7dqijOWYwHi1t29FlfYI9uytqlpA==", "dev": true, - "license": "MIT", "dependencies": { "balanced-match": "^1.0.0" } @@ -3587,11 +3577,10 @@ } }, "node_modules/brace-expansion": { - "version": "1.1.12", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", - "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", + "version": "1.1.13", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.13.tgz", + "integrity": "sha512-9ZLprWS6EENmhEOpjCYW2c8VkmOvckIJZfkr7rBW6dObmfgJ/L1GpSYW5Hpo9lDz4D1+n0Ckz8rU7FwHDQiG/w==", "dev": true, - "license": "MIT", "dependencies": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -4863,11 +4852,10 @@ } }, "node_modules/flatted": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.3.tgz", - "integrity": "sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==", - "dev": true, - "license": "ISC" + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", + "integrity": "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==", + "dev": true }, "node_modules/for-each": { "version": "0.3.5", @@ -7350,14 +7338,13 @@ } }, "node_modules/next": { - "version": "16.1.6", - "resolved": 
"https://registry.npmjs.org/next/-/next-16.1.6.tgz", - "integrity": "sha512-hkyRkcu5x/41KoqnROkfTm2pZVbKxvbZRuNvKXLRXxs3VfyO0WhY50TQS40EuKO9SW3rBj/sF3WbVwDACeMZyw==", - "license": "MIT", + "version": "16.2.2", + "resolved": "https://registry.npmjs.org/next/-/next-16.2.2.tgz", + "integrity": "sha512-i6AJdyVa4oQjyvX/6GeER8dpY/xlIV+4NMv/svykcLtURJSy/WzDnnUk/TM4d0uewFHK7xSQz4TbIwPgjky+3A==", "dependencies": { - "@next/env": "16.1.6", + "@next/env": "16.2.2", "@swc/helpers": "0.5.15", - "baseline-browser-mapping": "^2.8.3", + "baseline-browser-mapping": "^2.9.19", "caniuse-lite": "^1.0.30001579", "postcss": "8.4.31", "styled-jsx": "5.1.6" @@ -7369,15 +7356,15 @@ "node": ">=20.9.0" }, "optionalDependencies": { - "@next/swc-darwin-arm64": "16.1.6", - "@next/swc-darwin-x64": "16.1.6", - "@next/swc-linux-arm64-gnu": "16.1.6", - "@next/swc-linux-arm64-musl": "16.1.6", - "@next/swc-linux-x64-gnu": "16.1.6", - "@next/swc-linux-x64-musl": "16.1.6", - "@next/swc-win32-arm64-msvc": "16.1.6", - "@next/swc-win32-x64-msvc": "16.1.6", - "sharp": "^0.34.4" + "@next/swc-darwin-arm64": "16.2.2", + "@next/swc-darwin-x64": "16.2.2", + "@next/swc-linux-arm64-gnu": "16.2.2", + "@next/swc-linux-arm64-musl": "16.2.2", + "@next/swc-linux-x64-gnu": "16.2.2", + "@next/swc-linux-x64-musl": "16.2.2", + "@next/swc-win32-arm64-msvc": "16.2.2", + "@next/swc-win32-x64-msvc": "16.2.2", + "sharp": "^0.34.5" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", @@ -7701,11 +7688,10 @@ "license": "ISC" }, "node_modules/picomatch": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", - "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", + "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", "dev": true, - "license": "MIT", "engines": { "node": 
">=8.6" }, @@ -8973,11 +8959,10 @@ } }, "node_modules/tinyglobby/node_modules/picomatch": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.2.tgz", - "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "dev": true, - "license": "MIT", "engines": { "node": ">=12" }, From 94e748a7269e07fdd7ebdae373270aeae2c35655 Mon Sep 17 00:00:00 2001 From: bing1100 Date: Mon, 13 Apr 2026 23:18:47 -0400 Subject: [PATCH 2/3] fixing up metrics --- backend/api/screen/router.py | 410 +++++++++++++++++- backend/api/services/sr_db_service.py | 61 ++- backend/api/sr/router.py | 77 ++++ .../app/[lang]/can-sr/l1-screen/view/page.tsx | 260 ++++++----- .../app/[lang]/can-sr/l2-screen/view/page.tsx | 132 ++++-- .../api/can-sr/reviews/thresholds/route.ts | 57 +++ .../app/api/can-sr/screen/metrics/route.ts | 59 +++ .../components/can-sr/CitationListPage.tsx | 351 ++++++++++----- frontend/components/can-sr/PagedList.tsx | 112 ++++- .../can-sr/ScreeningMetricsPanel.tsx | 214 +++++++++ 10 files changed, 1451 insertions(+), 282 deletions(-) create mode 100644 frontend/app/api/can-sr/reviews/thresholds/route.ts create mode 100644 frontend/app/api/can-sr/screen/metrics/route.ts create mode 100644 frontend/components/can-sr/ScreeningMetricsPanel.tsx diff --git a/backend/api/screen/router.py b/backend/api/screen/router.py index 400ea9b0..3e7f27da 100644 --- a/backend/api/screen/router.py +++ b/backend/api/screen/router.py @@ -41,6 +41,33 @@ class AgentRunsQueryResponse(BaseModel): runs: List[Dict[str, Any]] +class ScreeningMetricsCriterion(BaseModel): + criterion_key: str + label: str + threshold: float + total_citations: int + has_run_count: int + low_confidence_count: int + 
critical_disagreement_count: int + confident_exclude_count: int + needs_human_review_count: int + + +class ScreeningMetricsSummary(BaseModel): + step: str + total_citations: int + validated_all: int + unvalidated_all: int + validated_needs_review: int + unvalidated_needs_review: int + needs_review_total: int + + +class ScreeningMetricsResponse(BaseModel): + sr_id: str + steps: Dict[str, Any] + + def _normalize_int_list(v: Any) -> List[int]: if v is None: return [] @@ -115,6 +142,91 @@ class ValidateStepRequest(BaseModel): sr_id: str = Field(..., description="Systematic review id") citation_id: int = Field(..., ge=1, description="Citation id (row id in the SR screening table)") step: str = Field("l1", description="Validation step: l1|l2|parameters") + checked: bool = Field(True, description="If true, add/update the current user's validation; if false, remove it") + + +def _as_validation_list(v: Any) -> List[Dict[str, str]]: + """Normalize DB values into a list of {user, validated_at} dicts.""" + + if v is None: + return [] + + # JSONB may come back as a list already; some deployments may return it as string. 
+ if isinstance(v, str): + try: + v = json.loads(v) + except Exception: + return [] + + if not isinstance(v, list): + return [] + + out: List[Dict[str, str]] = [] + for item in v: + if not isinstance(item, dict): + continue + user = item.get("user") or item.get("email") or item.get("validated_by") + ts = item.get("validated_at") or item.get("timestamp") or item.get("validatedAt") + if not user: + continue + out.append({"user": str(user), "validated_at": str(ts or "")}) + return out + + +def _dedupe_validations(items: List[Dict[str, str]]) -> List[Dict[str, str]]: + """Keep only one entry per user, keeping the latest timestamp lexicographically (ISO8601).""" + + by_user: Dict[str, Dict[str, str]] = {} + for it in items or []: + user = str(it.get("user") or "").strip() + if not user: + continue + cur = by_user.get(user) + if not cur: + by_user[user] = {"user": user, "validated_at": str(it.get("validated_at") or "")} + continue + # Prefer newest timestamp (ISO strings compare in chronological order) + if str(it.get("validated_at") or "") >= str(cur.get("validated_at") or ""): + by_user[user] = {"user": user, "validated_at": str(it.get("validated_at") or "")} + + # Return newest-first for nicer UI (most recent first) + return sorted(by_user.values(), key=lambda x: str(x.get("validated_at") or ""), reverse=True) + + +def _is_disagreeing_critical_answer(ans: Any) -> bool: + """Return True if critical stage indicates disagreement. + + Contract: agreement is encoded as "None of the above". + Any non-empty answer other than that is treated as critical disagreement. + """ + + s = str(ans or "").strip() + if not s: + return False + return s != "None of the above" + + +def _is_exclude_answer(ans: Any) -> bool: + """Detect exclude answers by convention: contains '(exclude)' (case-insensitive).""" + + s = str(ans or "") + return "(exclude)" in s.lower() + + +def _criterion_key_from_question(question: str) -> str: + # Keep in sync with the frontend derivation in l2-screen view. 
+ q = str(question or "") + try: + # Prefer shared helper when available. + return str(snake_case(q, max_len=56)) + except Exception: + # Fallback: lowercase, non-word -> underscore, collapse underscores. + s = q.strip().lower() + s = re.sub(r"[^\w]+", "_", s) + s = re.sub(r"_+", "_", s) + s = re.sub(r"^_+|_+$", "", s) + return s[:56] + class FulltextRunRequest(BaseModel): @@ -674,6 +786,7 @@ async def validate_screening_step( sr_id = str(payload.sr_id) citation_id = int(payload.citation_id) step = (payload.step or "l1").lower().strip() + checked = bool(payload.checked) if step not in {"l1", "l2", "parameters"}: raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be one of: l1, l2, parameters") @@ -687,24 +800,66 @@ async def validate_screening_step( table_name = (screening or {}).get("table_name") or "citations" - validated_by_col = f"{step}_validated_by" - validated_at_col = f"{step}_validated_at" - validated_by = str(current_user.get("email") or current_user.get("id") or "") + # New storage: per-step validations list (JSONB) + validations_col = f"{step}_validations" + validated_by_col = f"{step}_validated_by" # legacy summary + validated_at_col = f"{step}_validated_at" # legacy summary + + user_email = str(current_user.get("email") or current_user.get("id") or "").strip() now_iso = datetime.utcnow().isoformat() + "Z" try: # Ensure columns exist (best-effort; no-migrations philosophy) + await run_in_threadpool(cits_dp_service.create_column, validations_col, "JSONB", table_name) await run_in_threadpool(cits_dp_service.create_column, validated_by_col, "TEXT", table_name) await run_in_threadpool(cits_dp_service.create_column, validated_at_col, "TIMESTAMPTZ", table_name) - u1 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_by_col, validated_by, table_name) - u2 = await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_at_col, now_iso, table_name) + # Load row to get existing 
validations list + row = await run_in_threadpool(cits_dp_service.get_citation_by_id, citation_id, table_name) + if not row: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found") + + existing = _as_validation_list(row.get(validations_col)) + + if checked: + # Upsert (replace existing entry for this user with new timestamp) + existing = [x for x in existing if str(x.get("user") or "") != user_email] + existing.append({"user": user_email, "validated_at": now_iso}) + else: + # Remove + existing = [x for x in existing if str(x.get("user") or "") != user_email] + + normalized = _dedupe_validations(existing) + + u_list = await run_in_threadpool( + cits_dp_service.update_jsonb_column, + citation_id, + validations_col, + normalized, + table_name, + ) + + # Keep legacy summary fields in sync for existing UI/components: + # - if list empty => NULL out by/at + # - else => most recent validation + if not normalized: + await run_in_threadpool(cits_dp_service.clear_columns, citation_id, [validated_by_col, validated_at_col], table_name) + summary_by = None + summary_at = None + else: + summary_by = normalized[0].get("user") + summary_at = normalized[0].get("validated_at") + await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_by_col, str(summary_by or ""), table_name) + await run_in_threadpool(cits_dp_service.update_text_column, citation_id, validated_at_col, str(summary_at or ""), table_name) + + except HTTPException: + raise except RuntimeError as rexc: raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=str(rexc)) except Exception as e: raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to update validation fields: {e}") - if not (u1 and u2): + if not u_list: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Citation not found to update") return { @@ -712,8 +867,12 @@ async def validate_screening_step( "sr_id": sr_id, "citation_id": 
citation_id, "step": step, - "validated_by": validated_by, - "validated_at": now_iso, + "checked": checked, + "user": user_email, + "validated_at": now_iso if checked else None, + "validations": normalized, + "summary_validated_by": summary_by, + "summary_validated_at": summary_at, } @@ -1061,6 +1220,241 @@ async def get_latest_agent_runs( return AgentRunsQueryResponse(sr_id=sr_id, pipeline=pipeline_norm, citation_ids=parsed_ids, runs=rows) + +@router.get("/metrics", response_model=ScreeningMetricsResponse) +async def get_screening_metrics( + sr_id: str, + step: str = "l1", + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Return per-criterion metrics + validation summaries for a screening step. + + - Each criterion uses its own threshold (from SR.screening_thresholds[step][criterion_key]). + - Needs-human-review logic: + 1) If ANY criterion is a confident exclude => no human review needed for the citation. + 2) Else if ANY criterion has critical disagreement => needs review. + 3) Else if ANY criterion is low confidence (below its threshold) => needs review. 
+ """ + + step_norm = str(step or "l1").lower().strip() + if step_norm not in {"l1", "l2"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be l1 or l2") + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + # Criteria questions for step + cp = sr.get("criteria_parsed") or {} + crit_block = cp.get(step_norm) if isinstance(cp, dict) else None + questions = (crit_block or {}).get("questions") if isinstance(crit_block, dict) else [] + questions = questions if isinstance(questions, list) else [] + + # Threshold map + sr_thresholds = sr.get("screening_thresholds") or {} + step_thresholds = sr_thresholds.get(step_norm) if isinstance(sr_thresholds, dict) else None + step_thresholds = step_thresholds if isinstance(step_thresholds, dict) else {} + + # Build criterion list (key + label + threshold) + criteria: List[Dict[str, Any]] = [] + for q in questions: + if not isinstance(q, str) or not q.strip(): + continue + ck = _criterion_key_from_question(q) + thr_raw = step_thresholds.get(ck) + try: + thr = float(thr_raw) + thr = max(0.0, min(1.0, thr)) + except Exception: + thr = 0.9 + criteria.append({"criterion_key": ck, "label": q, "threshold": thr}) + + # Pull all citation ids for this step (L2 list is filtered by human_l1_decision include) + filter_step = "" + if step_norm == "l2": + filter_step = "l1" + try: + ids = await run_in_threadpool(cits_dp_service.list_citation_ids, filter_step, table_name) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to list citations: {e}") + + # Load only columns we need + needed_cols: List[str] = ["id"] + validations_col = f"{step_norm}_validations" + legacy_validated_by = 
f"{step_norm}_validated_by" + + needed_cols.extend([validations_col, legacy_validated_by]) + + # We'll compute per-citation needs-review based on agent runs only. + # Fetch latest runs for all citations (bulk query using service helper) + pipeline_norm = "title_abstract" if step_norm == "l1" else "fulltext" + try: + runs = await run_in_threadpool( + cits_dp_service.list_latest_agent_runs, + sr_id=sr_id, + table_name=table_name, + citation_ids=ids, + pipeline=pipeline_norm, + ) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load agent runs: {e}") + + # Group runs by citation then criterion + runs_by_cit: Dict[int, Dict[str, Dict[str, Dict[str, Any]]]] = {} + for r in runs or []: + try: + cid = int(r.get("citation_id")) + except Exception: + continue + ck = str(r.get("criterion_key") or "") + stg = str(r.get("stage") or "") + if not ck or stg not in {"screening", "critical"}: + continue + if cid not in runs_by_cit: + runs_by_cit[cid] = {} + if ck not in runs_by_cit[cid]: + runs_by_cit[cid][ck] = {} + runs_by_cit[cid][ck][stg] = r + + # Load citation rows for validations (and to know total citations count) + # If ids huge, this could be heavy; acceptable for now, can paginate later. + try: + rows = await run_in_threadpool(cits_dp_service.get_citations_by_ids, ids, table_name, needed_cols) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load citation rows: {e}") + + # Helper: is validated? 
+ def _is_validated(row: Dict[str, Any]) -> bool: + v = row.get(validations_col) + if v: + try: + parsed = v + if isinstance(v, str): + parsed = json.loads(v) + if isinstance(parsed, list) and len(parsed) > 0: + return True + except Exception: + pass + return bool(row.get(legacy_validated_by)) + + # Per-criterion aggregates + agg: Dict[str, Dict[str, int]] = {} + for c in criteria: + ck = c["criterion_key"] + agg[ck] = { + "total_citations": 0, + "has_run_count": 0, + "low_confidence_count": 0, + "critical_disagreement_count": 0, + "confident_exclude_count": 0, + "needs_human_review_count": 0, + } + + total_citations = 0 + validated_all = 0 + needs_review_total = 0 + validated_needs_review = 0 + + # Iterate citations and compute needs-review + per-criterion counts + for row in rows or []: + try: + cid = int(row.get("id")) + except Exception: + continue + total_citations += 1 + validated = _is_validated(row) + if validated: + validated_all += 1 + + per_crit = runs_by_cit.get(cid, {}) + + # Evaluate confident exclude override + has_confident_exclude = False + has_critical_disagreement = False + has_low_confidence = False + + for c in criteria: + ck = c["criterion_key"] + thr = float(c["threshold"]) + a = agg.get(ck) + if a is None: + continue + a["total_citations"] += 1 + + rpair = per_crit.get(ck) or {} + scr = rpair.get("screening") + crit = rpair.get("critical") + + if scr: + a["has_run_count"] += 1 + conf = scr.get("confidence") + try: + conf_f = float(conf) + except Exception: + conf_f = None + ans = scr.get("answer") + + if conf_f is not None and conf_f < thr: + a["low_confidence_count"] += 1 + has_low_confidence = True + + if conf_f is not None and conf_f >= thr and _is_exclude_answer(ans): + a["confident_exclude_count"] += 1 + has_confident_exclude = True + + if crit and _is_disagreeing_critical_answer(crit.get("answer")): + a["critical_disagreement_count"] += 1 + has_critical_disagreement = True + + needs_review = (not has_confident_exclude) and 
(has_critical_disagreement or has_low_confidence) + if needs_review: + needs_review_total += 1 + if validated: + validated_needs_review += 1 + # increment per-criterion needs-review count for all criteria + for c in criteria: + agg[c["criterion_key"]]["needs_human_review_count"] += 1 + + unvalidated_all = max(0, total_citations - validated_all) + unvalidated_needs_review = max(0, needs_review_total - validated_needs_review) + + # Build response + crit_out: List[Dict[str, Any]] = [] + for c in criteria: + ck = c["criterion_key"] + a = agg.get(ck) or {} + crit_out.append( + { + "criterion_key": ck, + "label": c["label"], + "threshold": float(c["threshold"]), + **a, + } + ) + + return ScreeningMetricsResponse( + sr_id=sr_id, + steps={ + step_norm: { + "summary": { + "step": step_norm, + "total_citations": total_citations, + "validated_all": validated_all, + "unvalidated_all": unvalidated_all, + "needs_review_total": needs_review_total, + "validated_needs_review": validated_needs_review, + "unvalidated_needs_review": unvalidated_needs_review, + }, + "criteria": crit_out, + } + }, + ) + async def update_inclusion_decision( sr: Dict[str, Any], citation_id: int, diff --git a/backend/api/services/sr_db_service.py b/backend/api/services/sr_db_service.py index f936ee2a..013a4c32 100644 --- a/backend/api/services/sr_db_service.py +++ b/backend/api/services/sr_db_service.py @@ -50,12 +50,28 @@ def ensure_table_exists(self) -> None: criteria JSONB, criteria_yaml TEXT, criteria_parsed JSONB, + screening_thresholds JSONB, screening_db JSONB, created_at TIMESTAMP WITH TIME ZONE DEFAULT now(), updated_at TIMESTAMP WITH TIME ZONE DEFAULT now() ) """ cur.execute(create_table_sql) + + # Runtime schema evolution for existing deployments. + # (No migrations philosophy: add columns if missing.) + try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN IF NOT EXISTS screening_thresholds JSONB" + ) + except Exception: + # Older PG versions might not support IF NOT EXISTS. 
+ try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN screening_thresholds JSONB" + ) + except Exception: + pass conn.commit() logger.info("Ensured systematic_reviews table exists") @@ -186,8 +202,8 @@ def create_systematic_review( insert_sql = """ INSERT INTO systematic_reviews (id, name, description, owner_id, owner_email, users, visible, - criteria, criteria_yaml, criteria_parsed, created_at, updated_at) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + criteria, criteria_yaml, criteria_parsed, screening_thresholds, created_at, updated_at) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ cur.execute(insert_sql, ( @@ -201,6 +217,7 @@ def create_systematic_review( json.dumps(criteria_obj) if criteria_obj else None, criteria_str, json.dumps(criteria_parsed), + json.dumps({"l1": {}, "l2": {}, "parameters": {}}), now, now )) @@ -220,6 +237,8 @@ def create_systematic_review( sr_doc['criteria'] = json.loads(sr_doc['criteria']) if sr_doc.get('criteria_parsed') and isinstance(sr_doc['criteria_parsed'], str): sr_doc['criteria_parsed'] = json.loads(sr_doc['criteria_parsed']) + if sr_doc.get('screening_thresholds') and isinstance(sr_doc['screening_thresholds'], str): + sr_doc['screening_thresholds'] = json.loads(sr_doc['screening_thresholds']) # Convert datetime objects to ISO strings from datetime import datetime as dt if sr_doc.get('created_at') and isinstance(sr_doc['created_at'], dt): @@ -501,6 +520,8 @@ def list_systematic_reviews_for_user(self, user_email: str) -> List[Dict[str, An doc['criteria'] = json.loads(doc['criteria']) if doc.get('criteria_parsed') and isinstance(doc['criteria_parsed'], str): doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) + if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): + doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) # Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and 
isinstance(doc['created_at'], dt): @@ -559,6 +580,8 @@ def get_systematic_review(self, sr_id: str, ignore_visibility: bool = False) -> doc['criteria'] = json.loads(doc['criteria']) if doc.get('criteria_parsed') and isinstance(doc['criteria_parsed'], str): doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) + if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): + doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) # Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and isinstance(doc['created_at'], dt): @@ -709,6 +732,40 @@ def update_screening_db_info(self, sr_id: str, screening_db: Dict[str, Any]) -> if conn: pass + + def update_screening_thresholds(self, sr_id: str, screening_thresholds: Dict[str, Any]) -> None: + """Persist per-criterion screening thresholds on the SR record. + + This is SR-scoped shared state. Permission checks are expected to be + enforced by callers (routers) before calling this helper. + """ + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + + updated_at = datetime.utcnow().isoformat() + cur.execute( + "UPDATE systematic_reviews SET screening_thresholds = %s, updated_at = %s WHERE id = %s", + (json.dumps(screening_thresholds), updated_at, sr_id), + ) + conn.commit() + except Exception as e: + try: + if conn: + conn.rollback() + except Exception: + pass + logger.exception(f"Failed to update screening thresholds: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to update screening thresholds: {e}", + ) + finally: + if conn: + pass + def clear_screening_db_info(self, sr_id: str) -> None: """ Remove the screening_db field from the SR document. 
diff --git a/backend/api/sr/router.py b/backend/api/sr/router.py index e36b095d..c52416a2 100644 --- a/backend/api/sr/router.py +++ b/backend/api/sr/router.py @@ -52,6 +52,13 @@ class SystematicReviewRead(BaseModel): # convenience structured metadata extracted from criteria (l1, l2, parameters) criteria_parsed: Optional[Dict[str, Any]] = None + # Per-step, per-criterion thresholds (SR-scoped). Example: + # { + # "l1": {"population": 0.9, "intervention": 0.85}, + # "l2": {"outcome": 0.9} + # } + screening_thresholds: Optional[Dict[str, Any]] = None + @@ -136,6 +143,7 @@ async def create_systematic_review( criteria=sr_doc.get("criteria"), criteria_yaml=sr_doc.get("criteria_yaml"), criteria_parsed=sr_doc.get("criteria_parsed"), + screening_thresholds=sr_doc.get("screening_thresholds"), ) @@ -261,6 +269,7 @@ async def list_systematic_reviews_for_user( criteria=doc.get("criteria"), criteria_yaml=doc.get("criteria_yaml"), criteria_parsed=doc.get("criteria_parsed"), + screening_thresholds=doc.get("screening_thresholds"), ) ) @@ -293,6 +302,7 @@ async def get_systematic_review(sr_id: str, current_user: Dict[str, Any] = Depen criteria=doc.get("criteria"), criteria_yaml=doc.get("criteria_yaml"), criteria_parsed=doc.get("criteria_parsed"), + screening_thresholds=doc.get("screening_thresholds"), ) @@ -390,9 +400,76 @@ async def update_systematic_review_criteria( criteria=doc.get("criteria"), criteria_yaml=doc.get("criteria_yaml"), criteria_parsed=doc.get("criteria_parsed"), + screening_thresholds=doc.get("screening_thresholds"), ) +class ThresholdsUpdateRequest(BaseModel): + screening_thresholds: Dict[str, Any] = {} + + +@router.get("/{sr_id}/screening_thresholds") +async def get_screening_thresholds(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): + """Get SR-scoped per-step per-criterion thresholds.""" + + try: + doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + 
except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + thresholds = doc.get("screening_thresholds") or {} + if not isinstance(thresholds, dict): + thresholds = {} + return {"sr_id": sr_id, "screening_thresholds": thresholds} + + +@router.put("/{sr_id}/screening_thresholds") +async def update_screening_thresholds( + sr_id: str, + payload: ThresholdsUpdateRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Update SR-scoped per-step per-criterion thresholds. + + Any SR member may update thresholds (per product requirement). + """ + + try: + _doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + thresholds = payload.screening_thresholds or {} + if not isinstance(thresholds, dict): + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="screening_thresholds must be an object") + + # Normalize: only allow known steps keys, but keep it permissive. 
+ normalized: Dict[str, Any] = {} + for step in ("l1", "l2"): + block = thresholds.get(step) + if isinstance(block, dict): + out: Dict[str, float] = {} + for k, v in block.items(): + if not isinstance(k, str) or not k.strip(): + continue + try: + f = float(v) + except Exception: + continue + f = max(0.0, min(1.0, f)) + out[k] = f + normalized[step] = out + else: + normalized[step] = {} + + await run_in_threadpool(srdb_service.update_screening_thresholds, sr_id, normalized) + return {"status": "success", "sr_id": sr_id, "screening_thresholds": normalized} + + @router.delete("/{sr_id}") async def delete_systematic_review(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): """ diff --git a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx index bb4dd22f..57891cb5 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx @@ -54,6 +54,32 @@ function humanScreenColumn(name: string) { return base.replace(/^llm_/, 'human_') } +type ValidationEntry = { user: string; validated_at: string } + +function parseValidations(v: any): ValidationEntry[] { + if (!v) return [] + try { + const parsed = typeof v === 'string' ? JSON.parse(v) : v + if (!Array.isArray(parsed)) return [] + return parsed + .filter((x: any) => x && typeof x === 'object') + .map((x: any) => ({ + user: String(x.user ?? x.email ?? x.validated_by ?? ''), + validated_at: String(x.validated_at ?? x.timestamp ?? 
''), + })) + .filter((x: any) => x.user) + } catch { + return [] + } +} + +function formatValidationDate(v: string): string { + if (!v) return '' + const d = new Date(v) + if (Number.isNaN(d.getTime())) return v + return d.toLocaleString() +} + /* Types for local clarity */ type CriteriaData = { questions: string[] @@ -76,6 +102,11 @@ export default function CanSrL1ScreenPage() { const searchParams = useSearchParams() const srId = searchParams?.get('sr_id') const citationId = searchParams?.get('citation_id') + const thresholdParam = searchParams?.get('threshold') + const threshold = useMemo(() => { + const v = Number(thresholdParam) + return Number.isFinite(v) ? Math.max(0, Math.min(1, v)) : 0.9 + }, [thresholdParam]) // Get current language to keep language when navigating (must be unconditional hook call) const { lang } = useParams<{ lang: string }>() const [selectedModel, setSelectedModel] = useState('gpt-5-mini') @@ -106,6 +137,17 @@ export default function CanSrL1ScreenPage() { const [loadingRuns, setLoadingRuns] = useState(false) const [validating, setValidating] = useState(false) + const [userEmail, setUserEmail] = useState(null) + + const l1Validations = useMemo(() => parseValidations((citation as any)?.l1_validations), [citation]) + const l1Checked = useMemo(() => { + const me = String(userEmail || '') + if (!me) return false + return l1Validations.some((v) => v.user === me) + }, [l1Validations, userEmail]) + const l1ValidationsSorted = useMemo(() => { + return [...l1Validations].sort((a, b) => String(b.validated_at || '').localeCompare(String(a.validated_at || ''))) + }, [l1Validations]) useEffect(() => { if (!srId || !citationId) { @@ -137,6 +179,23 @@ export default function CanSrL1ScreenPage() { loadIds() }, [srId]) + // Fetch current user email for the "Validated by [UserEmail]" checkbox label. 
+ useEffect(() => { + const loadMe = async () => { + try { + const headers = { ...getAuthHeaders() } + const res = await fetch('/api/auth/me', { method: 'GET', headers }) + const data = await res.json().catch(() => ({})) + if (res.ok) { + setUserEmail(String(data?.user?.email || data?.email || '')) + } + } catch { + // ignore + } + } + loadMe() + }, []) + // Load citation row // Extracted fetch function so we can re-use it when navigating between citations async function fetchCitationById(id: string) { @@ -486,8 +545,7 @@ export default function CanSrL1ScreenPage() { -
    - {/* Agentic summary + Validate */} -
    -
    -
    -

    Agentic results

    -

    - Latest screening + critical runs per criterion. -

    -
    -
    - - - {citation?.l1_validated_by ? ( - - Validated by {String(citation.l1_validated_by)} - - ) : ( - Not validated - )} -
    -
    - - {loadingRuns ? ( -
    Loading agent runs…
    - ) : criteriaData?.questions?.length ? ( -
    - {criteriaData.questions.map((q, idx) => { - const criterionKey = q - ? q - .trim() - .toLowerCase() - .replace(/[^\w]+/g, '_') - .replace(/_+/g, '_') - .replace(/^_+|_+$/g, '') - .slice(0, 56) - : '' - - const r = runsByCriterion[criterionKey] || {} - const scr = r.screening - const crit = r.critical - - const critDisagrees = - crit && String((crit as any)?.answer || '').trim() !== '' && - String((crit as any)?.answer || '').trim() !== 'None of the above' - - return ( -
    -
    {q}
    -
    -
    -
    Screening
    -
    Answer: {String((scr as any)?.answer ?? '—')}
    -
    Confidence: {String((scr as any)?.confidence ?? '—')}
    -
    -
    -
    Critical
    -
    Answer: {String((crit as any)?.answer ?? '—')}
    -
    Confidence: {String((crit as any)?.confidence ?? '—')}
    - {critDisagrees ? ( -
    Disagrees
    - ) : null} -
    -
    -
    - ) - })} -
    - ) : ( -
    No criteria loaded yet.
    - )} -
    - +
    {/* Workspace (left) */} -
    +
    {workspace}
    {/* Selection sidebar (right) */} -
    @@ -752,7 +780,7 @@ export default function CanSrL1ScreenPage() { router.push( `/${lang}/can-sr/l1-screen/view?sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( target, - )}`, + )}&threshold=${encodeURIComponent(String(threshold))}`, ) }} className="rounded-md border bg-white px-4 py-2 text-sm shadow-sm hover:bg-gray-50" @@ -775,7 +803,7 @@ export default function CanSrL1ScreenPage() { router.push( `/${lang}/can-sr/l1-screen/view?sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( target, - )}`, + )}&threshold=${encodeURIComponent(String(threshold))}`, ) }} className="rounded-md bg-emerald-600 px-4 py-2 text-sm font-medium text-white hover:bg-emerald-700" diff --git a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx index fef5b4f6..9af2199e 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx @@ -9,6 +9,32 @@ import { Wand2 } from 'lucide-react' import { getAuthToken, getTokenType } from '@/lib/auth' import { useDictionary } from '@/app/[lang]/DictionaryProvider' +type ValidationEntry = { user: string; validated_at: string } + +function parseValidations(v: any): ValidationEntry[] { + if (!v) return [] + try { + const parsed = typeof v === 'string' ? JSON.parse(v) : v + if (!Array.isArray(parsed)) return [] + return parsed + .filter((x: any) => x && typeof x === 'object') + .map((x: any) => ({ + user: String(x.user ?? x.email ?? x.validated_by ?? ''), + validated_at: String(x.validated_at ?? x.timestamp ?? ''), + })) + .filter((x: any) => x.user) + } catch { + return [] + } +} + +function formatValidationDate(v: string): string { + if (!v) return '' + const d = new Date(v) + if (Number.isNaN(d.getTime())) return v + return d.toLocaleString() +} + /* Full-text single-citation viewer for L2 screening. 
@@ -112,6 +138,17 @@ export default function CanSrL2ScreenViewPage() { const [agentRuns, setAgentRuns] = useState([]) const [loadingRuns, setLoadingRuns] = useState(false) const [validating, setValidating] = useState(false) + const [userEmail, setUserEmail] = useState(null) + + const l2Validations = useMemo(() => parseValidations((citation as any)?.l2_validations), [citation]) + const l2Checked = useMemo(() => { + const me = String(userEmail || '') + if (!me) return false + return l2Validations.some((v) => v.user === me) + }, [l2Validations, userEmail]) + const l2ValidationsSorted = useMemo(() => { + return [...l2Validations].sort((a, b) => String(b.validated_at || '').localeCompare(String(a.validated_at || ''))) + }, [l2Validations]) // Fulltext PDF viewer linkage const [fulltextCoords, setFulltextCoords] = useState(null) @@ -149,6 +186,23 @@ export default function CanSrL2ScreenViewPage() { loadIds() }, [srId]) + // Fetch current user email for validation toggling. + useEffect(() => { + const loadMe = async () => { + try { + const headers = { ...getAuthHeaders() } + const res = await fetch('/api/auth/me', { method: 'GET', headers }) + const data = await res.json().catch(() => ({})) + if (res.ok) { + setUserEmail(String(data?.user?.email || data?.email || '')) + } + } catch { + // ignore + } + } + loadMe() + }, []) + // Load citation row (and ensure fulltext is extracted if missing) async function fetchCitationById(id: string) { if (!srId || !id) return @@ -684,46 +738,54 @@ export default function CanSrL2ScreenViewPage() {

    - - - {citation?.l2_validated_by ? ( - - Validated by {String(citation.l2_validated_by)} + }} + /> + + Validated by {String(userEmail || '—')} - ) : ( - Not validated - )} +
    + {l2ValidationsSorted.length ? ( +
    + {l2ValidationsSorted.map((v, idx) => ( +
    + Validated on {formatValidationDate(v.validated_at)} by {v.user} +
    + ))} +
    + ) : ( +
    Not validated
    + )} + {loadingRuns ? (
    Loading agent runs…
    ) : criteriaData?.questions?.length ? ( diff --git a/frontend/app/api/can-sr/reviews/thresholds/route.ts b/frontend/app/api/can-sr/reviews/thresholds/route.ts new file mode 100644 index 00000000..2e439af8 --- /dev/null +++ b/frontend/app/api/can-sr/reviews/thresholds/route.ts @@ -0,0 +1,57 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Frontend proxy for SR screening thresholds. + * + * Routes handled: + * - GET /api/can-sr/reviews/thresholds?sr_id=... -> BACKEND_URL/api/sr/{sr_id}/screening_thresholds + * - PUT /api/can-sr/reviews/thresholds?sr_id=... -> BACKEND_URL/api/sr/{sr_id}/screening_thresholds + */ + +async function forward(request: NextRequest, method: 'GET' | 'PUT') { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + if (!srId) { + return NextResponse.json({ error: 'sr_id query parameter is required' }, { status: 400 }) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json({ error: 'Authorization header is required' }, { status: 401 }) + } + + const url = `${BACKEND_URL}/api/sr/${encodeURIComponent(srId)}/screening_thresholds` + + const body = method === 'PUT' ? JSON.stringify(await request.json()) : undefined + + const res = await fetch(url, { + method, + headers: { + Authorization: authHeader, + ...(method === 'PUT' ? 
{ 'Content-Type': 'application/json' } : {}), + }, + body, + }) + + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) +} + +export async function GET(request: NextRequest) { + try { + return await forward(request, 'GET') + } catch (error) { + console.error('thresholds GET API error:', error) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} + +export async function PUT(request: NextRequest) { + try { + return await forward(request, 'PUT') + } catch (error) { + console.error('thresholds PUT API error:', error) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/metrics/route.ts b/frontend/app/api/can-sr/screen/metrics/route.ts new file mode 100644 index 00000000..710dfc2e --- /dev/null +++ b/frontend/app/api/can-sr/screen/metrics/route.ts @@ -0,0 +1,59 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: GET /api/can-sr/screen/metrics?sr_id=&step=l1|l2 + * -> GET {BACKEND_URL}/api/screen/metrics?sr_id=...&step=... 
+ */ + +export async function OPTIONS() { + return new Response(null, { + status: 204, + headers: { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET,OPTIONS', + 'Access-Control-Allow-Headers': 'Authorization, Content-Type', + }, + }) +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + const step = params.get('step') || 'l1' + + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json({ error: 'Authorization header is required' }, { status: 401 }) + } + + const url = new URL(`${BACKEND_URL}/api/screen/metrics`) + url.searchParams.set('sr_id', srId) + url.searchParams.set('step', step) + + const res = await fetch(url.toString(), { + method: 'GET', + headers: { + Authorization: authHeader, + }, + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('screen metrics proxy GET error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/components/can-sr/CitationListPage.tsx b/frontend/components/can-sr/CitationListPage.tsx index df823536..3f99aa12 100644 --- a/frontend/components/can-sr/CitationListPage.tsx +++ b/frontend/components/can-sr/CitationListPage.tsx @@ -9,6 +9,11 @@ import { Bot, Check, Wand2 } from 'lucide-react' import { useDictionary } from '@/app/[lang]/DictionaryProvider' import { ModelSelector } from '@/components/chat' import { toast } from 'react-hot-toast' +import ScreeningMetricsPanel, { + type ScreeningMetricsStats, + type ScreeningMetricsSummary, + type ScreeningCriterionMetrics, +} from '@/components/can-sr/ScreeningMetricsPanel' import { Dialog, DialogContent, @@ -74,6 +79,17 @@ export default function CitationsListPage({ const [error, setError] = useState(null) const [criteriaData, setCriteriaData] = useState() + // Phase 1 list control surface is now hosted by the left-side metrics module. 
+ const [threshold, setThreshold] = useState(0.9) + const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + const [pageStats, setPageStats] = useState(undefined) + + // Phase 2 metrics (SR-wide) + const [srMetricsSummary, setSrMetricsSummary] = useState(undefined) + const [srCriterionMetrics, setSrCriterionMetrics] = useState(undefined) + const [srThresholds, setSrThresholds] = useState | null>(null) + const [metricsRefreshKey, setMetricsRefreshKey] = useState(0) + // Run-all job tracking (persist across modal close / refresh) const [runAllForce, setRunAllForce] = useState(false) const [runAllJobId, setRunAllJobId] = useState(null) @@ -168,6 +184,80 @@ export default function CitationsListPage({ loadCitations() }, [srId, router, screeningStep]) + // Load SR thresholds + metrics (L1/L2 only) + useEffect(() => { + if (!srId) return + if (!(screeningStep === 'l1' || screeningStep === 'l2')) { + setSrMetricsSummary(undefined) + setSrCriterionMetrics(undefined) + setSrThresholds(null) + return + } + + const load = async () => { + try { + const headers = getAuthHeaders() + + // 1) thresholds + const tRes = await fetch( + `/api/can-sr/reviews/thresholds?sr_id=${encodeURIComponent(srId)}`, + { method: 'GET', headers }, + ) + const tJson = await tRes.json().catch(() => ({})) + const thresholds = (tRes.ok ? tJson?.screening_thresholds : null) || {} + setSrThresholds(typeof thresholds === 'object' && thresholds ? 
thresholds : {}) + + // 2) metrics + const mRes = await fetch( + `/api/can-sr/screen/metrics?sr_id=${encodeURIComponent(srId)}&step=${encodeURIComponent( + screeningStep, + )}`, + { method: 'GET', headers }, + ) + const mJson = await mRes.json().catch(() => ({})) + if (mRes.ok) { + const stepBlock = mJson?.steps?.[screeningStep] + setSrMetricsSummary(stepBlock?.summary) + setSrCriterionMetrics(stepBlock?.criteria) + } else { + setSrMetricsSummary(undefined) + setSrCriterionMetrics(undefined) + } + } catch { + setSrMetricsSummary(undefined) + setSrCriterionMetrics(undefined) + setSrThresholds(null) + } + } + load() + }, [srId, screeningStep, metricsRefreshKey]) + + const persistThresholds = useCallback( + async (nextThresholds: Record) => { + if (!srId) return + try { + const headers = { ...getAuthHeaders(), 'Content-Type': 'application/json' } + const res = await fetch( + `/api/can-sr/reviews/thresholds?sr_id=${encodeURIComponent(srId)}`, + { + method: 'PUT', + headers, + body: JSON.stringify({ screening_thresholds: nextThresholds }), + }, + ) + const j = await res.json().catch(() => ({})) + if (res.ok) { + setSrThresholds(j?.screening_thresholds || nextThresholds) + // Refresh metrics so counts reflect the new thresholds. + setMetricsRefreshKey((k) => k + 1) + } + } catch { + // ignore + } + }, + [srId], + ) + // Restore persisted run-all job id useEffect(() => { if (!runAllStorageKey) return @@ -323,7 +413,11 @@ export default function CitationsListPage({ } /> -
    + {/* + Layout: left floating/side metrics module + right list. + (A true fixed overlay can be added later; this keeps it responsive and simple.) + */} +
    setRunAllModalOpen(false)}> @@ -362,116 +456,163 @@ export default function CitationsListPage({ -
    -
    -
    -

    - {dict.screening.citationsList} -

    -

    - {dict.screening.citationsListDesc} -

    -
    - -
    - - - - -
    -
    - - - {dict.screening.llmClassified} - -
    -
    - - - {dict.screening.humanVerified} - -
    -
    + />
    -
    - - {/* Run-all status/controls are shown in the bottom-right floating panel. */} + + +
    +
    +
    +
    +

    + {dict.screening.citationsList} +

    +

    + {dict.screening.citationsListDesc} +

    +
    -
    - {loading ? ( -
    - {dict.screening.loadingCitations} -
    - ) : error ? ( -
    {error}
    - ) : citationIds && citationIds.length === 0 ? ( -
    - {dict.screening.noCitations} -
    - ) : ( -
    -
    - {dict.screening.totalCitations}{' '} - {citationIds ? citationIds.length : 0} +
    + + + + +
    +
    + + + {dict.screening.llmClassified} + +
    +
    + + + {dict.screening.humanVerified} + +
    +
    +
    - + {/* Run-all status/controls are shown in the bottom-right floating panel. */} + +
    + {loading ? ( +
    + {dict.screening.loadingCitations} +
    + ) : error ? ( +
    {error}
    + ) : citationIds && citationIds.length === 0 ? ( +
    + {dict.screening.noCitations} +
    + ) : ( +
    +
    + {dict.screening.totalCitations}{' '} + {citationIds ? citationIds.length : 0} +
    + + + setPageStats({ + scopeLabel: 'this page', + total: s.total, + needsValidation: s.needsValidation, + validated: s.validated, + unvalidated: s.unvalidated, + }) + } + /> +
    + )}
    - )} +
    diff --git a/frontend/components/can-sr/PagedList.tsx b/frontend/components/can-sr/PagedList.tsx index b0417b50..dced77a1 100644 --- a/frontend/components/can-sr/PagedList.tsx +++ b/frontend/components/can-sr/PagedList.tsx @@ -12,6 +12,17 @@ type CitationInfo = { include: string[] screeningStep: string pageview: string + threshold?: number + thresholdByCriterionKey?: Record + filterMode?: 'needs' | 'validated' | 'unvalidated' | 'all' + onThresholdChange?: (v: number) => void + onFilterModeChange?: (v: 'needs' | 'validated' | 'unvalidated' | 'all') => void + onStatsChange?: (stats: { + total: number + needsValidation: number + validated: number + unvalidated: number + }) => void } type LatestAgentRun = { @@ -44,6 +55,12 @@ export default function PagedList({ possible_answers, screeningStep, pageview, + threshold: thresholdProp, + thresholdByCriterionKey, + filterMode: filterModeProp, + onThresholdChange, + onFilterModeChange, + onStatsChange, }: CitationInfo) { const router = useRouter() const { lang } = useParams<{ lang: string }>() @@ -62,9 +79,12 @@ export default function PagedList({ ) const [showClassify, setShowClassify] = useState>({}) - // TA list controls - const [threshold, setThreshold] = useState(0.9) - const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + // List controls (controlled by parent when provided; otherwise local state) + const [thresholdLocal, setThresholdLocal] = useState(0.9) + const [filterModeLocal, setFilterModeLocal] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') + + const threshold = typeof thresholdProp === 'number' ? 
thresholdProp : thresholdLocal + const filterMode = filterModeProp || filterModeLocal const [latestRunsByCitation, setLatestRunsByCitation] = useState>({}) @@ -154,7 +174,7 @@ export default function PagedList({ setLatestRunsByCitation((prev: Record) => ({ ...prev, ...grouped })) } } - } catch (e) { + } catch { // best-effort } } @@ -168,9 +188,25 @@ export default function PagedList({ const isValidatedForStep = (row: any): boolean => { if (!row) return false - if (screeningStep === 'l1') return Boolean(row?.l1_validated_by) - if (screeningStep === 'l2') return Boolean(row?.l2_validated_by) - if (screeningStep === 'extract') return Boolean(row?.parameters_validated_by) + const hasValidationsList = (v: any): boolean => { + if (!v) return false + if (Array.isArray(v)) return v.length > 0 + if (typeof v === 'string') { + try { + const parsed = JSON.parse(v) + return Array.isArray(parsed) && parsed.length > 0 + } catch { + return false + } + } + return false + } + + // Prefer new per-step validations list; fall back to legacy single fields. + if (screeningStep === 'l1') return hasValidationsList(row?.l1_validations) || Boolean(row?.l1_validated_by) + if (screeningStep === 'l2') return hasValidationsList(row?.l2_validations) || Boolean(row?.l2_validated_by) + if (screeningStep === 'extract') + return hasValidationsList(row?.parameters_validations) || Boolean(row?.parameters_validated_by) return false } @@ -194,21 +230,39 @@ export default function PagedList({ byKey[key].push(r) } - // Needs validation if ANY criterion is low confidence OR critical disagrees + // Rule: + // - If ANY criterion is a confident exclude (screening answer contains "(exclude)" AND conf >= threshold) => no review needed. + // - Else needs review if ANY criterion is low confidence OR critical disagrees. 
+ + let hasConfidentExclude = false + let hasLowConfidence = false + let hasCriticalDisagree = false + for (const key of Object.keys(byKey)) { const items = byKey[key] const screening = items.find((x) => String((x as any)?.stage) === 'screening') const critical = items.find((x) => String((x as any)?.stage) === 'critical') const conf = Number((screening as any)?.confidence) - if (Number.isFinite(conf) && conf < threshold) return true + const perThrRaw = thresholdByCriterionKey ? Number((thresholdByCriterionKey as any)[key]) : NaN + const thr = Number.isFinite(perThrRaw) ? Math.max(0, Math.min(1, perThrRaw)) : threshold + + if (Number.isFinite(conf) && conf < thr) hasLowConfidence = true + + const ans = String((screening as any)?.answer || '') + if (Number.isFinite(conf) && conf >= thr && ans.toLowerCase().includes('(exclude)')) { + hasConfidentExclude = true + } const criticalAns = String((critical as any)?.answer || '') // In our critical prompt contract, agreement is "None of the above". 
- if (critical && criticalAns.trim() !== '' && criticalAns.trim() !== 'None of the above') return true + if (critical && criticalAns.trim() !== '' && criticalAns.trim() !== 'None of the above') { + hasCriticalDisagree = true + } } - return false + if (hasConfidentExclude) return false + return hasLowConfidence || hasCriticalDisagree } const filteredCitationData = citationData.filter((row: any) => { @@ -224,6 +278,23 @@ export default function PagedList({ return true }) + // Emit list stats upward (so CitationListPage can render a floating metrics module) + useEffect(() => { + if (!onStatsChange) return + const total = citationData.length + let validated = 0 + let needsValidation = 0 + for (const row of citationData) { + const id = Number(row?.id) + if (!Number.isFinite(id)) continue + if (isValidatedForStep(row)) validated += 1 + if (computeNeedsValidation(id, row)) needsValidation += 1 + } + const unvalidated = Math.max(0, total - validated) + onStatsChange({ total, needsValidation, validated, unvalidated }) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [citationData, threshold, screeningStep, latestRunsByCitation, thresholdByCriterionKey]) + // NOTE: Previously we fetched each citation via /citations/get. // This is now replaced by a single /citations/batch call per page. @@ -306,7 +377,10 @@ export default function PagedList({ return (
    - {screeningStep === 'l1' || screeningStep === 'l2' ? ( + {/* Controls moved to CitationListPage floating metrics module when provided. + Keep fallback local controls for other callers. */} + {((screeningStep === 'l1' || screeningStep === 'l2') && + (typeof thresholdProp !== 'number' || !filterModeProp)) ? (
    @@ -319,7 +393,9 @@ export default function PagedList({ onChange={(e: React.ChangeEvent) => { const v = Number(e.target.value) if (!Number.isFinite(v)) return - setThreshold(Math.max(0, Math.min(1, v))) + const nv = Math.max(0, Math.min(1, v)) + setThresholdLocal(nv) + onThresholdChange?.(nv) }} className="w-24 rounded-md border border-gray-200 px-2 py-1 text-sm" /> @@ -329,10 +405,14 @@ export default function PagedList({ ) => { + const v = Number(e.target.value) + if (!Number.isFinite(v)) return + onThresholdChange(Math.max(0, Math.min(1, v))) + }} + className="w-24 rounded-md border border-gray-200 px-2 py-1 text-sm" + /> +
    + ) : null} + +
    + + +
    + + {summary ? ( +
    +
    Validation summary
    +
    +
    +
    All citations
    +
    + {summary.validated_all} / {summary.total_citations} +
    +
    +
    +
    Needs human review
    +
    + {summary.validated_needs_review} / {summary.needs_review_total} +
    +
    +
    +
    + Unvalidated: {summary.unvalidated_all} (all), {summary.unvalidated_needs_review} (needs review) +
    +
    + ) : null} + +
    +
    + Workload summary{stats?.scopeLabel ? ` (${stats.scopeLabel})` : ''} +
    +
    +
    +
    Total
    +
    {stats ? stats.total : '—'}
    +
    +
    +
    Needs validation
    +
    {stats ? stats.needsValidation : '—'}
    +
    +
    +
    Validated
    +
    {stats ? stats.validated : '—'}
    +
    +
    +
    Unvalidated
    +
    {stats ? stats.unvalidated : '—'}
    +
    +
    +
    + + {criterionMetrics?.length ? ( +
    +
    Criteria thresholds & metrics
    +
    + {criterionMetrics.map((c) => ( +
    +
    +
    +
    {c.label}
    +
    +
    Low conf: {c.low_confidence_count}
    +
    Critical disagree: {c.critical_disagreement_count}
    +
    Confident exclude: {c.confident_exclude_count}
    +
    Has run: {c.has_run_count}/{c.total_citations}
    +
    +
    + +
    + + ) => { + const v = Number(e.target.value) + if (!Number.isFinite(v)) return + onCriterionThresholdChange?.( + c.criterion_key, + Math.max(0, Math.min(1, v)), + ) + }} + className="w-20 rounded-md border border-gray-200 px-2 py-1 text-sm" + /> +
    +
    +
    + ))} +
    +
    + ) : null} + +
    +
    Performance (validated set)
    +
    + Coming in Phase 2: agreement/accuracy, recommended thresholds, workload reduction curves. +
    +
    +
    +
    + ) +} From 814a7707ea4255d3b0fe5518c4dd3a152c6d36fa Mon Sep 17 00:00:00 2001 From: bing1100 Date: Tue, 14 Apr 2026 15:08:42 -0400 Subject: [PATCH 3/3] working metrics and critical agent --- backend/api/jobs/router.py | 28 +- backend/api/jobs/run_all_tasks.py | 239 ++++- backend/api/screen/agentic_utils.py | 15 +- backend/api/screen/prompts.py | 6 + backend/api/screen/router.py | 836 +++++++++++++++++- backend/api/services/cit_db_service.py | 116 ++- backend/api/services/sr_db_service.py | 55 +- backend/api/sr/router.py | 72 ++ backend/docker-compose.yml | 7 +- frontend/app/[lang]/can-sr/l1-screen/page.tsx | 54 +- .../app/[lang]/can-sr/l1-screen/view/page.tsx | 80 +- frontend/app/[lang]/can-sr/l2-screen/page.tsx | 57 +- .../app/[lang]/can-sr/l2-screen/view/page.tsx | 120 +-- .../critical-prompt-additions/route.ts | 59 ++ .../api/can-sr/screen/calibration/route.ts | 66 ++ .../api/can-sr/screen/fulltext/run/route.ts | 36 + .../can-sr/screen/title-abstract/run/route.ts | 36 + .../components/can-sr/CitationListPage.tsx | 141 ++- frontend/components/can-sr/PagedList.tsx | 134 ++- .../can-sr/ScreeningMetricsModal.tsx | 326 +++++++ .../can-sr/ScreeningMetricsPanel.tsx | 357 ++++++-- 21 files changed, 2488 insertions(+), 352 deletions(-) create mode 100644 frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts create mode 100644 frontend/app/api/can-sr/screen/calibration/route.ts create mode 100644 frontend/app/api/can-sr/screen/fulltext/run/route.ts create mode 100644 frontend/app/api/can-sr/screen/title-abstract/run/route.ts create mode 100644 frontend/components/can-sr/ScreeningMetricsModal.tsx diff --git a/backend/api/jobs/router.py b/backend/api/jobs/router.py index a49b8f6a..4d9d4517 100644 --- a/backend/api/jobs/router.py +++ b/backend/api/jobs/router.py @@ -11,6 +11,7 @@ from ..core.cit_utils import load_sr_and_check from ..services.sr_db_service import srdb_service from ..services.azure_openai_client import azure_openai_client +from 
..services.cit_db_service import cits_dp_service from .run_all_repo import run_all_repo from .procrastinate_app import cancel_enqueued_jobs_for_run_all, jobs_enabled, worker_concurrency @@ -126,12 +127,34 @@ async def start_run_all( # Authz: ensure user can access SR try: - _sr, _screening = await load_sr_and_check(sr_id, current_user, srdb_service) + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to load SR: {e}") + # Legacy safety: + # If legacy llm_* outputs exist but normalized agent runs are missing, we must + # regenerate results to populate screening_agent_runs. + # We enforce this by auto-enabling force overwrite. + force = bool(payload.force) + try: + table_name = (screening or {}).get("table_name") or "citations" + cp = (sr or {}).get("criteria_parsed") or {} + if step in {"l1", "l2"}: + legacy_needs = await run_in_threadpool( + cits_dp_service.legacy_needs_rerun, + sr_id=sr_id, + table_name=table_name, + criteria_parsed=cp, + step=step, + ) + if legacy_needs: + force = True + except Exception: + # best-effort, do not block + pass + # Ensure our job tables exist await run_in_threadpool(run_all_repo.ensure_tables) @@ -179,9 +202,10 @@ async def start_run_all( created_by=str(current_user.get("id") or ""), model=normalized_model, meta={ - "force": bool(payload.force), + "force": force, "chunk_size": int(payload.chunk_size), "explicit_ids": bool(sanitized_ids is not None), + "legacy_auto_force": (force and (not bool(payload.force))), }, total=len(sanitized_ids) if sanitized_ids is not None else 0, ) diff --git a/backend/api/jobs/run_all_tasks.py b/backend/api/jobs/run_all_tasks.py index c8c43c48..b469cd0a 100644 --- a/backend/api/jobs/run_all_tasks.py +++ b/backend/api/jobs/run_all_tasks.py @@ -14,8 +14,16 @@ from ..services.azure_openai_client import azure_openai_client from ..services.storage import storage_service from 
..extract.router import extract_fulltext_from_storage -from ..screen.router import update_inclusion_decision -from ..screen.prompts import PROMPT_JSON_TEMPLATE, PROMPT_JSON_TEMPLATE_FULLTEXT +from ..screen.router import update_inclusion_decision, _build_guardrails +from ..screen.prompts import ( + PROMPT_JSON_TEMPLATE, + PROMPT_JSON_TEMPLATE_FULLTEXT, + PROMPT_XML_TEMPLATE_TA, + PROMPT_XML_TEMPLATE_TA_CRITICAL, + PROMPT_XML_TEMPLATE_FULLTEXT, + PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL, +) +from ..screen.agentic_utils import build_critical_options, parse_agent_xml, resolve_option from ..extract.prompts import PARAMETER_PROMPT_JSON from ..core.config import settings @@ -159,34 +167,103 @@ async def _run_l1_for_citation( if not azure_openai_client.is_configured(): raise RuntimeError("Azure OpenAI client not configured") - options_listed = "\n".join([f"{j}. {opt}" for j, opt in enumerate(opts)]) - prompt = PROMPT_JSON_TEMPLATE.format(question=q, cit=citation_text, options=options_listed, xtra=xtra) - llm_response = await azure_openai_client.simple_chat( - user_message=prompt, + # --- Agentic (screening + critical) --- + # We persist normalized runs to screening_agent_runs so /screen/metrics can compute SR-wide progress. + # We ALSO persist llm_* JSONB columns for backwards compatibility with the existing UI. 
+ options_listed = "\n".join([str(opt) for opt in opts]) + criterion_key = snake_case(q, max_len=56) + + screening_prompt = PROMPT_XML_TEMPLATE_TA.format( + question=q, + cit=citation_text, + options=options_listed, + xtra=xtra or "", + ) + screening_raw = await azure_openai_client.simple_chat( + user_message=screening_prompt, system_prompt=None, model=model, max_tokens=2000, temperature=0.0, ) + screening_parsed = parse_agent_xml(str(screening_raw)) + screening_answer = resolve_option(screening_parsed.answer, opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": str(screening_raw), + "guardrails": _build_guardrails(screening_parsed, raw_text=str(screening_raw), stage="screening"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) - import json - - parsed = json.loads(llm_response) - selected_value = str(parsed.get("selected", "")).strip() - resolved_selected = f"None of the above - {selected_value}" - for opt in opts: - if opt.lower() in selected_value.lower(): - resolved_selected = opt - break + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = "\n".join([str(o) for o in critical_opts]) + critical_prompt = PROMPT_XML_TEMPLATE_TA_CRITICAL.format( + question=q, + cit=citation_text, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + # run-all does not currently inject SR-scoped critical prompt additions (done in /screen/*/run) + critical_additions="(none)", + ) + critical_raw = await azure_openai_client.simple_chat( + user_message=critical_prompt, + 
system_prompt=None, + model=model, + max_tokens=2000, + temperature=0.0, + ) + critical_parsed = parse_agent_xml(str(critical_raw)) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "title_abstract", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": str(critical_raw), + "guardrails": _build_guardrails(critical_parsed, raw_text=str(critical_raw), stage="critical"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) classification_json = { - "selected": resolved_selected, - "explanation": parsed.get("explanation") or parsed.get("reason") or parsed.get("explain") or "", - "confidence": float(parsed.get("confidence") or 0.0) if str(parsed.get("confidence") or "").strip() else 0.0, - "evidence_sentences": parsed.get("evidence_sentences") or [], - "evidence_tables": parsed.get("evidence_tables") or [], - "evidence_figures": parsed.get("evidence_figures") or [], - "llm_raw": llm_response, + "selected": screening_answer, + "explanation": screening_parsed.rationale or "", + "confidence": screening_parsed.confidence if screening_parsed.confidence is not None else 0.0, + "evidence_sentences": [], + "evidence_tables": [], + "evidence_figures": [], + "llm_raw": str(screening_raw), + "critical": { + "selected": critical_answer, + "explanation": critical_parsed.rationale or "", + "confidence": critical_parsed.confidence, + "llm_raw": str(critical_raw), + }, } await run_in_threadpool(cits_dp_service.update_jsonb_column, citation_id, col, classification_json, table_name) @@ -354,19 +431,21 @@ async def _run_l2_for_citation( if _should_skip_ai_output(existing, force=force): 
continue - options_listed = "\n".join([f"{j}. {opt}" for j, opt in enumerate(opts)]) - prompt = PROMPT_JSON_TEMPLATE_FULLTEXT.format( + # --- Agentic (screening + critical) --- + options_listed = "\n".join([str(opt) for opt in opts]) + criterion_key = snake_case(q, max_len=56) + + screening_prompt = PROMPT_XML_TEMPLATE_FULLTEXT.format( question=q, options=options_listed, - xtra=xtra, + xtra=xtra or "", fulltext=fulltext, tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", figures="\n".join(figures_lines) if figures_lines else "(none)", ) - if images: - llm_response = await azure_openai_client.multimodal_chat( - user_text=prompt, + screening_raw = await azure_openai_client.multimodal_chat( + user_text=screening_prompt, images=images, system_prompt=None, model=model, @@ -374,30 +453,102 @@ async def _run_l2_for_citation( temperature=0.0, ) else: - llm_response = await azure_openai_client.simple_chat( - user_message=prompt, + screening_raw = await azure_openai_client.simple_chat( + user_message=screening_prompt, system_prompt=None, model=model, max_tokens=2000, temperature=0.0, ) + screening_parsed = parse_agent_xml(str(screening_raw)) + screening_answer = resolve_option(screening_parsed.answer, opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "screening", + "answer": screening_answer, + "confidence": screening_parsed.confidence, + "rationale": screening_parsed.rationale, + "raw_response": str(screening_raw), + "guardrails": _build_guardrails(screening_parsed, raw_text=str(screening_raw), stage="screening"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) - parsed = json.loads(llm_response) - selected_value = str(parsed.get("selected", "")).strip() - resolved_selected = f"None of the 
above - {selected_value}" - for opt in opts: - if opt.lower() in selected_value.lower(): - resolved_selected = opt - break + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) + critical_listed = "\n".join([str(o) for o in critical_opts]) + critical_prompt = PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL.format( + question=q, + screening_answer=screening_answer, + options=critical_listed, + xtra=xtra or "", + critical_additions="(none)", + fulltext=fulltext, + tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", + figures="\n".join(figures_lines) if figures_lines else "(none)", + ) + if images: + critical_raw = await azure_openai_client.multimodal_chat( + user_text=critical_prompt, + images=images, + system_prompt=None, + model=model, + max_tokens=2000, + temperature=0.0, + ) + else: + critical_raw = await azure_openai_client.simple_chat( + user_message=critical_prompt, + system_prompt=None, + model=model, + max_tokens=2000, + temperature=0.0, + ) + critical_parsed = parse_agent_xml(str(critical_raw)) + critical_answer = resolve_option(critical_parsed.answer, critical_opts) + + await run_in_threadpool( + cits_dp_service.insert_screening_agent_run, + { + "sr_id": sr.get("_id") or sr.get("id") or sr.get("sr_id") or "", + "table_name": table_name, + "citation_id": int(citation_id), + "pipeline": "fulltext", + "criterion_key": criterion_key, + "stage": "critical", + "answer": critical_answer, + "confidence": critical_parsed.confidence, + "rationale": critical_parsed.rationale, + "raw_response": str(critical_raw), + "guardrails": _build_guardrails(critical_parsed, raw_text=str(critical_raw), stage="critical"), + "model": model, + "prompt_version": "run_all", + "temperature": 0.0, + }, + ) classification_json = { - "selected": resolved_selected, - "explanation": parsed.get("explanation") or parsed.get("reason") or parsed.get("explain") or "", - "confidence": float(parsed.get("confidence") or 0.0) if 
str(parsed.get("confidence") or "").strip() else 0.0, - "evidence_sentences": parsed.get("evidence_sentences") or [], - "evidence_tables": parsed.get("evidence_tables") or [], - "evidence_figures": parsed.get("evidence_figures") or [], - "llm_raw": llm_response, + "selected": screening_answer, + "explanation": screening_parsed.rationale or "", + "confidence": screening_parsed.confidence if screening_parsed.confidence is not None else 0.0, + "evidence_sentences": [], + "evidence_tables": [], + "evidence_figures": [], + "llm_raw": str(screening_raw), + "critical": { + "selected": critical_answer, + "explanation": critical_parsed.rationale or "", + "confidence": critical_parsed.confidence, + "llm_raw": str(critical_raw), + }, } await run_in_threadpool(cits_dp_service.update_jsonb_column, citation_id, col, classification_json, table_name) diff --git a/backend/api/screen/agentic_utils.py b/backend/api/screen/agentic_utils.py index 1250287b..ec4efc36 100644 --- a/backend/api/screen/agentic_utils.py +++ b/backend/api/screen/agentic_utils.py @@ -19,6 +19,8 @@ class ParsedAgentXML: confidence: float rationale: str parse_ok: bool + missing_answer: bool + missing_confidence: bool _TAG_RE_CACHE: dict[str, re.Pattern[str]] = {} @@ -49,8 +51,17 @@ def parse_agent_xml(text: str) -> ParsedAgentXML: conf_val = 0.0 conf_val = max(0.0, min(1.0, conf_val)) - parse_ok = bool(ans_m and conf_m) - return ParsedAgentXML(answer=answer, confidence=conf_val, rationale=rationale, parse_ok=parse_ok) + missing_answer = not bool(ans_m and answer.strip()) + missing_confidence = not bool(conf_m) + parse_ok = (not missing_answer) and (not missing_confidence) + return ParsedAgentXML( + answer=answer, + confidence=conf_val, + rationale=rationale, + parse_ok=parse_ok, + missing_answer=missing_answer, + missing_confidence=missing_confidence, + ) def resolve_option(raw_answer: str, options: list[str]) -> str: diff --git a/backend/api/screen/prompts.py b/backend/api/screen/prompts.py index 
97861767..b7208f87 100644 --- a/backend/api/screen/prompts.py +++ b/backend/api/screen/prompts.py @@ -134,6 +134,9 @@ Additional guidance: {xtra} +CRITICAL PROMPT ADDITIONS (SR-scoped): +{critical_additions} + Output requirement: Return ONLY the following XML tags (no Markdown, no extra prose): ... @@ -199,6 +202,9 @@ Additional guidance: {xtra} +CRITICAL PROMPT ADDITIONS (SR-scoped): +{critical_additions} + Full text (numbered sentences): {fulltext} diff --git a/backend/api/screen/router.py b/backend/api/screen/router.py index 3e7f27da..6da5bf1e 100644 --- a/backend/api/screen/router.py +++ b/backend/api/screen/router.py @@ -1,10 +1,12 @@ from typing import Any, Dict, List, Optional, Tuple +import math import json import re from datetime import datetime import logging from fastapi import APIRouter, Depends, HTTPException, status from fastapi.concurrency import run_in_threadpool +from fastapi.responses import Response from pydantic import BaseModel, Field from ..services.sr_db_service import srdb_service @@ -51,6 +53,7 @@ class ScreeningMetricsCriterion(BaseModel): critical_disagreement_count: int confident_exclude_count: int needs_human_review_count: int + accuracy: Optional[float] = None class ScreeningMetricsSummary(BaseModel): @@ -61,11 +64,77 @@ class ScreeningMetricsSummary(BaseModel): validated_needs_review: int unvalidated_needs_review: int needs_review_total: int + not_screened_yet: int + auto_excluded: int class ScreeningMetricsResponse(BaseModel): sr_id: str steps: Dict[str, Any] + warnings: Optional[List[Dict[str, Any]]] = None + + +class CalibrationPoint(BaseModel): + threshold: float + tp: int + fp: int + fn: int + tn: int + precision: Optional[float] = None + recall: Optional[float] = None + fpr: Optional[float] = None + tpr: Optional[float] = None + workload_reduction: Optional[float] = None + + +class CalibrationHistogramBin(BaseModel): + bin_start: float + bin_end: float + agree: int + disagree: int + + +class 
CalibrationCriterionResponse(BaseModel): + criterion_key: str + label: str + validated_n: int + recommended_threshold: Optional[float] = None + recommended_reason: Optional[str] = None + curve: List[CalibrationPoint] + histogram: List[CalibrationHistogramBin] + + +class CalibrationResponse(BaseModel): + sr_id: str + step: str + criteria: List[CalibrationCriterionResponse] + + +class CalibrationSampleRow(BaseModel): + citation_id: int + criterion_key: str + label: str + validated: bool + confidence: Optional[float] = None + ai_answer: Optional[str] = None + human_selected: Optional[str] = None + agrees: Optional[bool] = None + bucket: Optional[str] = None # tp/fp/fn/tn given a threshold + + +class CalibrationSamplesResponse(BaseModel): + sr_id: str + step: str + threshold: float + rows: List[CalibrationSampleRow] + + +def _csv_escape(v: Any) -> str: + s = "" if v is None else str(v) + # RFC 4180 basic escaping + if any(ch in s for ch in [",", "\n", "\r", '"']): + s = '"' + s.replace('"', '""') + '"' + return s def _normalize_int_list(v: Any) -> List[int]: @@ -213,6 +282,34 @@ def _is_exclude_answer(ans: Any) -> bool: return "(exclude)" in s.lower() +def _parse_selected_from_human_payload(v: Any) -> Optional[str]: + """Extract the human label (selected option) from a human_{criterion_key} cell. + + Stored value is usually JSONB like: + {"selected": "...", "confidence": ..., ...} + but some deployments might store a plain string. 
+ """ + if v is None: + return None + if isinstance(v, str): + s = v.strip() + if not s: + return None + # Try JSON first + try: + obj = json.loads(s) + if isinstance(obj, dict): + sel = obj.get("selected") + return str(sel).strip() if isinstance(sel, str) else None + except Exception: + return s + return None + if isinstance(v, dict): + sel = v.get("selected") + return str(sel).strip() if isinstance(sel, str) else None + return None + + def _criterion_key_from_question(question: str) -> str: # Keep in sync with the frontend derivation in l2-screen view. q = str(question or "") @@ -685,6 +782,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": screening_parsed.confidence, "rationale": screening_parsed.rationale, "raw_response": screening_raw, + "guardrails": _build_guardrails(screening_parsed, raw_text=screening_raw, stage="screening"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -699,6 +797,18 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") # 2) critical + critical_additions = "" + try: + cpa = sr.get("critical_prompt_additions") or {} + if isinstance(cpa, dict): + block = cpa.get("l1") + if isinstance(block, dict): + critical_additions = str(block.get(criterion_key) or "") + except Exception: + critical_additions = "" + if not critical_additions.strip(): + critical_additions = "(none)" + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) critical_listed = "\n".join(critical_opts) critical_prompt = PROMPT_XML_TEMPLATE_TA_CRITICAL.format( @@ -707,6 +817,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: screening_answer=screening_answer, options=critical_listed, xtra=xtra or "", + critical_additions=critical_additions, ) critical_raw, critical_usage, critical_latency 
= await _call_llm(critical_prompt) critical_parsed = parse_agent_xml(critical_raw) @@ -728,6 +839,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": critical_parsed.confidence, "rationale": critical_parsed.rationale, "raw_response": critical_raw, + "guardrails": _build_guardrails(critical_parsed, raw_text=critical_raw, stage="critical"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -1075,6 +1187,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": screening_parsed.confidence, "rationale": screening_parsed.rationale, "raw_response": screening_raw, + "guardrails": _build_guardrails(screening_parsed, raw_text=screening_raw, stage="screening"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -1089,6 +1202,18 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to persist screening run: {e}") # 2) critical + critical_additions = "" + try: + cpa = sr.get("critical_prompt_additions") or {} + if isinstance(cpa, dict): + block = cpa.get("l2") + if isinstance(block, dict): + critical_additions = str(block.get(criterion_key) or "") + except Exception: + critical_additions = "" + if not critical_additions.strip(): + critical_additions = "(none)" + critical_opts = build_critical_options(all_options=opts, screening_answer=screening_answer) critical_listed = "\n".join(critical_opts) critical_prompt = PROMPT_XML_TEMPLATE_FULLTEXT_CRITICAL.format( @@ -1096,6 +1221,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: screening_answer=screening_answer, options=critical_listed, xtra=xtra or "", + critical_additions=critical_additions, fulltext=fulltext, tables="\n".join(tables_md_lines) if tables_md_lines else "(none)", figures="\n".join(figures_lines) if figures_lines else 
"(none)", @@ -1119,6 +1245,7 @@ async def _call_llm(prompt: str) -> Tuple[str, Dict[str, Any], int]: "confidence": critical_parsed.confidence, "rationale": critical_parsed.rationale, "raw_response": critical_raw, + "guardrails": _build_guardrails(critical_parsed, raw_text=critical_raw, stage="critical"), "model": payload.model, "prompt_version": payload.prompt_version, "temperature": payload.temperature, @@ -1231,9 +1358,11 @@ async def get_screening_metrics( - Each criterion uses its own threshold (from SR.screening_thresholds[step][criterion_key]). - Needs-human-review logic: - 1) If ANY criterion is a confident exclude => no human review needed for the citation. - 2) Else if ANY criterion has critical disagreement => needs review. - 3) Else if ANY criterion is low confidence (below its threshold) => needs review. + 0) Not screened yet: if no agent runs exist for this step/pipeline. + 1) Auto-excluded if ANY criterion is a confident exclude AND critical agrees: + screening answer contains '(exclude)' AND screening_conf >= threshold AND critical answer == 'None of the above'. + 2) Else needs review if ANY criterion has critical disagreement (critical answer != 'None of the above'). + 3) Else needs review if ANY criterion is low confidence (below its threshold). """ step_norm = str(step or "l1").lower().strip() @@ -1290,6 +1419,40 @@ async def get_screening_metrics( needed_cols.extend([validations_col, legacy_validated_by]) + # Phase 2: canonical human labels (per criterion) live in human_{criterion_key} JSONB. + # We only need these to compute validated-set agreement metrics. + human_cols: Dict[str, str] = {} + for c in criteria: + ck = c["criterion_key"] + col = f"human_{ck}" if ck else "human_col" + human_cols[ck] = col + needed_cols.append(col) + + warnings: List[Dict[str, Any]] = [] + # Legacy safety: do NOT attempt to fabricate agent runs. 
+ # If legacy llm_* outputs exist but normalized runs are missing, we warn the UI + # so the user can run run-all (which will force overwrite and create real runs). + try: + legacy_needs = await run_in_threadpool( + cits_dp_service.legacy_needs_rerun, + sr_id=sr_id, + table_name=table_name, + criteria_parsed=cp, + step=step_norm, + ) + if legacy_needs: + warnings.append( + { + "code": "LEGACY_DATA_NEEDS_RUN_ALL", + "severity": "warning", + "message": "Legacy screening results detected (llm_* columns) but agentic runs are missing. Please run Run-all to regenerate results.", + "sr_id": sr_id, + "step": step_norm, + } + ) + except Exception: + pass + # We'll compute per-citation needs-review based on agent runs only. # Fetch latest runs for all citations (bulk query using service helper) pipeline_norm = "title_abstract" if step_norm == "l1" else "fulltext" @@ -1352,13 +1515,24 @@ def _is_validated(row: Dict[str, Any]) -> bool: "low_confidence_count": 0, "critical_disagreement_count": 0, "confident_exclude_count": 0, + # Count of citations where THIS criterion triggered needs-review. "needs_human_review_count": 0, + # Validated-set agreement counts (AI screening vs canonical human label). + "human_agree_count": 0, + "human_total_count": 0, + + # Fallback proxy when human labels are not available: + # count how often critical agrees with screening. 
+ "crit_agree_count": 0, + "crit_total_count": 0, } total_citations = 0 validated_all = 0 needs_review_total = 0 validated_needs_review = 0 + not_screened_yet = 0 + auto_excluded = 0 # Iterate citations and compute needs-review + per-criterion counts for row in rows or []: @@ -1373,10 +1547,16 @@ def _is_validated(row: Dict[str, Any]) -> bool: per_crit = runs_by_cit.get(cid, {}) + # Bucket 1: Not screened yet (no runs at all) + if not per_crit: + not_screened_yet += 1 + continue + # Evaluate confident exclude override has_confident_exclude = False has_critical_disagreement = False has_low_confidence = False + has_guardrail_issue = False for c in criteria: ck = c["criterion_key"] @@ -1386,6 +1566,8 @@ def _is_validated(row: Dict[str, Any]) -> bool: continue a["total_citations"] += 1 + triggered_this_criterion = False + rpair = per_crit.get(ck) or {} scr = rpair.get("screening") crit = rpair.get("critical") @@ -1399,26 +1581,83 @@ def _is_validated(row: Dict[str, Any]) -> bool: conf_f = None ans = scr.get("answer") + # If citation is validated and a canonical human label exists, compute agreement. + if validated: + hcol = human_cols.get(ck) or f"human_{ck}" + human_sel = _parse_selected_from_human_payload(row.get(hcol)) + if human_sel is not None: + a["human_total_count"] += 1 + # Agreement definition: exact string match after stripping. + if str(human_sel).strip() == str(ans or "").strip(): + a["human_agree_count"] += 1 + if conf_f is not None and conf_f < thr: a["low_confidence_count"] += 1 has_low_confidence = True + # This criterion triggers review for this citation. + triggered_this_criterion = True - if conf_f is not None and conf_f >= thr and _is_exclude_answer(ans): + # Guardrails: missing/failed parse should be treated as needs review. 
+ try: + g = scr.get("guardrails") + if isinstance(g, str): + g = json.loads(g) + if isinstance(g, dict): + if g.get("parse_ok") is False or g.get("missing_answer") or g.get("missing_confidence"): + has_guardrail_issue = True + triggered_this_criterion = True + except Exception: + # If guardrails column exists but is unparsable, treat as issue. + if scr.get("guardrails") is not None: + has_guardrail_issue = True + triggered_this_criterion = True + + # Confident exclude requires critical agreement + crit_has = bool(crit) and str(crit.get("answer") or "").strip() != "" + crit_agrees = crit_has and (not _is_disagreeing_critical_answer(crit.get("answer"))) + if crit_has: + a["crit_total_count"] += 1 + if crit_agrees: + a["crit_agree_count"] += 1 + if conf_f is not None and conf_f >= thr and _is_exclude_answer(ans) and crit_agrees: a["confident_exclude_count"] += 1 has_confident_exclude = True - if crit and _is_disagreeing_critical_answer(crit.get("answer")): + # Treat missing/empty critical as disagreement/parse issue (conservative). 
+ if not crit or str(crit.get("answer") or "").strip() == "": a["critical_disagreement_count"] += 1 has_critical_disagreement = True + triggered_this_criterion = True + elif _is_disagreeing_critical_answer(crit.get("answer")): + a["critical_disagreement_count"] += 1 + has_critical_disagreement = True + triggered_this_criterion = True + + # Guardrails on critical stage + try: + if crit: + g2 = crit.get("guardrails") + if isinstance(g2, str): + g2 = json.loads(g2) + if isinstance(g2, dict): + if g2.get("parse_ok") is False or g2.get("missing_answer") or g2.get("missing_confidence"): + has_guardrail_issue = True + triggered_this_criterion = True + except Exception: + if crit and crit.get("guardrails") is not None: + has_guardrail_issue = True + triggered_this_criterion = True - needs_review = (not has_confident_exclude) and (has_critical_disagreement or has_low_confidence) + if triggered_this_criterion: + a["needs_human_review_count"] += 1 + + if has_confident_exclude: + auto_excluded += 1 + needs_review = (not has_confident_exclude) and (has_critical_disagreement or has_low_confidence or has_guardrail_issue) if needs_review: needs_review_total += 1 if validated: validated_needs_review += 1 - # increment per-criterion needs-review count for all criteria - for c in criteria: - agg[c["criterion_key"]]["needs_human_review_count"] += 1 unvalidated_all = max(0, total_citations - validated_all) unvalidated_needs_review = max(0, needs_review_total - validated_needs_review) @@ -1428,11 +1667,25 @@ def _is_validated(row: Dict[str, Any]) -> bool: for c in criteria: ck = c["criterion_key"] a = agg.get(ck) or {} + # Prefer human-vs-AI agreement on the validated set when available. + # Fallback to critical-agreement proxy when no human labels exist yet. 
+ try: + h_total = int(a.get("human_total_count") or 0) + h_agree = int(a.get("human_agree_count") or 0) + if h_total > 0: + accuracy = (h_agree / h_total) + else: + crit_total = int(a.get("crit_total_count") or 0) + crit_agree = int(a.get("crit_agree_count") or 0) + accuracy = (crit_agree / crit_total) if crit_total > 0 else None + except Exception: + accuracy = None crit_out.append( { "criterion_key": ck, "label": c["label"], "threshold": float(c["threshold"]), + "accuracy": accuracy, **a, } ) @@ -1449,12 +1702,577 @@ def _is_validated(row: Dict[str, Any]) -> bool: "needs_review_total": needs_review_total, "validated_needs_review": validated_needs_review, "unvalidated_needs_review": unvalidated_needs_review, + "not_screened_yet": not_screened_yet, + "auto_excluded": auto_excluded, }, "criteria": crit_out, } }, + warnings=warnings or None, ) + +def _safe_div(n: float, d: float) -> Optional[float]: + try: + if d == 0: + return None + return n / d + except Exception: + return None + + +def _clip01(v: Any, default: float = 0.0) -> float: + try: + x = float(v) + if math.isnan(x) or math.isinf(x): + return float(default) + return max(0.0, min(1.0, x)) + except Exception: + return float(default) + + +def _parse_confidence(v: Any) -> Optional[float]: + if v is None: + return None + try: + x = float(v) + if math.isnan(x) or math.isinf(x): + return None + return max(0.0, min(1.0, x)) + except Exception: + return None + + +def _build_guardrails(parsed: Any, *, raw_text: str, stage: str) -> Dict[str, Any]: + """Build a compact guardrails payload for persisting with screening_agent_runs.""" + raw = str(raw_text or "") + out: Dict[str, Any] = { + "schema_version": "v1", + "stage": str(stage or ""), + "parse_ok": bool(getattr(parsed, "parse_ok", False)), + "missing_answer": bool(getattr(parsed, "missing_answer", False)), + "missing_confidence": bool(getattr(parsed, "missing_confidence", False)), + "missing_rationale": not bool(str(getattr(parsed, "rationale", "") or 
"").strip()), + "raw_len": len(raw), + "has_answer_tag": " bool: + v = row.get(validations_col) + if v: + try: + parsed = v + if isinstance(v, str): + parsed = json.loads(v) + if isinstance(parsed, list) and len(parsed) > 0: + return True + except Exception: + pass + return bool(row.get(legacy_validated_by)) + + # Build validated examples per criterion: (confidence, agree_bool) + examples: Dict[str, List[Tuple[float, bool]]] = {c["criterion_key"]: [] for c in criteria} + for row in rows or []: + try: + cid = int(row.get("id")) + except Exception: + continue + if not _is_validated_row(row): + continue + scr_map = screening_by_cit.get(cid) or {} + for c in criteria: + ck = c["criterion_key"] + scr = scr_map.get(ck) + if not scr: + continue + conf = _parse_confidence(scr.get("confidence")) + if conf is None: + continue + ai_ans = str(scr.get("answer") or "").strip() + human_sel = _parse_selected_from_human_payload(row.get(human_cols.get(ck) or f"human_{ck}")) + if human_sel is None: + continue + agree = str(human_sel).strip() == ai_ans + examples[ck].append((conf, agree)) + + # Compute curve + histogram per criterion + out_criteria: List[CalibrationCriterionResponse] = [] + for c in criteria: + ck = c["criterion_key"] + label = c["label"] + ex = examples.get(ck) or [] + validated_n = len(ex) + + # Histogram bins + hist: List[CalibrationHistogramBin] = [] + if validated_n > 0: + for b in range(bins_n): + start = b / bins_n + end = (b + 1) / bins_n + agree_ct = 0 + disagree_ct = 0 + for conf, agree in ex: + # include 1.0 in last bin + in_bin = (conf >= start and conf < end) or (b == bins_n - 1 and conf == 1.0) + if not in_bin: + continue + if agree: + agree_ct += 1 + else: + disagree_ct += 1 + hist.append( + CalibrationHistogramBin( + bin_start=round(start, 6), + bin_end=round(end, 6), + agree=agree_ct, + disagree=disagree_ct, + ) + ) + else: + for b in range(bins_n): + start = b / bins_n + end = (b + 1) / bins_n + 
hist.append(CalibrationHistogramBin(bin_start=round(start, 6), bin_end=round(end, 6), agree=0, disagree=0)) + + curve: List[CalibrationPoint] = [] + best_thr: Optional[float] = None + best_score: Optional[float] = None + best_recall: Optional[float] = None + + for thr in thr_list: + tp = fp = fn = tn = 0 + # Review queue size for this criterion at this threshold = count(conf < thr) among validated examples. + # Workload reduction proxy: 1 - queue/total. + queue = 0 + + for conf, agree in ex: + pred_pos = conf >= thr + if conf < thr: + queue += 1 + + if pred_pos and agree: + tp += 1 + elif pred_pos and not agree: + fp += 1 + elif (not pred_pos) and agree: + fn += 1 + else: + tn += 1 + + precision = _safe_div(tp, tp + fp) + recall = _safe_div(tp, tp + fn) + fpr = _safe_div(fp, fp + tn) + tpr = recall + workload_reduction = None + if validated_n > 0: + workload_reduction = 1.0 - (queue / validated_n) + + curve.append( + CalibrationPoint( + threshold=float(thr), + tp=tp, + fp=fp, + fn=fn, + tn=tn, + precision=precision, + recall=recall, + fpr=fpr, + tpr=tpr, + workload_reduction=workload_reduction, + ) + ) + + # Choose recommended threshold by maximizing Youden's J; tie-break by higher recall. 
+ if recall is None or fpr is None: + continue + score = recall - fpr + if best_score is None or score > best_score + 1e-9: + best_score = score + best_thr = thr + best_recall = recall + elif best_score is not None and abs(score - best_score) <= 1e-9: + # tie-break: higher recall + if best_recall is None or recall > best_recall + 1e-9: + best_thr = thr + best_recall = recall + + reason = None + if best_thr is not None: + reason = "max_youden_j (tpr-fpr), tie-break: max recall" + + out_criteria.append( + CalibrationCriterionResponse( + criterion_key=ck, + label=label, + validated_n=validated_n, + recommended_threshold=float(best_thr) if best_thr is not None else None, + recommended_reason=reason, + curve=curve, + histogram=hist, + ) + ) + + return CalibrationResponse(sr_id=sr_id, step=step_norm, criteria=out_criteria) + + +@router.get("/calibration/samples") +async def get_calibration_samples( + sr_id: str, + step: str = "l1", + threshold: float = 0.9, + criterion_key: Optional[str] = None, + limit: int = 200, + format: str = "json", + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Return calibration sample rows (validated citations only) for auditing. + + This endpoint is meant for exporting / debugging calibration behavior. 
+ + Definitions: + - human label: `human_{criterion_key}.selected` + - AI label: latest `stage=screening` answer for this pipeline + - agrees: AI answer == human selected + - bucket at given threshold (positive == agreement, predicted positive == confidence >= threshold): + tp: pred_pos and agrees + fp: pred_pos and not agrees + fn: not pred_pos and agrees + tn: not pred_pos and not agrees + + Query params: + - sr_id: SR id + - step: l1|l2 + - threshold: float [0,1] + - criterion_key: optional filter for a single criterion + - limit: max rows returned (default 200, max 2000) + - format: json|csv + """ + + step_norm = str(step or "l1").lower().strip() + if step_norm not in {"l1", "l2"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="step must be l1 or l2") + + thr = _clip01(threshold, default=0.9) + lim = max(1, min(2000, int(limit or 200))) + fmt = str(format or "json").lower().strip() + if fmt not in {"json", "csv"}: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="format must be json or csv") + + try: + sr, screening = await load_sr_and_check(sr_id, current_user, srdb_service) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load SR: {e}") + + table_name = (screening or {}).get("table_name") or "citations" + + # Criteria questions for step + cp = sr.get("criteria_parsed") or {} + crit_block = cp.get(step_norm) if isinstance(cp, dict) else None + questions = (crit_block or {}).get("questions") if isinstance(crit_block, dict) else [] + questions = questions if isinstance(questions, list) else [] + + criteria: List[Dict[str, str]] = [] + for q in questions: + if not isinstance(q, str) or not q.strip(): + continue + ck = _criterion_key_from_question(q) + if criterion_key and str(criterion_key).strip() != ck: + continue + criteria.append({"criterion_key": ck, "label": q}) + + if criterion_key and not criteria: + raise 
HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Unknown criterion_key for this step") + + # Determine SR scope ids for step + filter_step = "" + if step_norm == "l2": + filter_step = "l1" + try: + ids = await run_in_threadpool(cits_dp_service.list_citation_ids, filter_step, table_name) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to list citations: {e}") + + validations_col = f"{step_norm}_validations" + legacy_validated_by = f"{step_norm}_validated_by" + + # Build columns for row fetch + needed_cols: List[str] = ["id", validations_col, legacy_validated_by] + human_cols: Dict[str, str] = {} + for c in criteria: + ck = c["criterion_key"] + hcol = f"human_{ck}" if ck else "human_col" + human_cols[ck] = hcol + needed_cols.append(hcol) + + # Load latest screening runs for all ids + pipeline_norm = "title_abstract" if step_norm == "l1" else "fulltext" + try: + runs = await run_in_threadpool( + cits_dp_service.list_latest_agent_runs, + sr_id=sr_id, + table_name=table_name, + citation_ids=ids, + pipeline=pipeline_norm, + ) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load agent runs: {e}") + + # Group screening runs by citation then criterion + screening_by_cit: Dict[int, Dict[str, Dict[str, Any]]] = {} + for r in runs or []: + try: + cid = int(r.get("citation_id")) + except Exception: + continue + if str(r.get("stage") or "") != "screening": + continue + ck = str(r.get("criterion_key") or "") + if not ck: + continue + if criterion_key and ck != str(criterion_key).strip(): + continue + if cid not in screening_by_cit: + screening_by_cit[cid] = {} + screening_by_cit[cid][ck] = r + + # Load citation rows + try: + rows = await run_in_threadpool(cits_dp_service.get_citations_by_ids, ids, table_name, needed_cols) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed 
to load citation rows: {e}") + + def _is_validated_row(row: Dict[str, Any]) -> bool: + v = row.get(validations_col) + if v: + try: + parsed = v + if isinstance(v, str): + parsed = json.loads(v) + if isinstance(parsed, list) and len(parsed) > 0: + return True + except Exception: + pass + return bool(row.get(legacy_validated_by)) + + out_rows: List[CalibrationSampleRow] = [] + for row in rows or []: + if len(out_rows) >= lim: + break + try: + cid = int(row.get("id")) + except Exception: + continue + if not _is_validated_row(row): + continue + scr_map = screening_by_cit.get(cid) or {} + for c in criteria: + if len(out_rows) >= lim: + break + ck = c["criterion_key"] + scr = scr_map.get(ck) + if not scr: + continue + conf = _parse_confidence(scr.get("confidence")) + ai_ans = str(scr.get("answer") or "").strip() if scr.get("answer") is not None else None + human_sel = _parse_selected_from_human_payload(row.get(human_cols.get(ck) or f"human_{ck}")) + if human_sel is None: + continue + agrees = (str(human_sel).strip() == str(ai_ans or "").strip()) + pred_pos = (conf is not None) and (conf >= thr) + if pred_pos and agrees: + bucket = "tp" + elif pred_pos and (not agrees): + bucket = "fp" + elif (not pred_pos) and agrees: + bucket = "fn" + else: + bucket = "tn" + + out_rows.append( + CalibrationSampleRow( + citation_id=cid, + criterion_key=ck, + label=c["label"], + validated=True, + confidence=conf, + ai_answer=ai_ans, + human_selected=human_sel, + agrees=agrees, + bucket=bucket, + ) + ) + + if fmt == "json": + return CalibrationSamplesResponse(sr_id=sr_id, step=step_norm, threshold=thr, rows=out_rows) + + # CSV format + header = [ + "citation_id", + "criterion_key", + "label", + "confidence", + "ai_answer", + "human_selected", + "agrees", + "bucket", + ] + lines = [",".join(header)] + for r in out_rows: + lines.append( + ",".join( + [ + _csv_escape(r.citation_id), + _csv_escape(r.criterion_key), + _csv_escape(r.label), + _csv_escape(r.confidence), + 
_csv_escape(r.ai_answer), + _csv_escape(r.human_selected), + _csv_escape(r.agrees), + _csv_escape(r.bucket), + ] + ) + ) + csv_bytes = ("\n".join(lines) + "\n").encode("utf-8") + return Response(content=csv_bytes, media_type="text/csv") + async def update_inclusion_decision( sr: Dict[str, Any], citation_id: int, diff --git a/backend/api/services/cit_db_service.py b/backend/api/services/cit_db_service.py index 1e3f28e4..d928285f 100644 --- a/backend/api/services/cit_db_service.py +++ b/backend/api/services/cit_db_service.py @@ -254,11 +254,21 @@ def ensure_screening_agent_runs_table(self) -> None: input_tokens INT, output_tokens INT, cost_usd DOUBLE PRECISION, + guardrails JSONB, created_at TIMESTAMPTZ DEFAULT now() ) """ ) + # Runtime schema evolution for existing deployments + try: + cur.execute("ALTER TABLE screening_agent_runs ADD COLUMN IF NOT EXISTS guardrails JSONB") + except Exception: + try: + cur.execute("ALTER TABLE screening_agent_runs ADD COLUMN guardrails JSONB") + except Exception: + pass + # A couple of pragmatic indexes for common lookups. 
cur.execute( """ @@ -327,12 +337,12 @@ def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: id, sr_id, table_name, citation_id, pipeline, criterion_key, stage, answer, confidence, rationale, raw_response, model, prompt_version, temperature, top_p, seed, - latency_ms, input_tokens, output_tokens, cost_usd, created_at + latency_ms, input_tokens, output_tokens, cost_usd, guardrails, created_at ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, - %s, %s, %s, %s, %s + %s, %s, %s, %s, %s, %s ) """, ( @@ -356,6 +366,7 @@ def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: run.get("input_tokens"), run.get("output_tokens"), run.get("cost_usd"), + json.dumps(run.get("guardrails")) if run.get("guardrails") is not None else None, run.get("created_at") or datetime.utcnow().isoformat() + "Z", ), ) @@ -368,6 +379,106 @@ def insert_screening_agent_run(self, run: Dict[str, Any]) -> str: if conn: pass + def agent_runs_exist(self, *, sr_id: str, table_name: str, pipeline: str) -> bool: + """Return True if we have any normalized agent runs for this SR+table+pipeline.""" + + self._require_psycopg2() + self.ensure_screening_agent_runs_table() + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute( + """ + SELECT 1 + FROM screening_agent_runs + WHERE sr_id=%s AND table_name=%s AND pipeline=%s + LIMIT 1 + """, + (str(sr_id), str(table_name), str(pipeline)), + ) + return cur.fetchone() is not None + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def legacy_llm_outputs_exist_for_step( + self, + *, + table_name: str, + criteria_parsed: Dict[str, Any], + step: str, + ) -> bool: + """Return True if any legacy llm_* JSONB columns for this step contain data.""" + + table_name = _validate_ident(table_name, kind="table_name") + self._require_psycopg2() + if not self.table_exists(table_name): + return False + + step_norm = str(step or "").lower().strip() + if step_norm not in {"l1", 
"l2"}: + return False + + qs = (((criteria_parsed or {}).get(step_norm) or {}).get("questions") or []) + if not isinstance(qs, list) or not qs: + return False + + # Determine which llm_* columns exist + cols_meta = self.get_table_columns(table_name) + existing_cols = {c.get("column_name") for c in cols_meta if c and c.get("column_name")} + llm_cols = [] + for q in qs: + if not isinstance(q, str) or not q.strip(): + continue + col = snake_case_column(q) + if col in existing_cols: + llm_cols.append(col) + if not llm_cols: + return False + + # Any non-null legacy output? + or_sql = " OR ".join([f'"{c}" IS NOT NULL' for c in llm_cols]) + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + cur.execute(f'SELECT 1 FROM "{table_name}" WHERE {or_sql} LIMIT 1') + return cur.fetchone() is not None + except Exception: + _safe_rollback(conn) + raise + finally: + if conn: + pass + + def legacy_needs_rerun( + self, + *, + sr_id: str, + table_name: str, + criteria_parsed: Dict[str, Any], + step: str, + ) -> bool: + """Return True when legacy llm_* outputs exist but normalized runs do not. 
+ + This is the signal to: + - warn the user that they must run run-all + - auto-enable force overwrite for run-all + """ + + step_norm = str(step or "").lower().strip() + if step_norm not in {"l1", "l2"}: + return False + pipeline = "title_abstract" if step_norm == "l1" else "fulltext" + legacy = self.legacy_llm_outputs_exist_for_step(table_name=table_name, criteria_parsed=criteria_parsed, step=step_norm) + if not legacy: + return False + return not self.agent_runs_exist(sr_id=sr_id, table_name=table_name, pipeline=pipeline) + def list_latest_agent_runs( self, *, @@ -416,6 +527,7 @@ def list_latest_agent_runs( answer, confidence, rationale, + guardrails, model, prompt_version, temperature, diff --git a/backend/api/services/sr_db_service.py b/backend/api/services/sr_db_service.py index 013a4c32..b1e3a2d5 100644 --- a/backend/api/services/sr_db_service.py +++ b/backend/api/services/sr_db_service.py @@ -51,6 +51,7 @@ def ensure_table_exists(self) -> None: criteria_yaml TEXT, criteria_parsed JSONB, screening_thresholds JSONB, + critical_prompt_additions JSONB, screening_db JSONB, created_at TIMESTAMP WITH TIME ZONE DEFAULT now(), updated_at TIMESTAMP WITH TIME ZONE DEFAULT now() @@ -72,6 +73,18 @@ def ensure_table_exists(self) -> None: ) except Exception: pass + + try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN IF NOT EXISTS critical_prompt_additions JSONB" + ) + except Exception: + try: + cur.execute( + "ALTER TABLE systematic_reviews ADD COLUMN critical_prompt_additions JSONB" + ) + except Exception: + pass conn.commit() logger.info("Ensured systematic_reviews table exists") @@ -202,7 +215,7 @@ def create_systematic_review( insert_sql = """ INSERT INTO systematic_reviews (id, name, description, owner_id, owner_email, users, visible, - criteria, criteria_yaml, criteria_parsed, screening_thresholds, created_at, updated_at) + criteria, criteria_yaml, criteria_parsed, screening_thresholds, critical_prompt_additions, created_at, updated_at) VALUES (%s, 
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ @@ -218,6 +231,7 @@ def create_systematic_review( criteria_str, json.dumps(criteria_parsed), json.dumps({"l1": {}, "l2": {}, "parameters": {}}), + json.dumps({"l1": {}, "l2": {}}), now, now )) @@ -239,6 +253,8 @@ def create_systematic_review( sr_doc['criteria_parsed'] = json.loads(sr_doc['criteria_parsed']) if sr_doc.get('screening_thresholds') and isinstance(sr_doc['screening_thresholds'], str): sr_doc['screening_thresholds'] = json.loads(sr_doc['screening_thresholds']) + if sr_doc.get('critical_prompt_additions') and isinstance(sr_doc['critical_prompt_additions'], str): + sr_doc['critical_prompt_additions'] = json.loads(sr_doc['critical_prompt_additions']) # Convert datetime objects to ISO strings from datetime import datetime as dt if sr_doc.get('created_at') and isinstance(sr_doc['created_at'], dt): @@ -522,6 +538,8 @@ def list_systematic_reviews_for_user(self, user_email: str) -> List[Dict[str, An doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) + if doc.get('critical_prompt_additions') and isinstance(doc['critical_prompt_additions'], str): + doc['critical_prompt_additions'] = json.loads(doc['critical_prompt_additions']) # Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and isinstance(doc['created_at'], dt): @@ -582,6 +600,8 @@ def get_systematic_review(self, sr_id: str, ignore_visibility: bool = False) -> doc['criteria_parsed'] = json.loads(doc['criteria_parsed']) if doc.get('screening_thresholds') and isinstance(doc['screening_thresholds'], str): doc['screening_thresholds'] = json.loads(doc['screening_thresholds']) + if doc.get('critical_prompt_additions') and isinstance(doc['critical_prompt_additions'], str): + doc['critical_prompt_additions'] = json.loads(doc['critical_prompt_additions']) # 
Convert datetime objects to ISO strings from datetime import datetime as dt if doc.get('created_at') and isinstance(doc['created_at'], dt): @@ -766,6 +786,39 @@ def update_screening_thresholds(self, sr_id: str, screening_thresholds: Dict[str if conn: pass + def update_critical_prompt_additions(self, sr_id: str, critical_prompt_additions: Dict[str, Any]) -> None: + """Persist SR-scoped critical prompt additions. + + Shape: + {"l1": {"criterion_key": "..."}, "l2": {"criterion_key": "..."}} + """ + + conn = None + try: + conn = postgres_server.conn + cur = conn.cursor() + + updated_at = datetime.utcnow().isoformat() + cur.execute( + "UPDATE systematic_reviews SET critical_prompt_additions = %s, updated_at = %s WHERE id = %s", + (json.dumps(critical_prompt_additions), updated_at, sr_id), + ) + conn.commit() + except Exception as e: + try: + if conn: + conn.rollback() + except Exception: + pass + logger.exception(f"Failed to update critical prompt additions: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to update critical prompt additions: {e}", + ) + finally: + if conn: + pass + def clear_screening_db_info(self, sr_id: str) -> None: """ Remove the screening_db field from the SR document. diff --git a/backend/api/sr/router.py b/backend/api/sr/router.py index c52416a2..05851733 100644 --- a/backend/api/sr/router.py +++ b/backend/api/sr/router.py @@ -59,6 +59,14 @@ class SystematicReviewRead(BaseModel): # } screening_thresholds: Optional[Dict[str, Any]] = None + # SR-scoped per-step per-criterion additions injected into CRITICAL prompts. 
+ # Shape: + # { + # "l1": {"criterion_key": "..."}, + # "l2": {"criterion_key": "..."} + # } + critical_prompt_additions: Optional[Dict[str, Any]] = None + @@ -144,6 +152,7 @@ async def create_systematic_review( criteria_yaml=sr_doc.get("criteria_yaml"), criteria_parsed=sr_doc.get("criteria_parsed"), screening_thresholds=sr_doc.get("screening_thresholds"), + critical_prompt_additions=sr_doc.get("critical_prompt_additions"), ) @@ -408,6 +417,10 @@ class ThresholdsUpdateRequest(BaseModel): screening_thresholds: Dict[str, Any] = {} +class CriticalPromptAdditionsUpdateRequest(BaseModel): + critical_prompt_additions: Dict[str, Any] = {} + + @router.get("/{sr_id}/screening_thresholds") async def get_screening_thresholds(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): """Get SR-scoped per-step per-criterion thresholds.""" @@ -470,6 +483,65 @@ async def update_screening_thresholds( return {"status": "success", "sr_id": sr_id, "screening_thresholds": normalized} +@router.get("/{sr_id}/critical_prompt_additions") +async def get_critical_prompt_additions(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): + """Get SR-scoped per-step per-criterion critical prompt additions.""" + + try: + doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + cpa = doc.get("critical_prompt_additions") or {} + if not isinstance(cpa, dict): + cpa = {} + return {"sr_id": sr_id, "critical_prompt_additions": cpa} + + +@router.put("/{sr_id}/critical_prompt_additions") +async def update_critical_prompt_additions( + sr_id: str, + payload: CriticalPromptAdditionsUpdateRequest, + current_user: Dict[str, Any] = Depends(get_current_active_user), +): + """Update SR-scoped per-step per-criterion critical prompt additions. 
+ + Any SR member may update these (mirrors thresholds permissions). + """ + + try: + _doc, _screening = await load_sr_and_check(sr_id, current_user, srdb_service, require_screening=False) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to load systematic review: {e}") + + cpa = payload.critical_prompt_additions or {} + if not isinstance(cpa, dict): + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="critical_prompt_additions must be an object") + + normalized: Dict[str, Any] = {} + for step in ("l1", "l2"): + block = cpa.get(step) + if isinstance(block, dict): + out: Dict[str, str] = {} + for k, v in block.items(): + if not isinstance(k, str) or not k.strip(): + continue + if v is None: + out[k] = "" + else: + out[k] = str(v) + normalized[step] = out + else: + normalized[step] = {} + + await run_in_threadpool(srdb_service.update_critical_prompt_additions, sr_id, normalized) + return {"status": "success", "sr_id": sr_id, "critical_prompt_additions": normalized} + + @router.delete("/{sr_id}") async def delete_systematic_review(sr_id: str, current_user: Dict[str, Any] = Depends(get_current_active_user)): """ diff --git a/backend/docker-compose.yml b/backend/docker-compose.yml index 1a8567a7..9479be17 100644 --- a/backend/docker-compose.yml +++ b/backend/docker-compose.yml @@ -60,7 +60,9 @@ services: ports: - "5432:5432" volumes: - - ./volumes/postgres:/var/lib/postgresql + # Use a named volume so we can reset with `docker compose down -v` + # without filesystem permission issues from bind-mounts. 
+ - backend_pgdata:/var/lib/postgresql/data healthcheck: test: ["CMD-SHELL", "pg_isready -U admin -d postgres -h localhost"] interval: 30s @@ -71,3 +73,6 @@ services: networks: default: driver: bridge + +volumes: + backend_pgdata: diff --git a/frontend/app/[lang]/can-sr/l1-screen/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/page.tsx index e5f5d7de..1c129b8c 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/page.tsx @@ -14,49 +14,39 @@ const buildCitationAiCalls: BuildCitationAiCalls = ({ criteria, getAuthHeaders, }) => { - const calls: AiCall[] = [] - - for (let i = 0; i < (criteria?.questions || []).length; i++) { - const question = criteria.questions[i] - const options = criteria.possible_answers?.[i] || [] - - calls.push({ - key: `l1_classify_${i}`, - label: `L1: ${question}`, + // Phase 2 wiring: L1 run-all uses the agentic orchestrator endpoint. + // We keep the existing “Run all AI” modal behavior, but instead of running per-question + // classify calls, we run a single orchestrated run per citation. 
+ return [ + { + key: `l1_agentic_run`, + label: `L1 agentic (screening + critical)`, run: async () => { const headers = { ...getAuthHeaders(), 'Content-Type': 'application/json', } - const res = await fetch( - `/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent( - srId, - )}&citation_id=${encodeURIComponent(String(citationId))}`, - { - method: 'POST', - headers, - body: JSON.stringify({ - question, - options, - include_columns: ['title', 'abstract'], - screening_step: 'l1', - model, - temperature: 0.0, - max_tokens: 2000, - }), - }, - ) + const res = await fetch('/api/can-sr/screen/title-abstract/run', { + method: 'POST', + headers, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model, + temperature: 0.0, + max_tokens: 1200, + prompt_version: 'v1', + }), + }) if (!res.ok) { const text = await res.text().catch(() => '') - throw new Error(text || `L1 classify failed (${res.status})`) + throw new Error(text || `L1 agentic run failed (${res.status})`) } }, - }) - } - - return calls + }, + ] } export default function L1ScreenPage() { diff --git a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx index 57891cb5..34111fad 100644 --- a/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l1-screen/view/page.tsx @@ -450,57 +450,47 @@ export default function CanSrL1ScreenPage() { // Handler: call backend classify endpoint for a single question async function classifyQuestion(questionIndex: number) { if (!srId || !citationId || !criteriaData) return - const question = criteriaData.questions[questionIndex] - const options = criteriaData.possible_answers[questionIndex] || [] try { const headers = { 'Content-Type': 'application/json', ...getAuthHeaders(), } - const bodyPayload = { - question, - options, - include_columns: ['title', 'abstract'], - screening_step: 'l1', - } - const res = await fetch( - 
`/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( - citationId, - )}`, - { - method: 'POST', - headers, - body: JSON.stringify(bodyPayload), - }, - ) - const data = await res.json().catch(() => ({})) - // Expect the backend to return the classification_json or similar structure - // Try flexible extraction: - const classification = - data?.classification_json || - data?.result || - data?.classification || - data?.llm_classification || - data - if (classification && typeof classification === 'object') { - // Always show AI panel. - // IMPORTANT: do NOT overwrite an existing human selection in the UI. - if ((classification as any).selected !== undefined) { - setSelections((prev) => { - const already = prev?.[questionIndex] - if (already !== undefined && String(already).trim() !== '') return prev - return { ...prev, [questionIndex]: (classification as any).selected } - }) - } - setAiPanels((prev) => ({ ...prev, [questionIndex]: classification })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) - } else { - // If server returned a simple string, set it as selection - if (typeof data === 'string') { - setSelections((prev) => ({ ...prev, [questionIndex]: data })) + + // Phase 1->2 wiring: reuse the existing per-question “AI” button, but call the + // agentic orchestrator endpoint which runs BOTH screening + critical and persists + // them to screening_agent_runs. + const res = await fetch('/api/can-sr/screen/title-abstract/run', { + method: 'POST', + headers, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model: selectedModel, + temperature: 0.0, + max_tokens: 1200, + prompt_version: 'v1', + }), + }) + await res.json().catch(() => ({})) + + // Refresh latest runs + citation row so the UI shows critical + validations immediately. 
+ await fetchCitationById(String(citationId)) + + try { + const r2 = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('title_abstract')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers: getAuthHeaders() }, + ) + const j2 = await r2.json().catch(() => ({})) + if (r2.ok && Array.isArray(j2?.runs)) { + setAgentRuns(j2.runs as LatestAgentRun[]) } - setAiPanels((prev) => ({ ...prev, [questionIndex]: data || null })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) + } catch { + // ignore } } catch (err) { console.error('Classify API error', err) diff --git a/frontend/app/[lang]/can-sr/l2-screen/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/page.tsx index 81c8e5a8..1697fb0e 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/page.tsx @@ -56,39 +56,30 @@ const buildCitationAiCalls: BuildCitationAiCalls = async ({ }, }) - for (let i = 0; i < (criteria?.questions || []).length; i++) { - const question = criteria.questions[i] - const options = criteria.possible_answers?.[i] || [] - calls.push({ - key: `l2_classify_${i}`, - label: `L2: ${question}`, - run: async () => { - const res = await fetch( - `/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent( - srId, - )}&citation_id=${encodeURIComponent(String(citationId))}`, - { - method: 'POST', - headers: { ...headers, 'Content-Type': 'application/json' }, - body: JSON.stringify({ - question, - options, - include_columns: ['title', 'abstract'], - screening_step: 'l2', - model, - temperature: 0.0, - max_tokens: 2000, - }), - }, - ) - - if (!res.ok) { - const text = await res.text().catch(() => '') - throw new Error(text || `L2 classify failed (${res.status})`) - } - }, - }) - } + // Phase 2 wiring: run a single orchestrated fulltext screening+critical per citation. 
+ // (The backend reads SR criteria, so we do not need to fan out per-question calls.) + calls.push({ + key: `l2_agentic_run`, + label: `L2 agentic (screening + critical)`, + run: async () => { + const res = await fetch('/api/can-sr/screen/fulltext/run', { + method: 'POST', + headers: { ...headers, 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model, + temperature: 0.0, + max_tokens: 2000, + prompt_version: 'v1', + }), + }) + if (!res.ok) { + const text = await res.text().catch(() => '') + throw new Error(text || `L2 agentic run failed (${res.status})`) + } + }, + }) return calls } diff --git a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx index 9af2199e..b7bfde89 100644 --- a/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx +++ b/frontend/app/[lang]/can-sr/l2-screen/view/page.tsx @@ -290,32 +290,35 @@ export default function CanSrL2ScreenViewPage() { // Load latest agent runs for this citation (screening + critical per criterion) useEffect(() => { if (!srId || !citationId) return - const loadRuns = async () => { - setLoadingRuns(true) - try { - const headers = getAuthHeaders() - const res = await fetch( - `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( - srId, - )}&pipeline=${encodeURIComponent('fulltext')}&citation_ids=${encodeURIComponent( - String(citationId), - )}`, - { method: 'GET', headers }, - ) - const data = await res.json().catch(() => ({})) - if (res.ok && Array.isArray(data?.runs)) { - setAgentRuns(data.runs as LatestAgentRun[]) - } else { - setAgentRuns([]) - } - } catch { + loadRuns() + }, [srId, citationId]) + + // Re-usable loader so we can refresh after triggering an agentic run. 
+ async function loadRuns() { + if (!srId || !citationId) return + setLoadingRuns(true) + try { + const headers = getAuthHeaders() + const res = await fetch( + `/api/can-sr/screen/agent-runs/latest?sr_id=${encodeURIComponent( + srId, + )}&pipeline=${encodeURIComponent('fulltext')}&citation_ids=${encodeURIComponent( + String(citationId), + )}`, + { method: 'GET', headers }, + ) + const data = await res.json().catch(() => ({})) + if (res.ok && Array.isArray(data?.runs)) { + setAgentRuns(data.runs as LatestAgentRun[]) + } else { setAgentRuns([]) - } finally { - setLoadingRuns(false) } + } catch { + setAgentRuns([]) + } finally { + setLoadingRuns(false) } - loadRuns() - }, [srId, citationId]) + } const runsByCriterion = useMemo(() => { const by: Record = {} @@ -567,65 +570,32 @@ export default function CanSrL2ScreenViewPage() { // Call backend classify for a single question using fulltext template (screening_step='l2') async function classifyQuestion(questionIndex: number) { if (!srId || !citationId || !criteriaData) return - const question = criteriaData.questions[questionIndex] - const options = criteriaData.possible_answers[questionIndex] || [] - const xtra = criteriaData.additional_infos?.[questionIndex] || '' try { const headers = { 'Content-Type': 'application/json', ...getAuthHeaders(), } - const bodyPayload: any = { - question, - options, - screening_step: 'l2', - xtra, - model: selectedModel, - temperature: 0.0, - max_tokens: 1200, - } - // Provide full text directly to backend to prevent include_columns=None error. - // If fulltext is not yet available, fall back to title/abstract to avoid backend crash. - bodyPayload.citation_text = fulltextStr - bodyPayload.include_columns = ['title', 'abstract'] + // Phase 2 wiring: reuse existing per-question “AI” button, but call the + // agentic orchestrator endpoint which runs BOTH screening + critical and persists + // them to screening_agent_runs. 
+ const res = await fetch('/api/can-sr/screen/fulltext/run', { + method: 'POST', + headers, + body: JSON.stringify({ + sr_id: srId, + citation_id: Number(citationId), + model: selectedModel, + temperature: 0.0, + max_tokens: 2000, + prompt_version: 'v1', + }), + }) + await res.json().catch(() => ({})) - const res = await fetch( - `/api/can-sr/screen?action=classify&sr_id=${encodeURIComponent(srId)}&citation_id=${encodeURIComponent( - String(citationId), - )}`, - { - method: 'POST', - headers, - body: JSON.stringify(bodyPayload), - }, - ) - const data = await res.json().catch(() => ({})) - const classification = - data?.classification_json || - data?.result || - data?.classification || - data?.llm_classification || - data - if (classification && typeof classification === 'object') { - // Always show AI panel. - // IMPORTANT: do NOT overwrite an existing human selection in the UI. - if ((classification as any).selected !== undefined) { - setSelections((prev) => { - const already = prev?.[questionIndex] - if (already !== undefined && String(already).trim() !== '') return prev - return { ...prev, [questionIndex]: (classification as any).selected } - }) - } - setAiPanels((prev) => ({ ...prev, [questionIndex]: classification })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) - } else { - if (typeof data === 'string') { - setSelections((prev) => ({ ...prev, [questionIndex]: data })) - } - setAiPanels((prev) => ({ ...prev, [questionIndex]: data || null })) - setPanelOpen((prev) => ({ ...prev, [questionIndex]: false })) - } + // Refresh latest runs + citation row so the UI shows critical results immediately. 
+ await fetchCitationById(String(citationId)) + await loadRuns() } catch (err) { console.error('Classify API error', err) } diff --git a/frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts b/frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts new file mode 100644 index 00000000..835045cd --- /dev/null +++ b/frontend/app/api/can-sr/reviews/critical-prompt-additions/route.ts @@ -0,0 +1,59 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: + * GET /api/can-sr/reviews/critical-prompt-additions?sr_id= + * -> GET {BACKEND_URL}/api/sr//critical_prompt_additions + * PUT /api/can-sr/reviews/critical-prompt-additions?sr_id= + * body: { critical_prompt_additions: {...} } + * -> PUT {BACKEND_URL}/api/sr//critical_prompt_additions + */ + +function authHeaders(request: NextRequest): Record { + const auth = request.headers.get('authorization') + return auth ? { Authorization: auth } : {} +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + const url = `${BACKEND_URL}/api/sr/${encodeURIComponent(srId)}/critical_prompt_additions` + const res = await fetch(url, { method: 'GET', headers: authHeaders(request) }) + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (e) { + console.error('critical-prompt-additions GET error:', e) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} + +export async function PUT(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + + const body = await request.json().catch(() => ({})) + const url = 
`${BACKEND_URL}/api/sr/${encodeURIComponent(srId)}/critical_prompt_additions` + const res = await fetch(url, { + method: 'PUT', + headers: { + ...authHeaders(request), + 'Content-Type': 'application/json', + }, + body: JSON.stringify(body), + }) + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (e) { + console.error('critical-prompt-additions PUT error:', e) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/calibration/route.ts b/frontend/app/api/can-sr/screen/calibration/route.ts new file mode 100644 index 00000000..b29e3633 --- /dev/null +++ b/frontend/app/api/can-sr/screen/calibration/route.ts @@ -0,0 +1,66 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: GET /api/can-sr/screen/calibration?sr_id=&step=l1|l2&thresholds=...&bins=... + * -> GET {BACKEND_URL}/api/screen/calibration?sr_id=...&step=...&thresholds=...&bins=... 
+ */ + +export async function OPTIONS() { + return new Response(null, { + status: 204, + headers: { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET,OPTIONS', + 'Access-Control-Allow-Headers': 'Authorization, Content-Type', + }, + }) +} + +export async function GET(request: NextRequest) { + try { + const params = request.nextUrl.searchParams + const srId = params.get('sr_id') + const step = params.get('step') || 'l1' + const thresholds = params.get('thresholds') + const bins = params.get('bins') + + if (!srId) { + return NextResponse.json({ error: 'sr_id is required' }, { status: 400 }) + } + + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const url = new URL(`${BACKEND_URL}/api/screen/calibration`) + url.searchParams.set('sr_id', srId) + url.searchParams.set('step', step) + if (thresholds) url.searchParams.set('thresholds', thresholds) + if (bins) url.searchParams.set('bins', bins) + + const res = await fetch(url.toString(), { + method: 'GET', + headers: { + Authorization: authHeader, + }, + }) + + const text = await res.text().catch(() => '') + let json: any = null + try { + json = text ? 
JSON.parse(text) : {} + } catch { + json = { detail: text || null } + } + + return NextResponse.json(json, { status: res.status }) + } catch (err: any) { + console.error('screen calibration proxy GET error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/fulltext/run/route.ts b/frontend/app/api/can-sr/screen/fulltext/run/route.ts new file mode 100644 index 00000000..1743fd56 --- /dev/null +++ b/frontend/app/api/can-sr/screen/fulltext/run/route.ts @@ -0,0 +1,36 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: POST /api/can-sr/screen/fulltext/run + * -> POST {BACKEND_URL}/api/screen/fulltext/run + */ +export async function POST(request: NextRequest) { + try { + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const body = await request.json().catch(() => ({})) + + const url = `${BACKEND_URL}/api/screen/fulltext/run` + const res = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: authHeader, + }, + body: JSON.stringify(body), + }) + + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (err) { + console.error('fulltext/run proxy POST error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/app/api/can-sr/screen/title-abstract/run/route.ts b/frontend/app/api/can-sr/screen/title-abstract/run/route.ts new file mode 100644 index 00000000..a628192f --- /dev/null +++ b/frontend/app/api/can-sr/screen/title-abstract/run/route.ts @@ -0,0 +1,36 @@ +import { NextRequest, NextResponse } from 'next/server' +import { BACKEND_URL } from '@/lib/config' + +/** + * Proxy: POST 
/api/can-sr/screen/title-abstract/run + * -> POST {BACKEND_URL}/api/screen/title-abstract/run + */ +export async function POST(request: NextRequest) { + try { + const authHeader = request.headers.get('authorization') + if (!authHeader) { + return NextResponse.json( + { error: 'Authorization header is required' }, + { status: 401 }, + ) + } + + const body = await request.json().catch(() => ({})) + + const url = `${BACKEND_URL}/api/screen/title-abstract/run` + const res = await fetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: authHeader, + }, + body: JSON.stringify(body), + }) + + const data = await res.json().catch(() => ({})) + return NextResponse.json(data, { status: res.status }) + } catch (err) { + console.error('title-abstract/run proxy POST error:', err) + return NextResponse.json({ error: 'Internal server error' }, { status: 500 }) + } +} diff --git a/frontend/components/can-sr/CitationListPage.tsx b/frontend/components/can-sr/CitationListPage.tsx index 3f99aa12..7b1cafb0 100644 --- a/frontend/components/can-sr/CitationListPage.tsx +++ b/frontend/components/can-sr/CitationListPage.tsx @@ -13,7 +13,9 @@ import ScreeningMetricsPanel, { type ScreeningMetricsStats, type ScreeningMetricsSummary, type ScreeningCriterionMetrics, + type CalibrationCriterion, } from '@/components/can-sr/ScreeningMetricsPanel' +import ScreeningMetricsModal from '@/components/can-sr/ScreeningMetricsModal' import { Dialog, DialogContent, @@ -79,17 +81,41 @@ export default function CitationsListPage({ const [error, setError] = useState(null) const [criteriaData, setCriteriaData] = useState() - // Phase 1 list control surface is now hosted by the left-side metrics module. + // Phase 1 single-threshold is deprecated; kept for backward compatibility. 
const [threshold, setThreshold] = useState(0.9) - const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'all'>('needs') - const [pageStats, setPageStats] = useState(undefined) + const [filterMode, setFilterMode] = useState<'needs' | 'validated' | 'unvalidated' | 'not_screened' | 'all'>('needs') + // page-local stats no longer shown (SR-wide progress bar is in metrics panel) + const [_pageStats, setPageStats] = useState(undefined) // Phase 2 metrics (SR-wide) const [srMetricsSummary, setSrMetricsSummary] = useState(undefined) const [srCriterionMetrics, setSrCriterionMetrics] = useState(undefined) - const [srThresholds, setSrThresholds] = useState | null>(null) + const [srCalibration, setSrCalibration] = useState(undefined) + const [_srThresholds, setSrThresholds] = useState | null>(null) + + // Backend warnings (e.g., legacy data needs run-all) + const [srWarnings, setSrWarnings] = useState(null) + + const legacyWarning = useMemo(() => { + const ws = Array.isArray(srWarnings) ? srWarnings : [] + return ( + ws.find((w) => String(w?.code || '').toUpperCase() === 'LEGACY_DATA_NEEDS_RUN_ALL') || + null + ) + }, [srWarnings]) + + // Silence eslint unused warnings for state that is intentionally retained for backwards-compatibility. + void _pageStats + void _srThresholds const [metricsRefreshKey, setMetricsRefreshKey] = useState(0) + const [metricsDrawerOpen, setMetricsDrawerOpen] = useState(false) + + // Draft editing: user can adjust thresholds locally, then click Save. 
+ const [draftThresholds, setDraftThresholds] = useState | null>(null) + const [thresholdsDirty, setThresholdsDirty] = useState(false) + const [savingThresholds, setSavingThresholds] = useState(false) + // Run-all job tracking (persist across modal close / refresh) const [runAllForce, setRunAllForce] = useState(false) const [runAllJobId, setRunAllJobId] = useState(null) @@ -206,6 +232,8 @@ export default function CitationsListPage({ const tJson = await tRes.json().catch(() => ({})) const thresholds = (tRes.ok ? tJson?.screening_thresholds : null) || {} setSrThresholds(typeof thresholds === 'object' && thresholds ? thresholds : {}) + setDraftThresholds(typeof thresholds === 'object' && thresholds ? thresholds : {}) + setThresholdsDirty(false) // 2) metrics const mRes = await fetch( @@ -219,14 +247,32 @@ export default function CitationsListPage({ const stepBlock = mJson?.steps?.[screeningStep] setSrMetricsSummary(stepBlock?.summary) setSrCriterionMetrics(stepBlock?.criteria) + setSrWarnings(Array.isArray(mJson?.warnings) ? 
mJson.warnings : null) } else { setSrMetricsSummary(undefined) setSrCriterionMetrics(undefined) + setSrWarnings(null) + } + + // 3) calibration (validated set) + const cRes = await fetch( + `/api/can-sr/screen/calibration?sr_id=${encodeURIComponent(srId)}&step=${encodeURIComponent( + screeningStep, + )}`, + { method: 'GET', headers }, + ) + const cJson = await cRes.json().catch(() => ({})) + if (cRes.ok && Array.isArray(cJson?.criteria)) { + setSrCalibration(cJson.criteria as CalibrationCriterion[]) + } else { + setSrCalibration(undefined) } } catch { setSrMetricsSummary(undefined) setSrCriterionMetrics(undefined) + setSrCalibration(undefined) setSrThresholds(null) + setSrWarnings(null) } } load() @@ -236,6 +282,7 @@ export default function CitationsListPage({ async (nextThresholds: Record) => { if (!srId) return try { + setSavingThresholds(true) const headers = { ...getAuthHeaders(), 'Content-Type': 'application/json' } const res = await fetch( `/api/can-sr/reviews/thresholds?sr_id=${encodeURIComponent(srId)}`, @@ -248,11 +295,15 @@ export default function CitationsListPage({ const j = await res.json().catch(() => ({})) if (res.ok) { setSrThresholds(j?.screening_thresholds || nextThresholds) + setDraftThresholds(j?.screening_thresholds || nextThresholds) + setThresholdsDirty(false) // Refresh metrics so counts reflect the new thresholds. setMetricsRefreshKey((k) => k + 1) } } catch { // ignore + } finally { + setSavingThresholds(false) } }, [srId], @@ -417,7 +468,32 @@ export default function CitationsListPage({ Layout: left floating/side metrics module + right list. (A true fixed overlay can be added later; this keeps it responsive and simple.) */} -
    +
    + {legacyWarning ? ( +
    +
    Legacy screening data detected
    +
    + {String(legacyWarning?.message || + 'This SR has legacy llm_* outputs but no agentic runs. Please run Run-all to regenerate results.')} +
    +
    + Tip: when legacy data is detected, Run-all will automatically force overwrite to generate real agent runs. +
    +
    + ) : null} + + + setRunAllModalOpen(false)}> @@ -457,32 +533,66 @@ export default function CitationsListPage({
    -