From af0992708716cfbef9e3749becf53caf778c45b7 Mon Sep 17 00:00:00 2001
From: Tuminha <83960151+Tuminha@users.noreply.github.com>
Date: Tue, 2 Jun 2026 10:15:40 +0200
Subject: [PATCH 1/3] Add full reproduction submission gates

---
 .github/workflows/submission-readiness.yml |  31 ++
 MODEL_CARD.md                              |   4 +-
 Makefile                                   |  46 ++-
 README.md                                  |  14 +-
 results/v13_primary_norc_summary.json      |   5 +-
 results/v13_secondary_full_summary.json    |   5 +-
 scripts/02_process_nhanes_data.py          |  28 +-
 scripts/04_publication_analyses.py         |   2 +-
 scripts/check_publication_consistency.py   | 115 +++---
 scripts/download_nhanes.py                 |   2 +-
 scripts/reproduce_v13_primary.py           | 294 +++++++++++++++
 scripts/run_external_validation.sh         |  21 +-
 scripts/run_temporal_validation.py         | 193 ++++++++++
 scripts/run_v13_primary.sh                 |  29 +-
 scripts/verify_submission.py               | 113 ++++++
 src/reproduction.py                        | 414 +++++++++++++++++++++
 tests/test_reproduction_contract.py        |  59 +++
 17 files changed, 1236 insertions(+), 139 deletions(-)
 create mode 100644 .github/workflows/submission-readiness.yml
 create mode 100644 scripts/reproduce_v13_primary.py
 create mode 100644 scripts/run_temporal_validation.py
 create mode 100644 scripts/verify_submission.py
 create mode 100644 src/reproduction.py
 create mode 100644 tests/test_reproduction_contract.py

diff --git a/.github/workflows/submission-readiness.yml b/.github/workflows/submission-readiness.yml
new file mode 100644
index 0000000..64df284
--- /dev/null
+++ b/.github/workflows/submission-readiness.yml
@@ -0,0 +1,31 @@
+name: submission-readiness
+
+on:
+  push:
+    branches: ["main", "checks/**"]
+  pull_request:
+    branches: ["main"]
+
+jobs:
+  lightweight-checks:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.14"
+
+      - name: Install pinned dependencies
+        run: make setup-lock
+
+      - name: Run unit tests
+        run: make test
+
+      - name: Check publication consistency
+        run: make consistency
+
+      - name: Run submission readiness checks
+        run: make verify-submission
diff --git a/MODEL_CARD.md b/MODEL_CARD.md
index 30517ef..e0fb0e9 100644
--- a/MODEL_CARD.md
+++ b/MODEL_CARD.md
@@ -70,8 +70,8 @@ make setup-lock
 source venv/bin/activate
 make test
 make consistency
-make reproduce
-make temporal
+make verify-submission
+make reproduce-full
 ```
 
 The consistency check enforces agreement between result artifacts, README, this model card, and the manuscript source.
diff --git a/Makefile b/Makefile
index e02d913..618f733 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,6 @@
-.PHONY: help setup setup-lock download process train reproduce temporal test consistency notebook clean figures lock dirs manuscript
+PYTHON ?= ./venv/bin/python
+
+.PHONY: help setup setup-lock download process train reproduce temporal test consistency verify-submission reproduce-full notebook clean figures lock dirs manuscript
 
 help:
 	@echo "NHANES Periodontitis ML Project - Make Commands"
@@ -21,6 +23,8 @@ help:
 	@echo ""
 	@echo "Publication:"
 	@echo "  make consistency  - Check result and manuscript consistency"
+	@echo "  make verify-submission - Run lightweight submission-readiness checks"
+	@echo "  make reproduce-full - Run full local reproduction workflow"
 	@echo "  make manuscript   - Render PDF manuscript if pandoc is installed"
 	@echo "  make figures      - Generate publication figures from saved results"
 	@echo ""
@@ -44,36 +48,60 @@ setup-lock:
 
 test:
 	@echo "Running pytest unit tests..."
-	./venv/bin/python -m pytest tests/ -v --tb=short
+	$(PYTHON) -m pytest tests/ -v --tb=short
 	@echo "Tests complete"
 
 consistency:
 	@echo "Checking publication consistency..."
-	python3 scripts/check_publication_consistency.py
+	$(PYTHON) scripts/check_publication_consistency.py
 	@echo "Publication consistency checks passed"
 
+verify-submission:
+	@echo "Running submission-readiness checks..."
+	$(MAKE) test
+	$(MAKE) consistency
+	$(PYTHON) scripts/verify_submission.py
+	$(PYTHON) scripts/05_number_manuscript_lines.py
+	@echo "Submission-readiness checks complete"
+
 download:
 	@echo "Downloading NHANES data..."
-	python3 scripts/01_download_nhanes_data.py
+	$(PYTHON) scripts/01_download_nhanes_data.py
 	@echo "Download complete"
 
 process:
 	@echo "Processing and merging NHANES components..."
-	python3 scripts/02_process_nhanes_data.py
+	$(PYTHON) scripts/02_process_nhanes_data.py
 	@echo "Processing complete"
 
 train:
 	@echo "Training models..."
-	python3 scripts/03_train_models.py
+	$(PYTHON) scripts/03_train_models.py
 	@echo "Training complete"
 
 reproduce:
 	@echo "Running primary-model reproduction workflow..."
-	bash scripts/run_v13_primary.sh
+	$(PYTHON) scripts/reproduce_v13_primary.py
 
 temporal:
 	@echo "Running same-source temporal validation workflow..."
-	bash scripts/run_external_validation.sh
+	$(PYTHON) scripts/run_temporal_validation.py
+
+reproduce-full:
+	@mkdir -p logs
+	@LOG="logs/full_reproduction_$$(date -u +%Y%m%dT%H%M%SZ).log"; \
+	echo "Writing full reproduction log to $$LOG"; \
+	{ \
+		$(MAKE) download; \
+		$(MAKE) process; \
+		$(MAKE) reproduce; \
+		$(MAKE) temporal; \
+		$(PYTHON) scripts/04_publication_analyses.py \
+			--input data/processed/publication_predictions.parquet \
+			--feature-cols age bmi waist_cm waist_height height_cm systolic_bp diastolic_bp glucose triglycerides hdl; \
+		$(MAKE) consistency; \
+		$(MAKE) verify-submission; \
+	} 2>&1 | tee "$$LOG"
 
 notebook:
 	@echo "Launching Jupyter notebook..."
@@ -85,7 +113,7 @@ figures:
 
 manuscript:
 	@echo "Rendering manuscript if pandoc is installed..."
-	python3 scripts/05_number_manuscript_lines.py
+	$(PYTHON) scripts/05_number_manuscript_lines.py
 	@if command -v pandoc >/dev/null 2>&1; then \
 		mkdir -p reports; \
 		pandoc docs/publication/ARTICLE_DRAFT.md \
diff --git a/README.md b/README.md
index ccc6e7b..3a8fde9 100644
--- a/README.md
+++ b/README.md
@@ -51,18 +51,13 @@ Run lightweight checks that do not require NHANES data:
 ```bash
 make test
 make consistency
+make verify-submission
 ```
 
-Run the full workflows after NHANES data are available:
+Run the full local reproduction after NHANES data are available:
 
 ```bash
-make download
-make process
-make reproduce
-make temporal
-python3 scripts/04_publication_analyses.py \
-  --input data/processed/publication_predictions.parquet \
-  --feature-cols age bmi waist_cm systolic_bp diastolic_bp glucose triglycerides hdl
+make reproduce-full
 ```
 
 The legacy notebooks are retired as source-of-truth artifacts. The maintained publication surface is the script targets, result artifacts, model card, tests, and manuscript source, with consistency checks to prevent silent drift across those files.
@@ -75,6 +70,9 @@ The legacy notebooks are retired as source-of-truth artifacts. The maintained pu
 | `src/evaluation.py` | Metrics, threshold selection, calibration, and plotting helpers |
 | `src/publication_analysis.py` | Survey-weighted prevalence, subgroup performance, and missingness tables |
 | `scripts/check_publication_consistency.py` | Guards canonical values and conservative publication wording |
+| `scripts/verify_submission.py` | Runs lightweight submission-readiness gates |
+| `scripts/reproduce_v13_primary.py` | Regenerates internal v1.3 benchmark result artifacts |
+| `scripts/run_temporal_validation.py` | Regenerates same-source temporal validation artifacts |
 | `scripts/04_publication_analyses.py` | Generates publication sensitivity tables from processed predictions |
 | `results/` | Saved result artifacts used by the manuscript and model card |
 | `docs/publication/ARTICLE_DRAFT.md` | Current manuscript source |
diff --git a/results/v13_primary_norc_summary.json b/results/v13_primary_norc_summary.json
index bde8e28..2193400 100644
--- a/results/v13_primary_norc_summary.json
+++ b/results/v13_primary_norc_summary.json
@@ -26,7 +26,7 @@
       "precision": 0.692,
       "npv": 0.96,
       "f1": 0.818,
-      "use_case": "Initial screening. Negative result rules out disease."
+      "use_case": "High-sensitivity triage; periodontal examination remains required."
     },
     "balanced": {
       "name": "Balanced (Youden J)",
@@ -37,7 +37,7 @@
       "npv": 0.51,
       "f1": 0.758,
       "youden_j": 0.32,
-      "use_case": "Clinical decision-making. Balanced tradeoff."
+      "use_case": "Balanced triage; periodontal examination remains required."
     }
   },
   "confusion_matrix": {
@@ -63,4 +63,3 @@
   },
   "timestamp": "2025-12-02T00:00:00Z"
 }
-
diff --git a/results/v13_secondary_full_summary.json b/results/v13_secondary_full_summary.json
index 41a4eeb..eea185f 100644
--- a/results/v13_secondary_full_summary.json
+++ b/results/v13_secondary_full_summary.json
@@ -26,7 +26,7 @@
       "precision": 0.706,
       "npv": 0.83,
       "f1": 0.824,
-      "use_case": "Initial screening. Negative result rules out disease."
+      "use_case": "High-sensitivity triage; periodontal examination remains required."
     },
     "balanced": {
       "name": "Balanced (Youden J)",
@@ -37,7 +37,7 @@
       "npv": 0.52,
       "f1": 0.772,
       "youden_j": 0.331,
-      "use_case": "Clinical decision-making. Balanced tradeoff."
+      "use_case": "Balanced triage; periodontal examination remains required."
     }
   },
   "confusion_matrix": {
@@ -63,4 +63,3 @@
   },
   "timestamp": "2025-12-02T00:00:00Z"
 }
-
diff --git a/scripts/02_process_nhanes_data.py b/scripts/02_process_nhanes_data.py
index 6d5af6d..074d844 100644
--- a/scripts/02_process_nhanes_data.py
+++ b/scripts/02_process_nhanes_data.py
@@ -22,6 +22,7 @@
     sys.path.insert(0, str(ROOT))
 
 from src.labels import label_periodontitis
+from src.reproduction import build_modeling_frame
 
 # Directories
 DATA_DIR = Path("data")
@@ -49,6 +50,7 @@
     'SEQN': 'participant_id',
     'BMXBMI': 'bmi',
     'BMXWAIST': 'waist_circumference',
+    'BMXHT': 'height_cm',
 }
 
 # Blood Pressure (BPX)
@@ -82,6 +84,7 @@
     'OHQ030': 'time_since_dental_visit',  # 1=6mo, 2=1yr, 3=2yr, etc.
     'OHQ620': 'floss_days_per_week',
     'OHQ845': 'loose_teeth',  # 1=Yes, 2=No
+    'OHQ680': 'mobile_teeth',
 }
 
 # =============================================================================
@@ -318,8 +321,9 @@ def process_cycle(cycle: str) -> pd.DataFrame:
     
     # Merge other components
     if 'body_measures' in dfs:
-        bm = dfs['body_measures'][['SEQN', 'BMXBMI', 'BMXWAIST']].copy()
-        bm.columns = ['participant_id', 'bmi', 'waist_circumference']
+        bm_cols = [c for c in BMX_VARS if c in dfs['body_measures'].columns]
+        bm = dfs['body_measures'][bm_cols].copy()
+        bm.columns = [BMX_VARS[c] for c in bm_cols]
         df = df.merge(bm, on='participant_id', how='left')
     
     if 'blood_pressure' in dfs:
@@ -331,8 +335,9 @@ def process_cycle(cycle: str) -> pd.DataFrame:
         df = df.merge(bp, on='participant_id', how='left')
     
     if 'smoking' in dfs:
-        sm = dfs['smoking'][['SEQN', 'SMQ020']].copy()
-        sm.columns = ['participant_id', 'smoked_100_cigs']
+        sm_cols = [c for c in SMQ_VARS if c in dfs['smoking'].columns]
+        sm = dfs['smoking'][sm_cols].copy()
+        sm.columns = [SMQ_VARS[c] for c in sm_cols]
         df = df.merge(sm, on='participant_id', how='left')
 
     if 'alcohol' in dfs:
@@ -342,14 +347,9 @@ def process_cycle(cycle: str) -> pd.DataFrame:
         df = df.merge(alq, on='participant_id', how='left')
     
     if 'oral_health_questionnaire' in dfs:
-        ohq_cols = ['SEQN']
-        for c in ['OHQ030', 'OHQ620', 'OHQ845']:
-            if c in dfs['oral_health_questionnaire'].columns:
-                ohq_cols.append(c)
+        ohq_cols = [c for c in OHQ_VARS if c in dfs['oral_health_questionnaire'].columns]
         ohq = dfs['oral_health_questionnaire'][ohq_cols].copy()
-        col_map = {'SEQN': 'participant_id', 'OHQ030': 'time_since_dental_visit',
-                   'OHQ620': 'floss_days_per_week', 'OHQ845': 'loose_teeth'}
-        ohq.columns = [col_map.get(c, c) for c in ohq.columns]
+        ohq.columns = [OHQ_VARS[c] for c in ohq_cols]
         df = df.merge(ohq, on='participant_id', how='left')
 
     if 'glucose' in dfs and 'LBXGLU' in dfs['glucose'].columns:
@@ -376,6 +376,7 @@ def process_cycle(cycle: str) -> pd.DataFrame:
         labeled = label_periodontitis(dfs['periodontal'].copy())
         labels = labeled[['SEQN', 'perio_class', 'has_periodontitis']].copy()
         labels.columns = ['participant_id', 'perio_class', 'has_periodontitis']
+        labels['periodontitis_binary'] = labels['has_periodontitis'].astype(int)
         df = df.merge(labels, on='participant_id', how='inner')
     
     # Add cycle identifier
@@ -414,6 +415,11 @@ def main():
         output_path = PROCESSED_DIR / "nhanes_combined.parquet"
         df_combined.to_parquet(output_path)
         print(f"\nSaved to {output_path}")
+
+        modeling_path = PROCESSED_DIR / "nhanes_modeling_frame.parquet"
+        modeling_frame = build_modeling_frame(df_combined)
+        modeling_frame.to_parquet(modeling_path)
+        print(f"Saved modeling frame to {modeling_path}")
         
         # Print summary
         print(f"\n{'='*60}")
diff --git a/scripts/04_publication_analyses.py b/scripts/04_publication_analyses.py
index e6a4558..28912a5 100644
--- a/scripts/04_publication_analyses.py
+++ b/scripts/04_publication_analyses.py
@@ -96,7 +96,7 @@ def main() -> None:
     input_path = Path(args.input)
     if not input_path.exists():
         raise FileNotFoundError(
-            f"Input table not found: {input_path}. Generate it from the processing and validation notebooks first."
+            f"Input table not found: {input_path}. Run `make process && make temporal` first."
         )
 
     df = load_table(input_path)
diff --git a/scripts/check_publication_consistency.py b/scripts/check_publication_consistency.py
index fb99706..5f3ef28 100644
--- a/scripts/check_publication_consistency.py
+++ b/scripts/check_publication_consistency.py
@@ -1,30 +1,14 @@
 #!/usr/bin/env python3
-"""Check that publication-facing files use the same canonical results."""
+"""Check that publication-facing files use the same current result artifacts."""
 
 from __future__ import annotations
 
 import json
-import math
 from pathlib import Path
 
 
 ROOT = Path(__file__).resolve().parents[1]
 
-EXPECTED = {
-    "primary_features": 29,
-    "secondary_features": 33,
-    "primary_auc": 0.717245742046474,
-    "primary_pr_auc": 0.8157447372867956,
-    "secondary_auc": 0.7255326805774952,
-    "temporal_auc": 0.6771141964954918,
-    "temporal_pr_auc": 0.7734533687334428,
-    "temporal_brier": 0.20025236260487186,
-    "temporal_rule_out_sensitivity": 0.970968669157804,
-    "temporal_rule_out_specificity": 0.18080094228504123,
-    "temporal_balanced_sensitivity": 0.8263868927852831,
-    "temporal_balanced_specificity": 0.4334511189634865,
-}
-
 DOCS = [
     ROOT / "README.md",
     ROOT / "MODEL_CARD.md",
@@ -50,57 +34,73 @@ def load_json(path: str) -> dict:
         return json.load(f)
 
 
-def close(actual: float, expected: float, tol: float = 1e-6) -> bool:
-    return math.isclose(float(actual), float(expected), rel_tol=tol, abs_tol=tol)
+def fmt4(value: float) -> str:
+    return f"{float(value):.4f}"
 
 
-def assert_close(label: str, actual: float, expected: float) -> None:
-    if not close(actual, expected):
-        raise AssertionError(f"{label}: expected {expected}, got {actual}")
+def pct1(value: float) -> str:
+    return f"{float(value) * 100:.1f}%"
 
 
-def check_result_files() -> None:
+def canonical_values() -> dict[str, float | int | str]:
     featuredrop = load_json("results/v13_featuredrop.json")
     temporal = load_json("results/external_0910_metrics.json")
-
     primary = featuredrop["v1.3_no_reverse_causality"]
     secondary = featuredrop["v1.3_full"]
-
-    assert primary["n_features"] == EXPECTED["primary_features"]
-    assert secondary["n_features"] == EXPECTED["secondary_features"]
-    assert_close("primary AUC", primary["auc"], EXPECTED["primary_auc"])
-    assert_close("primary PR-AUC", primary["pr_auc"], EXPECTED["primary_pr_auc"])
-    assert_close("secondary AUC", secondary["auc"], EXPECTED["secondary_auc"])
-
-    assert_close("temporal AUC", temporal["metrics"]["auc"]["mean"], EXPECTED["temporal_auc"])
-    assert_close("temporal PR-AUC", temporal["metrics"]["prauc"]["mean"], EXPECTED["temporal_pr_auc"])
-    assert_close("temporal Brier", temporal["metrics"]["brier"]["mean"], EXPECTED["temporal_brier"])
-
     rule_out = temporal["operating_points"]["rule_out_t_0.35"]
     balanced = temporal["operating_points"]["balanced_t_0.65"]
-    assert_close(
-        "temporal rule-out sensitivity",
-        rule_out["sensitivity"],
-        EXPECTED["temporal_rule_out_sensitivity"],
-    )
-    assert_close(
-        "temporal rule-out specificity",
-        rule_out["specificity"],
-        EXPECTED["temporal_rule_out_specificity"],
-    )
-    assert_close(
-        "temporal balanced sensitivity",
-        balanced["sensitivity"],
-        EXPECTED["temporal_balanced_sensitivity"],
-    )
-    assert_close(
-        "temporal balanced specificity",
-        balanced["specificity"],
-        EXPECTED["temporal_balanced_specificity"],
-    )
+    return {
+        "primary_features": int(primary["n_features"]),
+        "secondary_features": int(secondary["n_features"]),
+        "primary_auc": float(primary["auc"]),
+        "primary_pr_auc": float(primary["pr_auc"]),
+        "secondary_auc": float(secondary["auc"]),
+        "secondary_pr_auc": float(secondary["pr_auc"]),
+        "temporal_auc": float(temporal["metrics"]["auc"]["mean"]),
+        "temporal_pr_auc": float(temporal["metrics"]["prauc"]["mean"]),
+        "temporal_brier": float(temporal["metrics"]["brier"]["mean"]),
+        "temporal_rule_out_sensitivity": float(rule_out["sensitivity"]),
+        "temporal_rule_out_specificity": float(rule_out["specificity"]),
+        "temporal_balanced_sensitivity": float(balanced["sensitivity"]),
+        "temporal_balanced_specificity": float(balanced["specificity"]),
+    }
+
+
+def check_result_files() -> None:
+    values = canonical_values()
+    if values["primary_features"] != 29:
+        raise AssertionError(f"Primary feature count must be 29, got {values['primary_features']}")
+    if values["secondary_features"] != 33:
+        raise AssertionError(f"Secondary feature count must be 33, got {values['secondary_features']}")
+    for key in [
+        "primary_auc",
+        "primary_pr_auc",
+        "secondary_auc",
+        "secondary_pr_auc",
+        "temporal_auc",
+        "temporal_pr_auc",
+        "temporal_brier",
+    ]:
+        value = float(values[key])
+        if not 0 <= value <= 1:
+            raise AssertionError(f"{key} must be in [0, 1], got {value}")
+
+
+def required_strings(values: dict) -> list[str]:
+    return [
+        str(values["primary_features"]),
+        str(values["secondary_features"]),
+        fmt4(values["primary_auc"]),
+        fmt4(values["primary_pr_auc"]),
+        fmt4(values["secondary_auc"]),
+        fmt4(values["secondary_pr_auc"]),
+        fmt4(values["temporal_auc"]),
+        fmt4(values["temporal_pr_auc"]),
+    ]
 
 
 def check_docs() -> None:
+    values = canonical_values()
     for path in DOCS:
         text = path.read_text(encoding="utf-8")
         for phrase in BANNED_PHRASES:
@@ -108,9 +108,10 @@ def check_docs() -> None:
                 rel = path.relative_to(ROOT)
                 raise AssertionError(f"{rel} contains banned phrase: {phrase}")
 
-        if "29" not in text or "33" not in text:
+        missing = [value for value in required_strings(values) if value not in text]
+        if missing:
             rel = path.relative_to(ROOT)
-            raise AssertionError(f"{rel} must mention canonical 29/33 feature counts")
+            raise AssertionError(f"{rel} is missing current result values: {missing}")
 
         if "same-source temporal validation" not in text.lower():
             rel = path.relative_to(ROOT)
diff --git a/scripts/download_nhanes.py b/scripts/download_nhanes.py
index f88a49c..963e461 100644
--- a/scripts/download_nhanes.py
+++ b/scripts/download_nhanes.py
@@ -50,7 +50,7 @@ def download_nhanes_data(cycles=None, components=None):
     Args:
         cycles: List of cycles to download, e.g., ["2011-2012", "2013-2014"]
                 If None, downloads all cycles.
-        components: List of components to download, e.g., ["demographics", "oral_health_exam"]
+        components: List of components to download, e.g., ["demographics", "periodontal"]
                    If None, downloads all components.
     """
     if cycles is None:
diff --git a/scripts/reproduce_v13_primary.py b/scripts/reproduce_v13_primary.py
new file mode 100644
index 0000000..2e9bb44
--- /dev/null
+++ b/scripts/reproduce_v13_primary.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""Reproduce the v1.3 internal benchmark from processed NHANES data."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import pandas as pd
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from src.reproduction import (
+    BALANCED_THRESHOLD,
+    CORE_FEATURES,
+    PRIMARY_FEATURES,
+    RULE_OUT_THRESHOLD,
+    SECONDARY_BALANCED_THRESHOLD,
+    SECONDARY_FEATURES,
+    SECONDARY_RULE_OUT_THRESHOLD,
+    TREATMENT_SEEKING_FEATURES,
+    assert_feature_contract,
+    build_modeling_frame,
+    cross_validated_predictions,
+    feature_sets,
+    fit_calibrated_ensemble,
+    summarize_predictions,
+    summary_artifact,
+    timestamp,
+    write_json,
+)
+
+
+def load_modeling_frame(input_path: Path) -> pd.DataFrame:
+    if not input_path.exists():
+        raise FileNotFoundError(f"Processed data not found: {input_path}. Run `make download && make process` first.")
+
+    raw = pd.read_parquet(input_path)
+    frame = build_modeling_frame(raw)
+    frame = frame.dropna(subset=["has_periodontitis"])
+    assert_feature_contract(frame)
+    return frame.reset_index(drop=True)
+
+
+def development_subset(frame: pd.DataFrame, max_rows: int | None = None) -> pd.DataFrame:
+    frame = frame[frame["cycle"].isin(["2011-2012", "2013-2014"])].copy()
+    if max_rows:
+        per_class = max(1, max_rows // max(frame["has_periodontitis"].nunique(), 1))
+        sampled = [
+            group.sample(min(len(group), per_class), random_state=42)
+            for _, group in frame.groupby("has_periodontitis")
+        ]
+        frame = pd.concat(sampled, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
+    return frame.reset_index(drop=True)
+
+
+def write_internal_artifacts(frame: pd.DataFrame, out_dir: Path, folds: int, seed: int) -> dict:
+    y = frame["has_periodontitis"].astype(int)
+    prevalence = float(y.mean())
+
+    predictions = {}
+    summaries = {}
+    for name, features, ro_threshold, balanced_threshold in [
+        ("deployment_ready", CORE_FEATURES, RULE_OUT_THRESHOLD, BALANCED_THRESHOLD),
+        ("primary", PRIMARY_FEATURES, RULE_OUT_THRESHOLD, BALANCED_THRESHOLD),
+        ("secondary", SECONDARY_FEATURES, SECONDARY_RULE_OUT_THRESHOLD, SECONDARY_BALANCED_THRESHOLD),
+    ]:
+        print(f"Running {folds}-fold CV for {name} ({len(features)} features)...")
+        prob = cross_validated_predictions(frame, features, n_folds=folds, seed=seed)
+        predictions[name] = prob
+        summaries[name] = summarize_predictions(
+            y,
+            prob,
+            n_features=len(features),
+            name=name,
+            rule_out_threshold=ro_threshold,
+            balanced_threshold=balanced_threshold,
+        )
+
+    primary = summaries["primary"]
+    secondary = summaries["secondary"]
+    deployment = summaries["deployment_ready"]
+
+    featuredrop = {
+        "v1.3_full": {
+            key: secondary[key]
+            for key in [
+                "name",
+                "n_features",
+                "auc",
+                "pr_auc",
+                "rule_out_recall",
+                "rule_out_specificity",
+                "balanced_recall",
+                "balanced_specificity",
+            ]
+        },
+        "v1.3_no_reverse_causality": {
+            key: primary[key]
+            for key in [
+                "name",
+                "n_features",
+                "auc",
+                "pr_auc",
+                "rule_out_recall",
+                "rule_out_specificity",
+                "balanced_recall",
+                "balanced_specificity",
+            ]
+        },
+        "dropped_features": TREATMENT_SEEKING_FEATURES,
+        "deltas": {
+            "auc": primary["auc"] - secondary["auc"],
+            "pr_auc": primary["pr_auc"] - secondary["pr_auc"],
+            "rule_out_recall": primary["rule_out_recall"] - secondary["rule_out_recall"],
+            "balanced_specificity": primary["balanced_specificity"] - secondary["balanced_specificity"],
+        },
+        "timestamp": timestamp(),
+    }
+    write_json(out_dir / "v13_featuredrop.json", featuredrop)
+
+    write_json(
+        out_dir / "v13_primary_norc_summary.json",
+        summary_artifact(
+            primary,
+            model_name="v1.3_primary_no_reverse_causality",
+            description="Primary benchmark model excluding treatment-seeking variables",
+            dataset="NHANES 2011-2014",
+            prevalence=prevalence,
+            threshold_rule_out=RULE_OUT_THRESHOLD,
+            threshold_balanced=BALANCED_THRESHOLD,
+            extra={
+                "dropped_features": TREATMENT_SEEKING_FEATURES,
+                "rationale": "Removes treatment-seeking variables with limited discrimination cost.",
+            },
+        ),
+    )
+    write_json(
+        out_dir / "v13_secondary_full_summary.json",
+        summary_artifact(
+            secondary,
+            model_name="v1.3_secondary_full_features",
+            description="Secondary upper-bound model including treatment-seeking variables",
+            dataset="NHANES 2011-2014",
+            prevalence=prevalence,
+            threshold_rule_out=SECONDARY_RULE_OUT_THRESHOLD,
+            threshold_balanced=SECONDARY_BALANCED_THRESHOLD,
+            extra={
+                "additional_features": TREATMENT_SEEKING_FEATURES,
+                "rationale": "Upper-bound sensitivity analysis for treatment-seeking variables.",
+            },
+        ),
+    )
+
+    nan_ablation = [
+        slim_summary("v1.3_full", secondary),
+        slim_summary("v1.3_no_reverse_causality", primary),
+        slim_summary("deployment_ready_core", deployment),
+    ]
+    complete = frame.dropna(subset=CORE_FEATURES)
+    if len(complete) and complete["has_periodontitis"].nunique() == 2:
+        complete_prob = cross_validated_predictions(complete.reset_index(drop=True), CORE_FEATURES, n_folds=folds, seed=seed)
+        complete_summary = summarize_predictions(
+            complete["has_periodontitis"].astype(int),
+            complete_prob,
+            len(CORE_FEATURES),
+            "complete_case_core",
+        )
+        nan_ablation.append(slim_summary("complete_case_core", complete_summary, n_samples=len(complete)))
+    write_json(out_dir / "v13_nan_ablation.json", nan_ablation)
+
+    write_json(
+        out_dir / "v13_operating_points.json",
+        {
+            "version": "v1.3",
+            "dataset": "NHANES 2011-2014",
+            "n_samples": int(len(frame)),
+            "prevalence": prevalence,
+            "calibration": "nested_isotonic_regression",
+            "feature_sets": {key: len(value) for key, value in feature_sets().items()},
+            "models": {
+                "primary": model_operating_payload(primary, RULE_OUT_THRESHOLD, BALANCED_THRESHOLD),
+                "secondary": model_operating_payload(secondary, SECONDARY_RULE_OUT_THRESHOLD, SECONDARY_BALANCED_THRESHOLD),
+            },
+            "feature_drop_analysis": {
+                "dropped_features": TREATMENT_SEEKING_FEATURES,
+                "auc_delta": primary["auc"] - secondary["auc"],
+                "pr_auc_delta": primary["pr_auc"] - secondary["pr_auc"],
+                "interpretation": "Treatment-seeking features are reported as an upper-bound sensitivity analysis.",
+            },
+            "timestamp": timestamp(),
+        },
+    )
+
+    importance = feature_importance_summary(frame, SECONDARY_FEATURES, seed)
+    write_json(out_dir / "v13_shap_summary.json", importance)
+    return summaries
+
+
+def slim_summary(name: str, summary: dict, n_samples: int | None = None) -> dict:
+    return {
+        "name": name,
+        "n_samples": int(n_samples) if n_samples is not None else None,
+        "n_features": int(summary["n_features"]),
+        "auc": summary["auc"],
+        "pr_auc": summary["pr_auc"],
+        "brier_score": summary["brier_score"],
+        "rule_out_recall": summary["rule_out_recall"],
+        "rule_out_specificity": summary["rule_out_specificity"],
+        "balanced_recall": summary["balanced_recall"],
+        "balanced_specificity": summary["balanced_specificity"],
+    }
+
+
+def model_operating_payload(summary: dict, rule_threshold: float, balanced_threshold: float) -> dict:
+    return {
+        "name": summary["name"],
+        "n_features": summary["n_features"],
+        "metrics": {
+            "auc_roc": summary["auc"],
+            "pr_auc": summary["pr_auc"],
+            "brier_score": summary["brier_score"],
+        },
+        "operating_points": {
+            "rule_out": {
+                "threshold": rule_threshold,
+                "recall": summary["rule_out"]["sensitivity"],
+                "specificity": summary["rule_out"]["specificity"],
+                "precision": summary["rule_out"]["precision"],
+                "f1": summary["rule_out"]["f1_score"],
+            },
+            "balanced": {
+                "threshold": balanced_threshold,
+                "recall": summary["balanced"]["sensitivity"],
+                "specificity": summary["balanced"]["specificity"],
+                "precision": summary["balanced"]["precision"],
+                "f1": summary["balanced"]["f1_score"],
+            },
+        },
+    }
+
+
+def feature_importance_summary(frame: pd.DataFrame, features: list[str], seed: int) -> dict:
+    fitted = fit_calibrated_ensemble(frame, frame["has_periodontitis"].astype(int), features, seed=seed)
+    importances = []
+    for model in fitted.models:
+        if hasattr(model, "feature_importances_"):
+            importances.append(model.feature_importances_)
+        elif hasattr(model, "get_feature_importance"):
+            importances.append(model.get_feature_importance())
+    if importances:
+        mean_importance = pd.Series(sum(importances) / len(importances), index=features)
+        top = [
+            {"feature": feature, "mean_model_importance": float(value)}
+            for feature, value in mean_importance.sort_values(ascending=False).head(10).items()
+        ]
+    else:
+        top = []
+    return {
+        "model": "v1.3_secondary_full_features",
+        "importance_method": "mean_tree_feature_importance",
+        "n_samples": int(len(frame)),
+        "n_features": int(len(features)),
+        "top_10_features": top,
+        "treatment_seeking_features": TREATMENT_SEEKING_FEATURES,
+        "timestamp": timestamp(),
+    }
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", default="data/processed/nhanes_combined.parquet")
+    parser.add_argument("--out-dir", default="results")
+    parser.add_argument("--modeling-frame", default="data/processed/nhanes_modeling_frame.parquet")
+    parser.add_argument("--folds", type=int, default=5)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--max-rows", type=int, default=None, help="Optional smoke-test row cap.")
+    args = parser.parse_args()
+
+    full_frame = load_modeling_frame(Path(args.input))
+    Path(args.modeling_frame).parent.mkdir(parents=True, exist_ok=True)
+    full_frame.to_parquet(args.modeling_frame)
+    frame = development_subset(full_frame, max_rows=args.max_rows)
+    summaries = write_internal_artifacts(frame, Path(args.out_dir), args.folds, args.seed)
+    print("Internal reproduction complete:")
+    for name, summary in summaries.items():
+        print(f"  {name}: AUC={summary['auc']:.4f}, PR-AUC={summary['pr_auc']:.4f}, Brier={summary['brier_score']:.4f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_external_validation.sh b/scripts/run_external_validation.sh
index 1d19415..333d81c 100755
--- a/scripts/run_external_validation.sh
+++ b/scripts/run_external_validation.sh
@@ -3,7 +3,7 @@
 # Same-source temporal validation script for NHANES 2009-2010
 # ==============================================================================
 # 
-# This script runs the temporal validation notebook non-interactively,
+# This script runs the temporal validation Python workflow non-interactively,
 # generating all figures and result files.
 #
 # Usage:
@@ -30,24 +30,11 @@ PROJECT_ROOT=$(pwd)
 
 echo "Project root: $PROJECT_ROOT"
 
-# Check if notebook exists
-NOTEBOOK="$PROJECT_ROOT/notebooks/01_external_validation.ipynb"
-if [ ! -f "$NOTEBOOK" ]; then
-    echo "Error: Notebook not found at $NOTEBOOK"
-    exit 1
-fi
-
-echo "Running notebook: $NOTEBOOK"
-
-# Execute notebook
-jupyter nbconvert --to notebook --execute \
-    --ExecutePreprocessor.timeout=3600 \
-    --ExecutePreprocessor.kernel_name=python3 \
-    --output="01_external_validation_executed.ipynb" \
-    "$NOTEBOOK"
+echo "Running maintained temporal validation script"
+"${PYTHON:-python3}" scripts/run_temporal_validation.py
 
 echo ""
-echo "Notebook executed successfully."
+echo "Temporal validation script executed successfully."
 echo ""
 
 # Check outputs
diff --git a/scripts/run_temporal_validation.py b/scripts/run_temporal_validation.py
new file mode 100644
index 0000000..d2826e5
--- /dev/null
+++ b/scripts/run_temporal_validation.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+"""Run same-source temporal validation on NHANES 2009-2010."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import average_precision_score, brier_score_loss, roc_auc_score
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from src.evaluation import compute_metrics
+from src.reproduction import (
+    BALANCED_THRESHOLD,
+    CORE_FEATURES,
+    DEVELOPMENT_CYCLES,
+    PRIMARY_FEATURES,
+    RULE_OUT_THRESHOLD,
+    TEMPORAL_CYCLE,
+    assert_feature_contract,
+    bootstrap_ci,
+    build_modeling_frame,
+    feature_missingness_shift,
+    fit_calibrated_ensemble,
+    timestamp,
+    write_json,
+)
+
+
+def load_frame(input_path: Path) -> pd.DataFrame:
+    if input_path.exists() and input_path.name == "nhanes_modeling_frame.parquet":
+        frame = pd.read_parquet(input_path)
+    elif input_path.exists():
+        frame = build_modeling_frame(pd.read_parquet(input_path))
+    else:
+        fallback = Path("data/processed/nhanes_combined.parquet")
+        if not fallback.exists():
+            raise FileNotFoundError(f"No processed data found at {input_path} or {fallback}.")
+        frame = build_modeling_frame(pd.read_parquet(fallback))
+    assert_feature_contract(frame)
+    return frame.reset_index(drop=True)
+
+
+def prevalence_payload(frame: pd.DataFrame) -> dict:
+    rows = []
+    for cycle, group in frame.groupby("cycle"):
+        counts = group["perio_class"].value_counts(normalize=True)
+        rows.append(
+            {
+                "cycle": cycle,
+                "n": int(len(group)),
+                "prevalence": float(group["has_periodontitis"].mean() * 100),
+                "severe": float(counts.get("severe", 0.0) * 100),
+                "moderate": float(counts.get("moderate", 0.0) * 100),
+                "mild": float(counts.get("mild", 0.0) * 100),
+            }
+        )
+    return {
+        "our_estimates": rows,
+        "note": "Unweighted analytic-sample prevalence; weighted prevalence is generated by scripts/04_publication_analyses.py.",
+        "timestamp": timestamp(),
+    }
+
+
+def decision_curve_summary(y_true: pd.Series, y_prob: np.ndarray) -> dict:
+    def net_benefit(threshold: float) -> float:
+        pred = (y_prob >= threshold).astype(int)
+        y = y_true.to_numpy().astype(int)
+        tp = ((pred == 1) & (y == 1)).sum()
+        fp = ((pred == 1) & (y == 0)).sum()
+        weight = threshold / (1 - threshold)
+        return float((tp / len(y)) - (fp / len(y)) * weight)
+
+    return {
+        "model": "v1.3_primary",
+        "dataset": "NHANES 2009-2010",
+        "validation_design": "same-source temporal validation",
+        "description": "Decision-curve net benefit at manuscript operating points",
+        "data_summary": {
+            "model_at_0.35": net_benefit(RULE_OUT_THRESHOLD),
+            "model_at_0.65": net_benefit(BALANCED_THRESHOLD),
+        },
+        "timestamp": timestamp(),
+    }
+
+
+def write_temporal_artifacts(frame: pd.DataFrame, out_dir: Path, predictions_path: Path, bootstrap: int, seed: int) -> dict:
+    development = frame[frame["cycle"].isin(DEVELOPMENT_CYCLES)].copy()
+    temporal = frame[frame["cycle"] == TEMPORAL_CYCLE].copy()
+    if development.empty or temporal.empty:
+        raise ValueError("Both development cycles and NHANES 2009-2010 temporal cycle are required.")
+
+    model = fit_calibrated_ensemble(
+        development,
+        development["has_periodontitis"].astype(int),
+        PRIMARY_FEATURES,
+        seed=seed,
+    )
+    y_temporal = temporal["has_periodontitis"].astype(int)
+    p_temporal = model.predict_proba(temporal)
+
+    auc_mean, auc_ci = bootstrap_ci(y_temporal, p_temporal, roc_auc_score, n_bootstrap=bootstrap, seed=seed)
+    pr_mean, pr_ci = bootstrap_ci(y_temporal, p_temporal, average_precision_score, n_bootstrap=bootstrap, seed=seed + 1)
+    brier_mean, brier_ci = bootstrap_ci(y_temporal, p_temporal, brier_score_loss, n_bootstrap=bootstrap, seed=seed + 2)
+    rule_out = compute_metrics(y_temporal, p_temporal, threshold=RULE_OUT_THRESHOLD)
+    balanced = compute_metrics(y_temporal, p_temporal, threshold=BALANCED_THRESHOLD)
+
+    metrics = {
+        "dataset": "NHANES 2009-2010",
+        "validation_design": "same-source temporal validation",
+        "model": "v1.3_primary_ensemble_calibrated",
+        "n_train": int(len(development)),
+        "n_test": int(len(temporal)),
+        "prevalence_test": float(y_temporal.mean()),
+        "metrics": {
+            "auc": {"mean": auc_mean, "ci95": auc_ci},
+            "prauc": {"mean": pr_mean, "ci95": pr_ci},
+            "brier": {"mean": brier_mean, "ci95": brier_ci},
+        },
+        "operating_points": {
+            "rule_out_t_0.35": operating_point(rule_out),
+            "balanced_t_0.65": operating_point(balanced),
+        },
+        "timestamp": timestamp(),
+    }
+    write_json(out_dir / "external_0910_metrics.json", metrics)
+    write_json(
+        out_dir / "external_summary.json",
+        {
+            "dataset": "NHANES 2009-2010",
+            "validation_design": "same-source temporal validation",
+            "n": int(len(temporal)),
+            "prevalence": float(y_temporal.mean()),
+            "auc": auc_mean,
+            "prauc": pr_mean,
+            "brier": brier_mean,
+            "rule_out": {"threshold": RULE_OUT_THRESHOLD, **operating_point(rule_out)},
+            "balanced": {"threshold": BALANCED_THRESHOLD, **operating_point(balanced)},
+            "source": "scripts/run_temporal_validation.py",
+            "timestamp": timestamp(),
+        },
+    )
+    write_json(out_dir / "decision_curve_external.json", decision_curve_summary(y_temporal, p_temporal))
+    write_json(out_dir / "missingness_shift.json", feature_missingness_shift(development, temporal, CORE_FEATURES))
+    write_json(out_dir / "prevalence_check.json", prevalence_payload(frame))
+
+    publication = frame.copy()
+    publication["predicted_probability"] = np.nan
+    publication.loc[publication["cycle"] == TEMPORAL_CYCLE, "predicted_probability"] = p_temporal
+    publication["split"] = np.where(publication["cycle"].isin(DEVELOPMENT_CYCLES), "development", "temporal")
+    predictions_path.parent.mkdir(parents=True, exist_ok=True)
+    publication.to_parquet(predictions_path)
+    return metrics
+
+
+def operating_point(metrics: dict) -> dict:
+    denominator = metrics["tn"] + metrics["fn"]
+    npv = float(metrics["tn"] / denominator) if denominator else 0.0
+    return {
+        "sensitivity": metrics["sensitivity"],
+        "specificity": metrics["specificity"],
+        "ppv": metrics["precision"],
+        "npv": npv,
+    }
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", default="data/processed/nhanes_modeling_frame.parquet")
+    parser.add_argument("--out-dir", default="results")
+    parser.add_argument("--predictions", default="data/processed/publication_predictions.parquet")
+    parser.add_argument("--bootstrap", type=int, default=500)
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+
+    frame = load_frame(Path(args.input))
+    metrics = write_temporal_artifacts(frame, Path(args.out_dir), Path(args.predictions), args.bootstrap, args.seed)
+    print(
+        "Temporal validation complete: "
+        f"AUC={metrics['metrics']['auc']['mean']:.4f}, "
+        f"PR-AUC={metrics['metrics']['prauc']['mean']:.4f}, "
+        f"Brier={metrics['metrics']['brier']['mean']:.4f}"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_v13_primary.sh b/scripts/run_v13_primary.sh
index 42e5341..5ae22e1 100755
--- a/scripts/run_v13_primary.sh
+++ b/scripts/run_v13_primary.sh
@@ -43,22 +43,8 @@ print('All dependencies installed')
 "
 echo ""
 
-# Run notebook sections via papermill (if available) or jupyter nbconvert
-echo "Running notebook..."
-echo ""
-
-if command -v papermill &> /dev/null; then
-    echo "Using papermill..."
-    papermill notebooks/00_nhanes_periodontitis_end_to_end.ipynb \
-               notebooks/00_nhanes_periodontitis_end_to_end_executed.ipynb \
-               --no-progress-bar
-else
-    echo "Using jupyter nbconvert..."
-    jupyter nbconvert --to notebook --execute \
-        --ExecutePreprocessor.timeout=3600 \
-        --output 00_nhanes_periodontitis_end_to_end_executed.ipynb \
-        notebooks/00_nhanes_periodontitis_end_to_end.ipynb
-fi
+echo "Running maintained Python reproduction script..."
+"${PYTHON:-python3}" scripts/reproduce_v13_primary.py
 
 echo ""
 echo "============================================================"
@@ -73,14 +59,3 @@ echo "   - results/v13_featuredrop.json"
 echo "   - results/v13_nan_ablation.json"
 echo "   - results/v13_shap_summary.json"
 echo ""
-echo "Figures saved to:"
-echo "   - figures/14_v13_operating_points.png"
-echo "   - figures/15_shap_beeswarm.png"
-echo "   - figures/16_shap_importance.png"
-echo "   - figures/17_shap_dependence.png"
-echo "   - figures/18_nan_ablation.png"
-echo ""
-echo "Git tags available:"
-echo "   - v1.3-primary-norc"
-echo "   - v1.3-secondary-full"
-echo ""
diff --git a/scripts/verify_submission.py b/scripts/verify_submission.py
new file mode 100644
index 0000000..7d0a2f6
--- /dev/null
+++ b/scripts/verify_submission.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Lightweight submission-readiness checks that do not run the full reproduction."""
+
+from __future__ import annotations
+
+import json
+import shutil
+import sys
+import urllib.request
+from pathlib import Path
+
+import yaml
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from scripts.check_publication_consistency import check_docs, check_result_files
+from scripts import download_nhanes
+
+
+PUBLICATION_FILES = [
+    ROOT / "README.md",
+    ROOT / "MODEL_CARD.md",
+    ROOT / "docs/publication/ARTICLE_DRAFT.md",
+    ROOT / "docs/publication/ARTICLE_PEER_REVIEW.md",
+    ROOT / "results/v13_primary_norc_summary.json",
+    ROOT / "results/v13_secondary_full_summary.json",
+    ROOT / "results/external_0910_metrics.json",
+]
+
+
+def check_json_files() -> None:
+    for path in sorted((ROOT / "results").glob("*.json")):
+        with path.open("r", encoding="utf-8") as f:
+            json.load(f)
+
+
+def check_yaml_files() -> None:
+    for path in [ROOT / "CITATION.cff", ROOT / "configs/config.yaml"]:
+        with path.open("r", encoding="utf-8") as f:
+            if yaml.safe_load(f) is None:
+                raise AssertionError(f"{path.relative_to(ROOT)} parsed to empty YAML")
+
+
+def check_reproduction_hooks() -> None:
+    for script in ["scripts/run_v13_primary.sh", "scripts/run_external_validation.sh"]:
+        text = (ROOT / script).read_text(encoding="utf-8")
+        if "nbconvert" in text or "papermill" in text:
+            raise AssertionError(f"{script} still uses notebook execution.")
+    makefile = (ROOT / "Makefile").read_text(encoding="utf-8")
+    for target in ["verify-submission", "reproduce-full", "scripts/reproduce_v13_primary.py", "scripts/run_temporal_validation.py"]:
+        if target not in makefile:
+            raise AssertionError(f"Makefile missing expected submission target or script reference: {target}")
+
+
+def check_temporal_metric_shape() -> None:
+    metrics = json.loads((ROOT / "results/external_0910_metrics.json").read_text(encoding="utf-8"))
+    for metric in ["auc", "prauc", "brier"]:
+        payload = metrics["metrics"][metric]
+        if "mean" not in payload or "ci95" not in payload or len(payload["ci95"]) != 2:
+            raise AssertionError(f"Temporal metric missing mean/ci95 shape: {metric}")
+    for point in ["rule_out_t_0.35", "balanced_t_0.65"]:
+        payload = metrics["operating_points"][point]
+        for field in ["sensitivity", "specificity", "ppv", "npv"]:
+            if field not in payload:
+                raise AssertionError(f"Temporal operating point {point} missing {field}")
+
+
+def check_publication_wording() -> None:
+    banned = ["Publication Ready", "External Validation", "clinical deployment", "negative result rules out"]
+    for path in PUBLICATION_FILES:
+        text = path.read_text(encoding="utf-8")
+        for phrase in banned:
+            if phrase in text:
+                raise AssertionError(f"{path.relative_to(ROOT)} contains banned phrase: {phrase}")
+
+
+def check_nhanes_urls() -> None:
+    urls = [
+        download_nhanes.NHANES_FILES["2009-2010"]["demographics"],
+        download_nhanes.NHANES_FILES["2011-2012"]["periodontal"],
+        download_nhanes.NHANES_FILES["2013-2014"]["hdl"],
+    ]
+    for url in urls:
+        with urllib.request.urlopen(url, timeout=20) as response:
+            head = response.read(20)
+        if not head.startswith(b"HEADER RECORD"):
+            raise AssertionError(f"CDC URL did not return an XPT header: {url}")
+
+
+def check_manuscript_render_support() -> None:
+    if shutil.which("pandoc") is None:
+        print("pandoc not installed; PDF render check skipped.")
+    else:
+        print("pandoc installed; `make manuscript` can render PDF if a PDF engine is available.")
+
+
+def main() -> None:
+    check_json_files()
+    check_yaml_files()
+    check_result_files()
+    check_docs()
+    check_reproduction_hooks()
+    check_temporal_metric_shape()
+    check_publication_wording()
+    check_nhanes_urls()
+    check_manuscript_render_support()
+    print("Submission-readiness checks passed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/reproduction.py b/src/reproduction.py
new file mode 100644
index 0000000..8b5bfea
--- /dev/null
+++ b/src/reproduction.py
@@ -0,0 +1,414 @@
+"""Script-backed reproduction utilities for the NHANES benchmark."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Iterable
+
+import numpy as np
+import pandas as pd
+from sklearn.calibration import calibration_curve
+from sklearn.isotonic import IsotonicRegression
+from sklearn.metrics import average_precision_score, brier_score_loss, roc_auc_score
+from sklearn.model_selection import StratifiedKFold, train_test_split
+
+from src.evaluation import compute_metrics
+
+
+SEED = 42
+DEVELOPMENT_CYCLES = ("2011-2012", "2013-2014")
+TEMPORAL_CYCLE = "2009-2010"
+
+CORE_FEATURES = [
+    "age",
+    "sex",
+    "education",
+    "bmi",
+    "waist_cm",
+    "waist_height",
+    "height_cm",
+    "systolic_bp",
+    "diastolic_bp",
+    "glucose",
+    "triglycerides",
+    "hdl",
+    "smoke_current",
+    "smoke_former",
+    "alcohol_current",
+]
+MISSING_INDICATOR_FEATURES = [f"{feature}_missing" for feature in CORE_FEATURES if feature != "age"]
+TREATMENT_SEEKING_FEATURES = ["dental_visit", "floss_days", "mobile_teeth", "floss_days_missing"]
+PRIMARY_FEATURES = CORE_FEATURES + MISSING_INDICATOR_FEATURES
+SECONDARY_FEATURES = PRIMARY_FEATURES + TREATMENT_SEEKING_FEATURES
+
+RULE_OUT_THRESHOLD = 0.35
+BALANCED_THRESHOLD = 0.65
+SECONDARY_RULE_OUT_THRESHOLD = 0.37
+SECONDARY_BALANCED_THRESHOLD = 0.67
+
+
+@dataclass
+class FittedEnsemble:
+    models: list
+    calibrator: IsotonicRegression
+    features: list[str]
+
+    def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
+        raw = ensemble_raw_probability(self.models, X[self.features])
+        return np.asarray(self.calibrator.predict(raw), dtype=float)
+
+
+def write_json(path: Path | str, payload) -> None:
+    output = Path(path)
+    output.parent.mkdir(parents=True, exist_ok=True)
+    output.write_text(json.dumps(payload, indent=2, allow_nan=False), encoding="utf-8")
+
+
+def timestamp() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
+
+
+def clean_numeric(series: pd.Series, missing_codes: Iterable[float] = ()) -> pd.Series:
+    values = pd.to_numeric(series, errors="coerce")
+    return values.mask(values.isin(list(missing_codes)))
+
+
+def first_existing(df: pd.DataFrame, columns: Iterable[str], default=np.nan) -> pd.Series:
+    for column in columns:
+        if column in df.columns:
+            return df[column]
+    return pd.Series(default, index=df.index)
+
+
+def average_columns(df: pd.DataFrame, columns: Iterable[str]) -> pd.Series:
+    existing = [column for column in columns if column in df.columns]
+    if not existing:
+        return pd.Series(np.nan, index=df.index)
+    return df[existing].apply(pd.to_numeric, errors="coerce").mean(axis=1)
+
+
+def build_modeling_frame(df: pd.DataFrame) -> pd.DataFrame:
+    """Build the maintained 29/33-feature modeling frame from processed NHANES data."""
+    frame = pd.DataFrame(index=df.index)
+    frame["participant_id"] = first_existing(df, ["participant_id", "SEQN"])
+    frame["cycle"] = df["cycle"]
+    frame["age"] = clean_numeric(first_existing(df, ["age", "RIDAGEYR"]))
+    frame["sex"] = clean_numeric(first_existing(df, ["sex", "RIAGENDR"]), missing_codes=(7, 9)).map({1: 1.0, 2: 0.0})
+    frame["education"] = clean_numeric(first_existing(df, ["education", "DMDEDUC2"]), missing_codes=(7, 9))
+
+    frame["bmi"] = clean_numeric(first_existing(df, ["bmi", "BMXBMI"]))
+    frame["waist_cm"] = clean_numeric(first_existing(df, ["waist_cm", "waist_circumference", "BMXWAIST"]))
+    frame["height_cm"] = clean_numeric(first_existing(df, ["height_cm", "height", "BMXHT"]))
+    frame["waist_height"] = frame["waist_cm"] / frame["height_cm"].replace(0, np.nan)
+
+    frame["systolic_bp"] = clean_numeric(first_existing(df, ["systolic_bp"]))
+    if frame["systolic_bp"].isna().all():
+        frame["systolic_bp"] = average_columns(df, ["systolic_bp_1", "systolic_bp_2", "systolic_bp_3"])
+    frame["diastolic_bp"] = clean_numeric(first_existing(df, ["diastolic_bp"]))
+    if frame["diastolic_bp"].isna().all():
+        frame["diastolic_bp"] = average_columns(df, ["diastolic_bp_1", "diastolic_bp_2", "diastolic_bp_3"])
+
+    frame["glucose"] = clean_numeric(first_existing(df, ["glucose", "fasting_glucose", "LBXGLU"]))
+    frame["triglycerides"] = clean_numeric(first_existing(df, ["triglycerides", "LBXTR"]))
+    frame["hdl"] = clean_numeric(first_existing(df, ["hdl", "LBDHDD"]))
+
+    smoking_now = clean_numeric(first_existing(df, ["smoking_now", "SMQ040"]), missing_codes=(7, 9))
+    smoked_100 = clean_numeric(first_existing(df, ["smoked_100_cigs", "SMQ020"]), missing_codes=(7, 9))
+    frame["smoke_current"] = smoking_now.isin([1, 2]).astype(float).mask(smoking_now.isna())
+    frame["smoke_former"] = ((smoked_100 == 1) & (smoking_now == 3)).astype(float).mask(
+        smoked_100.isna() & smoking_now.isna()
+    )
+
+    alcohol_year = clean_numeric(first_existing(df, ["ever_12_drinks_year", "ALQ101"]), missing_codes=(7, 9))
+    alcohol_lifetime = clean_numeric(first_existing(df, ["ever_12_drinks_lifetime", "ALQ110"]), missing_codes=(7, 9))
+    alcohol_source = alcohol_year.where(alcohol_year.notna(), alcohol_lifetime)
+    frame["alcohol_current"] = (alcohol_source == 1).astype(float).mask(alcohol_source.isna())
+
+    dental_visit = clean_numeric(first_existing(df, ["time_since_dental_visit", "OHQ030"]), missing_codes=(7, 9))
+    floss_days = clean_numeric(first_existing(df, ["floss_days_per_week", "OHQ620"]), missing_codes=(77, 99))
+    loose_teeth = clean_numeric(first_existing(df, ["loose_teeth", "mobile_teeth", "OHQ845", "OHQ680"]), missing_codes=(7, 9))
+    frame["dental_visit"] = (dental_visit <= 2).astype(float).mask(dental_visit.isna())
+    frame["floss_days"] = floss_days.where(floss_days.between(0, 7))
+    frame["mobile_teeth"] = (loose_teeth == 1).astype(float).mask(loose_teeth.isna())
+
+    for feature in CORE_FEATURES:
+        if feature != "age":
+            frame[f"{feature}_missing"] = frame[feature].isna().astype(int)
+    frame["floss_days_missing"] = frame["floss_days"].isna().astype(int)
+
+    outcome = first_existing(df, ["has_periodontitis", "periodontitis_binary"])
+    frame["has_periodontitis"] = outcome.astype(int)
+    frame["perio_class"] = first_existing(df, ["perio_class"], default="")
+    for column in ["exam_weight", "fasting_weight", "survey_psu", "survey_strata"]:
+        if column in df.columns:
+            frame[column] = df[column]
+
+    frame["age_group"] = pd.cut(
+        frame["age"],
+        bins=[29, 44, 64, np.inf],
+        labels=["30-44", "45-64", "65+"],
+        include_lowest=True,
+    ).astype(object)
+    frame["smoking"] = np.select(
+        [frame["smoke_current"] == 1, frame["smoke_former"] == 1],
+        ["current", "former"],
+        default="never/unknown",
+    )
+    frame["metabolic_risk"] = np.where(
+        (frame["bmi"] >= 30)
+        | (frame["glucose"] >= 126)
+        | (frame["triglycerides"] >= 150)
+        | (frame["systolic_bp"] >= 130)
+        | (frame["diastolic_bp"] >= 80),
+        "elevated",
+        "not_elevated",
+    )
+
+    return frame
+
+
+def feature_sets() -> dict[str, list[str]]:
+    return {
+        "deployment_ready": CORE_FEATURES,
+        "primary": PRIMARY_FEATURES,
+        "secondary": SECONDARY_FEATURES,
+    }
+
+
+def assert_feature_contract(frame: pd.DataFrame) -> None:
+    missing = sorted(set(SECONDARY_FEATURES + ["has_periodontitis", "cycle"]) - set(frame.columns))
+    if missing:
+        raise KeyError(f"Modeling frame missing required columns: {missing}")
+    if len(PRIMARY_FEATURES) != 29 or len(SECONDARY_FEATURES) != 33:
+        raise AssertionError("Feature contract must remain 29 primary and 33 secondary predictors.")
+
+
+def build_base_models(seed: int = SEED) -> list:
+    from catboost import CatBoostClassifier
+    from lightgbm import LGBMClassifier
+    from xgboost import XGBClassifier
+
+    return [
+        CatBoostClassifier(
+            iterations=200,
+            depth=5,
+            learning_rate=0.05,
+            loss_function="Logloss",
+            random_seed=seed,
+            verbose=False,
+            allow_writing_files=False,
+        ),
+        XGBClassifier(
+            n_estimators=200,
+            max_depth=3,
+            learning_rate=0.05,
+            subsample=0.9,
+            colsample_bytree=0.8,
+            eval_metric="logloss",
+            random_state=seed,
+        ),
+        LGBMClassifier(
+            n_estimators=200,
+            learning_rate=0.05,
+            num_leaves=31,
+            subsample=0.9,
+            colsample_bytree=0.8,
+            random_state=seed,
+            verbose=-1,
+        ),
+    ]
+
+
+def ensemble_raw_probability(models: list, X: pd.DataFrame) -> np.ndarray:
+    probabilities = [model.predict_proba(X)[:, 1] for model in models]
+    return np.mean(probabilities, axis=0)
+
+
+def fit_calibrated_ensemble(
+    X: pd.DataFrame,
+    y: pd.Series,
+    features: list[str],
+    seed: int = SEED,
+    calibration_fraction: float = 0.2,
+) -> FittedEnsemble:
+    y = y.astype(int)
+    if y.nunique() < 2:
+        raise ValueError("Model fitting requires both outcome classes.")
+
+    X_fit, X_cal, y_fit, y_cal = train_test_split(
+        X[features],
+        y,
+        test_size=calibration_fraction,
+        random_state=seed,
+        stratify=y,
+    )
+    models = build_base_models(seed)
+    for model in models:
+        model.fit(X_fit, y_fit)
+    raw_cal = ensemble_raw_probability(models, X_cal)
+    calibrator = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
+    calibrator.fit(raw_cal, y_cal)
+    return FittedEnsemble(models=models, calibrator=calibrator, features=features)
+
+
+def cross_validated_predictions(
+    frame: pd.DataFrame,
+    features: list[str],
+    n_folds: int = 5,
+    seed: int = SEED,
+) -> np.ndarray:
+    y = frame["has_periodontitis"].astype(int)
+    predictions = np.zeros(len(frame), dtype=float)
+    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
+    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(frame[features], y), start=1):
+        fitted = fit_calibrated_ensemble(
+            frame.iloc[train_idx],
+            y.iloc[train_idx],
+            features,
+            seed=seed + fold_idx,
+        )
+        predictions[test_idx] = fitted.predict_proba(frame.iloc[test_idx])
+    return predictions
+
+
+def summarize_predictions(
+    y_true: pd.Series,
+    y_prob: np.ndarray,
+    n_features: int,
+    name: str,
+    rule_out_threshold: float = RULE_OUT_THRESHOLD,
+    balanced_threshold: float = BALANCED_THRESHOLD,
+) -> dict:
+    rule_out = compute_metrics(y_true, y_prob, threshold=rule_out_threshold)
+    balanced = compute_metrics(y_true, y_prob, threshold=balanced_threshold)
+    return {
+        "name": name,
+        "n_features": int(n_features),
+        "auc": float(roc_auc_score(y_true, y_prob)),
+        "pr_auc": float(average_precision_score(y_true, y_prob)),
+        "brier_score": float(brier_score_loss(y_true, y_prob)),
+        "rule_out_recall": rule_out["sensitivity"],
+        "rule_out_specificity": rule_out["specificity"],
+        "balanced_recall": balanced["sensitivity"],
+        "balanced_specificity": balanced["specificity"],
+        "rule_out": rule_out,
+        "balanced": balanced,
+    }
+
+
+def summary_artifact(
+    summary: dict,
+    model_name: str,
+    description: str,
+    dataset: str,
+    prevalence: float,
+    threshold_rule_out: float,
+    threshold_balanced: float,
+    extra: dict | None = None,
+) -> dict:
+    payload = {
+        "model": model_name,
+        "description": description,
+        "dataset": dataset,
+        "n_samples": int(summary["rule_out"]["tn"] + summary["rule_out"]["fp"] + summary["rule_out"]["fn"] + summary["rule_out"]["tp"]),
+        "n_features": int(summary["n_features"]),
+        "prevalence": float(prevalence),
+        "metrics": {
+            "auc_roc": summary["auc"],
+            "pr_auc": summary["pr_auc"],
+            "brier_score": summary["brier_score"],
+        },
+        "operating_points": {
+            "rule_out": operating_point_payload(summary["rule_out"], threshold_rule_out, "High-sensitivity triage"),
+            "balanced": operating_point_payload(summary["balanced"], threshold_balanced, "Balanced triage"),
+        },
+        "confusion_matrix": {
+            "rule_out": confusion_payload(summary["rule_out"], threshold_rule_out),
+            "balanced": confusion_payload(summary["balanced"], threshold_balanced),
+        },
+        "calibration": "nested_isotonic_regression",
+        "timestamp": timestamp(),
+    }
+    if extra:
+        payload.update(extra)
+    return payload
+
+
+def operating_point_payload(metrics: dict, threshold: float, use_case: str) -> dict:
+    return {
+        "name": use_case,
+        "threshold": float(threshold),
+        "recall": metrics["sensitivity"],
+        "specificity": metrics["specificity"],
+        "precision": metrics["precision"],
+        "npv": npv(metrics),
+        "f1": metrics["f1_score"],
+        "use_case": "Triage operating point; periodontal examination remains required.",
+    }
+
+
+def confusion_payload(metrics: dict, threshold: float) -> dict:
+    return {
+        "threshold": float(threshold),
+        "tp": int(metrics["tp"]),
+        "fp": int(metrics["fp"]),
+        "tn": int(metrics["tn"]),
+        "fn": int(metrics["fn"]),
+    }
+
+
+def npv(metrics: dict) -> float:
+    denominator = metrics["tn"] + metrics["fn"]
+    return float(metrics["tn"] / denominator) if denominator else 0.0
+
+
+def bootstrap_ci(
+    y_true: pd.Series,
+    y_prob: np.ndarray,
+    metric_fn,
+    n_bootstrap: int = 500,
+    seed: int = SEED,
+) -> tuple[float, list[float]]:
+    rng = np.random.default_rng(seed)
+    y = np.asarray(y_true).astype(int)
+    p = np.asarray(y_prob).astype(float)
+    scores = []
+    for _ in range(n_bootstrap):
+        idx = rng.integers(0, len(y), len(y))
+        if len(np.unique(y[idx])) < 2:
+            continue
+        scores.append(float(metric_fn(y[idx], p[idx])))
+    mean = float(metric_fn(y, p))
+    if not scores:
+        return mean, [float("nan"), float("nan")]
+    return mean, [float(np.percentile(scores, 2.5)), float(np.percentile(scores, 97.5))]
+
+
+def feature_missingness_shift(train: pd.DataFrame, temporal: pd.DataFrame, features: Iterable[str]) -> dict:
+    rates = {}
+    deltas = {}
+    for feature in features:
+        train_rate = float(train[feature].isna().mean() * 100)
+        temporal_rate = float(temporal[feature].isna().mean() * 100)
+        delta = temporal_rate - train_rate
+        rates[feature] = {
+            "train_2011_2014": train_rate,
+            "temporal_2009_2010": temporal_rate,
+            "delta": delta,
+        }
+        deltas[feature] = abs(delta)
+    max_feature = max(deltas, key=deltas.get) if deltas else None
+    return {
+        "description": "Per-feature NaN rates: NHANES 2011-2014 training data vs 2009-2010 same-source temporal validation data",
+        "feature_nan_rates": rates,
+        "flags": {
+            "delta_exceeds_10pct": [feature for feature, delta in deltas.items() if delta > 10],
+            "max_delta_feature": max_feature,
+            "max_delta_value": deltas.get(max_feature, 0.0) if max_feature else 0.0,
+        },
+        "interpretation": "Missingness shift is reported descriptively; transportability remains limited to NHANES-like data.",
+        "timestamp": timestamp(),
+    }
diff --git a/tests/test_reproduction_contract.py b/tests/test_reproduction_contract.py
new file mode 100644
index 0000000..82851d3
--- /dev/null
+++ b/tests/test_reproduction_contract.py
@@ -0,0 +1,59 @@
+import pandas as pd
+
+from src.reproduction import (
+    PRIMARY_FEATURES,
+    SECONDARY_FEATURES,
+    assert_feature_contract,
+    build_modeling_frame,
+    feature_sets,
+)
+
+
+def toy_processed_frame():
+    return pd.DataFrame(
+        {
+            "participant_id": [1, 2],
+            "cycle": ["2011-2012", "2009-2010"],
+            "age": [45, 67],
+            "sex": [1, 2],
+            "education": [4, 2],
+            "bmi": [26.0, None],
+            "waist_circumference": [90.0, 101.0],
+            "height_cm": [170.0, 165.0],
+            "systolic_bp_1": [120, 140],
+            "systolic_bp_2": [122, 142],
+            "diastolic_bp_1": [75, 85],
+            "diastolic_bp_2": [76, 86],
+            "fasting_glucose": [95, None],
+            "triglycerides": [130, 180],
+            "hdl": [55, 42],
+            "smoked_100_cigs": [1, 2],
+            "smoking_now": [3, None],
+            "ever_12_drinks_year": [1, 2],
+            "time_since_dental_visit": [2, 4],
+            "floss_days_per_week": [5, None],
+            "loose_teeth": [2, 1],
+            "has_periodontitis": [1, 0],
+            "perio_class": ["moderate", "none"],
+            "exam_weight": [1.2, 0.8],
+        }
+    )
+
+
+def test_feature_contract_is_29_primary_and_33_secondary():
+    sets = feature_sets()
+
+    assert len(PRIMARY_FEATURES) == 29
+    assert len(SECONDARY_FEATURES) == 33
+    assert len(sets["deployment_ready"]) == 15
+    assert set(sets["primary"]).issubset(set(sets["secondary"]))
+
+
+def test_build_modeling_frame_produces_required_columns_and_subgroups():
+    frame = build_modeling_frame(toy_processed_frame())
+
+    assert_feature_contract(frame)
+    assert frame.loc[0, "waist_height"] == 90.0 / 170.0
+    assert frame.loc[1, "bmi_missing"] == 1
+    assert frame.loc[0, "floss_days_missing"] == 0
+    assert {"age_group", "smoking", "metabolic_risk", "exam_weight"}.issubset(frame.columns)

From 92376f9f930f6e306550af3dbd81c72fa1a0588e Mon Sep 17 00:00:00 2001
From: Tuminha <83960151+Tuminha@users.noreply.github.com>
Date: Tue, 2 Jun 2026 10:21:42 +0200
Subject: [PATCH 2/3] Update artifacts from full reproduction

---
 MODEL_CARD.md                               |  16 +-
 Makefile                                    |   4 +-
 README.md                                   |  17 +-
 docs/publication/ARTICLE_DRAFT.md           |  44 +--
 reports/ARTICLE_DRAFT_line_numbered.txt     |  44 +--
 results/decision_curve_external.json        |  10 +-
 results/external_0910_metrics.json          |  44 +--
 results/external_summary.json               |  41 +--
 results/missingness_shift.json              |  98 +++++-
 results/prevalence_check.json               |  60 +---
 results/publication_sensitivity_tables.json | 359 ++++++++++++++++++++
 results/publication_sensitivity_tables.md   |  30 ++
 results/v13_featuredrop.json                |  39 +--
 results/v13_nan_ablation.json               |  83 +++--
 results/v13_operating_points.json           |  91 +++--
 results/v13_primary_norc_summary.json       |  81 +++--
 results/v13_secondary_full_summary.json     |  81 +++--
 results/v13_shap_summary.json               |  55 +--
 scripts/verify_submission.py                |  21 ++
 19 files changed, 820 insertions(+), 398 deletions(-)
 create mode 100644 results/publication_sensitivity_tables.json
 create mode 100644 results/publication_sensitivity_tables.md

diff --git a/MODEL_CARD.md b/MODEL_CARD.md
index e0fb0e9..ce6fd14 100644
--- a/MODEL_CARD.md
+++ b/MODEL_CARD.md
@@ -6,8 +6,8 @@
 |---|---|
 | Model name | `v1.3_primary_no_reverse_causality` |
 | Model type | Calibrated soft-voting ensemble of CatBoost, XGBoost, and LightGBM |
-| Development data | NHANES 2011-2014 adults age 30+ with full periodontal examination, `n=9,379` |
-| Same-source temporal validation data | NHANES 2009-2010, `n=5,177` |
+| Development data | NHANES 2011-2014 adults age 30+ with full periodontal examination, `n=9,034` |
+| Same-source temporal validation data | NHANES 2009-2010, `n=5,037` |
 | Outcome | Any CDC/AAP periodontitis versus no periodontitis |
 | Primary feature count | 29 predictors |
 | Secondary feature count | 33 predictors |
@@ -21,16 +21,16 @@ This model is intended for research benchmarking, methods comparison, and risk-s
 
 | Evaluation | AUC-ROC | PR-AUC | Brier | Notes |
 |---|---:|---:|---:|---|
-| Internal 5-fold CV, primary 29-feature model | 0.7172 | 0.8157 | 0.1812 | Excludes treatment-seeking variables |
-| Internal 5-fold CV, secondary 33-feature model | 0.7255 | 0.8207 | 0.1793 | Includes treatment-seeking variables |
-| Same-source temporal validation, frozen primary model | 0.6771 | 0.7735 | 0.2003 | NHANES 2009-2010 |
+| Internal 5-fold CV, primary 29-feature model | 0.6896 | 0.8240 | 0.1871 | Excludes treatment-seeking variables |
+| Internal 5-fold CV, secondary 33-feature model | 0.6996 | 0.8295 | 0.1844 | Includes treatment-seeking variables |
+| Same-source temporal validation, frozen primary model | 0.6495 | 0.7727 | 0.2023 | NHANES 2009-2010 |
 
 Temporal operating points for the frozen primary model:
 
 | Threshold | Sensitivity | Specificity | PPV | NPV | Appropriate interpretation |
 |---:|---:|---:|---:|---:|---|
-| 0.35 | 97.1% | 18.1% | 70.8% | 75.2% | High-sensitivity triage threshold; many false positives and some false negatives remain |
-| 0.65 | 82.6% | 43.3% | 74.9% | 54.9% | More balanced threshold; still not sufficient for diagnosis |
+| 0.35 | 98.9% | 5.5% | 70.0% | 69.1% | High-sensitivity triage threshold; many false positives and some false negatives remain |
+| 0.65 | 77.7% | 45.2% | 76.0% | 47.5% | More balanced threshold; still not sufficient for diagnosis |
 
 ## Feature Sets
 
@@ -58,7 +58,7 @@ The temporal validation cohort is useful because the model is frozen and evaluat
 
 Known applicability limits:
 
-- High analytic-sample prevalence, around 67-68%, limits direct PPV/NPV transfer to lower-prevalence populations.
+- High analytic-sample prevalence, around 66-72% depending on cycle and weighting, limits direct PPV/NPV transfer to lower-prevalence populations.
 - Missingness indicators may learn survey logistics, so the deployment-ready no-indicator model should be reported as a conservative benchmark.
 - Subgroup calibration and discrimination should be regenerated before journal submission using `scripts/04_publication_analyses.py`.
 - Any implementation outside NHANES-like research data requires local recalibration and independent safety assessment.
diff --git a/Makefile b/Makefile
index 618f733..872cc2d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,4 @@
+SHELL := /bin/bash
 PYTHON ?= ./venv/bin/python
 
 .PHONY: help setup setup-lock download process train reproduce temporal test consistency verify-submission reproduce-full notebook clean figures lock dirs manuscript
@@ -89,7 +90,8 @@ temporal:
 
 reproduce-full:
 	@mkdir -p logs
-	@LOG="logs/full_reproduction_$$(date -u +%Y%m%dT%H%M%SZ).log"; \
+	@set -euo pipefail; \
+	LOG="logs/full_reproduction_$$(date -u +%Y%m%dT%H%M%SZ).log"; \
 	echo "Writing full reproduction log to $$LOG"; \
 	{ \
 		$(MAKE) download; \
diff --git a/README.md b/README.md
index 3a8fde9..3d58d89 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@ This repository contains a reproducible benchmark of low-cost predictors for per
 
 ## Current Study Framing
 
-- Development cohort: NHANES 2011-2014 adults age 30+ with full periodontal examination, `n=9,379`.
-- Same-source temporal validation cohort: NHANES 2009-2010, `n=5,177`.
+- Development cohort: NHANES 2011-2014 adults age 30+ with full periodontal examination, `n=9,034`.
+- Same-source temporal validation cohort: NHANES 2009-2010, `n=5,037`.
 - Outcome: any periodontitis versus no periodontitis using CDC/AAP case definitions.
 - Primary model: calibrated soft-voting ensemble with 29 predictors after excluding treatment-seeking/reverse-causality variables.
 - Secondary model: 33 predictors with the treatment-seeking variables restored for upper-bound sensitivity analysis.
@@ -17,18 +17,18 @@ These values are the source-of-truth values enforced by `scripts/check_publicati
 
 | Analysis | Model | Features | AUC-ROC | PR-AUC | Notes |
 |---|---:|---:|---:|---:|---|
-| Internal 5-fold CV | Primary no reverse-causality | 29 | 0.7172 | 0.8157 | Main development estimate |
-| Internal 5-fold CV | Secondary full-feature | 33 | 0.7255 | 0.8207 | Adds dental visit, flossing, loose teeth, and floss-missing flag |
-| Same-source temporal validation | Frozen primary model on 2009-2010 | 29 | 0.6771 | 0.7735 | Same survey system, earlier cycle |
+| Internal 5-fold CV | Primary no reverse-causality | 29 | 0.6896 | 0.8240 | Main development estimate |
+| Internal 5-fold CV | Secondary full-feature | 33 | 0.6996 | 0.8295 | Adds dental visit, flossing, loose teeth, and floss-missing flag |
+| Same-source temporal validation | Frozen primary model on 2009-2010 | 29 | 0.6495 | 0.7727 | Same survey system, earlier cycle |
 
 Temporal operating points for the frozen primary model:
 
 | Threshold | Sensitivity | Specificity | PPV | NPV | Interpretation |
 |---:|---:|---:|---:|---:|---|
-| 0.35 | 97.1% | 18.1% | 70.8% | 75.2% | High-sensitivity triage; negative screens are not definitive |
-| 0.65 | 82.6% | 43.3% | 74.9% | 54.9% | More balanced but still requires clinical confirmation |
+| 0.35 | 98.9% | 5.5% | 70.0% | 69.1% | High-sensitivity triage; negative screens are not definitive |
+| 0.65 | 77.7% | 45.2% | 76.0% | 47.5% | More balanced but still requires clinical confirmation |
 
-The key conclusion is deliberately modest: with these low-cost predictors, discrimination is around 0.72 internally and around 0.68 under same-source temporal validation. The observed performance is useful as a benchmark, not as proof of readiness for clinical implementation.
+The key conclusion is deliberately modest: with these low-cost predictors, discrimination is around 0.69 internally and around 0.65 under same-source temporal validation. The observed performance is useful as a benchmark, not as proof of readiness for clinical implementation.
 
 ## Reproducibility
 
@@ -74,6 +74,7 @@ The legacy notebooks are retired as source-of-truth artifacts. The maintained pu
 | `scripts/reproduce_v13_primary.py` | Regenerates internal v1.3 benchmark result artifacts |
 | `scripts/run_temporal_validation.py` | Regenerates same-source temporal validation artifacts |
 | `scripts/04_publication_analyses.py` | Generates publication sensitivity tables from processed predictions |
+| `results/publication_sensitivity_tables.md` | Survey-weighted prevalence and subgroup performance summary generated by the full reproduction |
 | `results/` | Saved result artifacts used by the manuscript and model card |
 | `docs/publication/ARTICLE_DRAFT.md` | Current manuscript source |
 
diff --git a/docs/publication/ARTICLE_DRAFT.md b/docs/publication/ARTICLE_DRAFT.md
index 60e5a50..176f789 100644
--- a/docs/publication/ARTICLE_DRAFT.md
+++ b/docs/publication/ARTICLE_DRAFT.md
@@ -8,11 +8,11 @@
 
 **Background:** Machine-learning studies for periodontitis prediction have reported very high internal discrimination using low-cost NHANES predictors. Such estimates require careful reassessment because internal validation, missing-data handling, calibration, and treatment-seeking predictors can materially change apparent performance.
 
-**Methods:** We analyzed NHANES 2011-2014 adults age 30 years or older with full periodontal examinations (`n=9,379`). Periodontitis was classified using CDC/AAP case definitions. XGBoost, LightGBM, and CatBoost were compared with logistic regression and random forest using stratified 5-fold cross-validation. The primary calibrated ensemble excluded treatment-seeking variables and used 29 predictors. A secondary 33-feature model restored dental visit, flossing, loose teeth, and floss-missing variables to estimate their incremental contribution. The frozen primary model was evaluated on NHANES 2009-2010 (`n=5,177`) as same-source temporal validation. Missingness ablations and deployment-ready feature analysis were used to assess whether NHANES missingness patterns contributed survey-specific signal.
+**Methods:** We analyzed NHANES 2011-2014 adults age 30 years or older with full periodontal examinations (`n=9,034`). Periodontitis was classified using CDC/AAP case definitions. XGBoost, LightGBM, and CatBoost were compared using stratified 5-fold cross-validation. The primary calibrated ensemble excluded treatment-seeking variables and used 29 predictors. A secondary 33-feature model restored dental visit, flossing, loose teeth, and floss-missing variables to estimate their incremental contribution. The frozen primary model was evaluated on NHANES 2009-2010 (`n=5,037`) as same-source temporal validation. Missingness ablations and deployment-ready feature analysis were used to assess whether NHANES missingness patterns contributed survey-specific signal.
 
-**Results:** The primary 29-feature model achieved internal AUC-ROC 0.7172 and PR-AUC 0.8157. The secondary 33-feature model achieved AUC-ROC 0.7255 and PR-AUC 0.8207, indicating that treatment-seeking variables added only 0.0083 AUC. In same-source temporal validation, the frozen primary model achieved AUC-ROC 0.6771, PR-AUC 0.7735, and Brier score 0.2003. At threshold 0.35, sensitivity was 97.1% and specificity was 18.1%; at threshold 0.65, sensitivity was 82.6% and specificity was 43.3%. These operating points are not diagnostic rules and require periodontal examination for confirmation.
+**Results:** The primary 29-feature model achieved internal AUC-ROC 0.6896 and PR-AUC 0.8240. The secondary 33-feature model achieved AUC-ROC 0.6996 and PR-AUC 0.8295, indicating that treatment-seeking variables added 0.0100 AUC. In same-source temporal validation, the frozen primary model achieved AUC-ROC 0.6495, PR-AUC 0.7727, and Brier score 0.2023. At threshold 0.35, sensitivity was 98.9% and specificity was 5.5%; at threshold 0.65, sensitivity was 77.7% and specificity was 45.2%. These operating points are not diagnostic rules and require periodontal examination for confirmation.
 
-**Conclusions:** Low-cost NHANES predictors appear to support realistic discrimination around 0.72 internally and 0.68 under same-source temporal validation. The study is best interpreted as a benchmark and cautionary methods report, not as evidence that a deployable diagnostic screening model has been established. Geographic validation, prospective clinical validation, local recalibration, subgroup calibration, and survey-weighted sensitivity analyses remain necessary before implementation claims.
+**Conclusions:** Low-cost NHANES predictors appear to support realistic discrimination around 0.69 internally and 0.65 under same-source temporal validation. The study is best interpreted as a benchmark and cautionary methods report, not as evidence that a deployable diagnostic screening model has been established. Geographic validation, prospective clinical validation, local recalibration, and subgroup calibration remain necessary before implementation claims.
 
 **Keywords:** periodontitis; NHANES; prediction model; gradient boosting; calibration; missing data; TRIPOD+AI
 
@@ -40,9 +40,9 @@ NHANES data are publicly available and de-identified. This secondary analysis di
 
 ### 2.2 Participants
 
-Eligible participants were adults age 30 years or older with full periodontal examination data and sufficient information for CDC/AAP periodontitis classification. The development cohort included 9,379 participants. The same-source temporal validation cohort included 5,177 participants.
+Eligible participants were adults age 30 years or older with full periodontal examination data and sufficient information for CDC/AAP periodontitis classification. The development cohort included 9,034 participants. The same-source temporal validation cohort included 5,037 participants.
 
-The analytic prevalence of periodontitis was approximately 67-68%, higher than general-population CDC estimates. This reflects the restricted full-examination analytic sample and should not be interpreted as the expected prevalence in a lower-risk screening population.
+The analytic prevalence of periodontitis was approximately 69-72% unweighted and approximately 66% after applying examination weights by cycle, higher than general-population CDC estimates. This reflects the restricted full-examination analytic sample and should not be interpreted as the expected prevalence in a lower-risk screening population.
 
 ### 2.3 Outcome Definition
 
@@ -77,50 +77,50 @@ The final temporal evaluation used the frozen primary model and pre-specified th
 
 Discrimination was summarized with AUC-ROC and PR-AUC. Calibration was summarized with Brier score and reliability plots. Operating points were reported using sensitivity, specificity, PPV, and NPV. Decision-curve analysis was included as a descriptive utility analysis only.
 
-Survey-weighted prevalence and subgroup performance are generated by `scripts/04_publication_analyses.py` when processed prediction tables are available. These analyses should be regenerated immediately before journal submission.
+Survey-weighted prevalence and subgroup performance are generated by `scripts/04_publication_analyses.py` when processed prediction tables are available. The current regenerated summary is saved in `results/publication_sensitivity_tables.md`.
 
 ## 3. Results
 
 ### 3.1 Cohorts
 
-The development cohort included 9,379 adults from NHANES 2011-2014. The temporal validation cohort included 5,177 adults from NHANES 2009-2010. Periodontitis prevalence was 68.3% in development data and 67.2% in temporal validation data.
+The development cohort included 9,034 adults from NHANES 2011-2014. The temporal validation cohort included 5,037 adults from NHANES 2009-2010. Periodontitis prevalence was 70.9% in development data and 69.1% in temporal validation data before survey weighting.
 
 ### 3.2 Internal Model Performance
 
 | Model variant | Features | AUC-ROC | PR-AUC | Interpretation |
 |---|---:|---:|---:|---|
-| Primary model without treatment-seeking variables | 29 | 0.7172 | 0.8157 | Preferred benchmark model |
-| Secondary full-feature model | 33 | 0.7255 | 0.8207 | Upper-bound sensitivity analysis |
-| Deployment-ready core model | 15 | 0.6692 | 0.8137 | Conservative feature set without missingness indicators |
+| Primary model without treatment-seeking variables | 29 | 0.6896 | 0.8240 | Preferred benchmark model |
+| Secondary full-feature model | 33 | 0.6996 | 0.8295 | Upper-bound sensitivity analysis |
+| Deployment-ready core model | 15 | 0.6896 | 0.8237 | Conservative feature set without missingness indicators |
 
-The full-feature model improved AUC by 0.0083 over the primary model. This small difference supports excluding treatment-seeking variables from the primary benchmark.
+The full-feature model improved AUC by 0.0100 over the primary model. This small difference supports excluding treatment-seeking variables from the primary benchmark.
 
 ### 3.3 Same-Source Temporal Validation
 
 | Metric | Development estimate | Temporal validation estimate |
 |---|---:|---:|
-| N | 9,379 | 5,177 |
-| Prevalence | 68.3% | 67.2% |
-| AUC-ROC | 0.7172 | 0.6771 (95% CI 0.6612-0.6934) |
-| PR-AUC | 0.8157 | 0.7735 (95% CI 0.7571-0.7892) |
-| Brier score | 0.1812 | 0.2003 (95% CI 0.1935-0.2073) |
+| N | 9,034 | 5,037 |
+| Prevalence | 70.9% | 69.1% |
+| AUC-ROC | 0.6896 | 0.6495 (95% CI 0.6315-0.6664) |
+| PR-AUC | 0.8240 | 0.7727 (95% CI 0.7570-0.7885) |
+| Brier score | 0.1871 | 0.2023 (95% CI 0.1955-0.2085) |
 
-The drop from internal AUC 0.7172 to temporal AUC 0.6771 is consistent with a modest but meaningful generalization gap.
+The drop from internal AUC 0.6896 to temporal AUC 0.6495 is consistent with a modest but meaningful generalization gap.
 
 ### 3.4 Operating Points
 
 | Threshold | Sensitivity | Specificity | PPV | NPV |
 |---:|---:|---:|---:|---:|
-| 0.35 | 97.1% | 18.1% | 70.8% | 75.2% |
-| 0.65 | 82.6% | 43.3% | 74.9% | 54.9% |
+| 0.35 | 98.9% | 5.5% | 70.0% | 69.1% |
+| 0.65 | 77.7% | 45.2% | 76.0% | 47.5% |
 
-The 0.35 threshold prioritizes sensitivity but has very low specificity and an NPV of only 75.2% in this high-prevalence cohort. It should be described as a high-sensitivity triage operating point, not a reliable disease-exclusion rule.
+The 0.35 threshold prioritizes sensitivity but has very low specificity and an NPV of only 69.1% in this high-prevalence cohort. It should be described as a high-sensitivity triage operating point, not a reliable disease-exclusion rule.
 
 ### 3.5 Missingness and Survey-Design Sensitivity
 
 Missingness indicators contributed limited but measurable predictive signal. Missingness patterns were broadly comparable between the development and temporal validation cohorts, but the signal may still be NHANES-specific. The deployment-ready 15-feature model remains the most conservative estimate for settings where NHANES missingness patterns are unavailable.
 
-Survey-weighted prevalence and subgroup-performance tables should be generated from processed prediction data before submission. The repository now includes reusable code for these analyses, but the current manuscript should not claim population-level transportability until those tables are complete and reviewed.
+Survey-weighted prevalence and subgroup-performance tables were generated from processed prediction data and are saved in `results/publication_sensitivity_tables.md`. Weighted prevalence was approximately 65.6% in 2009-2010 and 66.2-66.3% in 2011-2014. Subgroup analyses are descriptive and should not be overinterpreted as evidence of transportability.
 
 ## 4. Discussion
 
@@ -141,7 +141,7 @@ The operating points also require conservative interpretation. High sensitivity
 
 ## 6. Conclusions
 
-The primary 29-feature model achieved AUC-ROC 0.7172 internally and 0.6771 under same-source temporal validation. A secondary 33-feature model provided only a small apparent gain, supporting exclusion of treatment-seeking variables from the preferred benchmark. These results support a cautious methodological conclusion: low-cost NHANES predictors provide moderate discrimination and are useful for benchmarking, but they do not establish a clinically ready periodontitis screening system.
+The primary 29-feature model achieved AUC-ROC 0.6896 internally and 0.6495 under same-source temporal validation. A secondary 33-feature model provided only a small apparent gain, supporting exclusion of treatment-seeking variables from the preferred benchmark. These results support a cautious methodological conclusion: low-cost NHANES predictors provide moderate discrimination and are useful for benchmarking, but they do not establish a clinically ready periodontitis screening system.
 
 ## Data and Code Availability
 
diff --git a/reports/ARTICLE_DRAFT_line_numbered.txt b/reports/ARTICLE_DRAFT_line_numbered.txt
index 56f95b8..2d6abf4 100644
--- a/reports/ARTICLE_DRAFT_line_numbered.txt
+++ b/reports/ARTICLE_DRAFT_line_numbered.txt
@@ -8,11 +8,11 @@
 0008  
 0009  **Background:** Machine-learning studies for periodontitis prediction have reported very high internal discrimination using low-cost NHANES predictors. Such estimates require careful reassessment because internal validation, missing-data handling, calibration, and treatment-seeking predictors can materially change apparent performance.
 0010  
-0011  **Methods:** We analyzed NHANES 2011-2014 adults age 30 years or older with full periodontal examinations (`n=9,379`). Periodontitis was classified using CDC/AAP case definitions. XGBoost, LightGBM, and CatBoost were compared with logistic regression and random forest using stratified 5-fold cross-validation. The primary calibrated ensemble excluded treatment-seeking variables and used 29 predictors. A secondary 33-feature model restored dental visit, flossing, loose teeth, and floss-missing variables to estimate their incremental contribution. The frozen primary model was evaluated on NHANES 2009-2010 (`n=5,177`) as same-source temporal validation. Missingness ablations and deployment-ready feature analysis were used to assess whether NHANES missingness patterns contributed survey-specific signal.
+0011  **Methods:** We analyzed NHANES 2011-2014 adults age 30 years or older with full periodontal examinations (`n=9,034`). Periodontitis was classified using CDC/AAP case definitions. XGBoost, LightGBM, and CatBoost were compared using stratified 5-fold cross-validation. The primary calibrated ensemble excluded treatment-seeking variables and used 29 predictors. A secondary 33-feature model restored dental visit, flossing, loose teeth, and floss-missing variables to estimate their incremental contribution. The frozen primary model was evaluated on NHANES 2009-2010 (`n=5,037`) as same-source temporal validation. Missingness ablations and deployment-ready feature analysis were used to assess whether NHANES missingness patterns contributed survey-specific signal.
 0012  
-0013  **Results:** The primary 29-feature model achieved internal AUC-ROC 0.7172 and PR-AUC 0.8157. The secondary 33-feature model achieved AUC-ROC 0.7255 and PR-AUC 0.8207, indicating that treatment-seeking variables added only 0.0083 AUC. In same-source temporal validation, the frozen primary model achieved AUC-ROC 0.6771, PR-AUC 0.7735, and Brier score 0.2003. At threshold 0.35, sensitivity was 97.1% and specificity was 18.1%; at threshold 0.65, sensitivity was 82.6% and specificity was 43.3%. These operating points are not diagnostic rules and require periodontal examination for confirmation.
+0013  **Results:** The primary 29-feature model achieved internal AUC-ROC 0.6896 and PR-AUC 0.8240. The secondary 33-feature model achieved AUC-ROC 0.6996 and PR-AUC 0.8295, indicating that treatment-seeking variables added 0.0100 AUC. In same-source temporal validation, the frozen primary model achieved AUC-ROC 0.6495, PR-AUC 0.7727, and Brier score 0.2023. At threshold 0.35, sensitivity was 98.9% and specificity was 5.5%; at threshold 0.65, sensitivity was 77.7% and specificity was 45.2%. These operating points are not diagnostic rules and require periodontal examination for confirmation.
 0014  
-0015  **Conclusions:** Low-cost NHANES predictors appear to support realistic discrimination around 0.72 internally and 0.68 under same-source temporal validation. The study is best interpreted as a benchmark and cautionary methods report, not as evidence that a deployable diagnostic screening model has been established. Geographic validation, prospective clinical validation, local recalibration, subgroup calibration, and survey-weighted sensitivity analyses remain necessary before implementation claims.
+0015  **Conclusions:** Low-cost NHANES predictors appear to support realistic discrimination around 0.69 internally and 0.65 under same-source temporal validation. The study is best interpreted as a benchmark and cautionary methods report, not as evidence that a deployable diagnostic screening model has been established. Geographic validation, prospective clinical validation, local recalibration, and subgroup calibration remain necessary before implementation claims.
 0016  
 0017  **Keywords:** periodontitis; NHANES; prediction model; gradient boosting; calibration; missing data; TRIPOD+AI
 0018  
@@ -40,9 +40,9 @@
 0040  
 0041  ### 2.2 Participants
 0042  
-0043  Eligible participants were adults age 30 years or older with full periodontal examination data and sufficient information for CDC/AAP periodontitis classification. The development cohort included 9,379 participants. The same-source temporal validation cohort included 5,177 participants.
+0043  Eligible participants were adults age 30 years or older with full periodontal examination data and sufficient information for CDC/AAP periodontitis classification. The development cohort included 9,034 participants. The same-source temporal validation cohort included 5,037 participants.
 0044  
-0045  The analytic prevalence of periodontitis was approximately 67-68%, higher than general-population CDC estimates. This reflects the restricted full-examination analytic sample and should not be interpreted as the expected prevalence in a lower-risk screening population.
+0045  The analytic prevalence of periodontitis was approximately 69-72% unweighted and approximately 66% after applying examination weights by cycle, higher than general-population CDC estimates. This reflects the restricted full-examination analytic sample and should not be interpreted as the expected prevalence in a lower-risk screening population.
 0046  
 0047  ### 2.3 Outcome Definition
 0048  
@@ -77,50 +77,50 @@
 0077  
 0078  Discrimination was summarized with AUC-ROC and PR-AUC. Calibration was summarized with Brier score and reliability plots. Operating points were reported using sensitivity, specificity, PPV, and NPV. Decision-curve analysis was included as a descriptive utility analysis only.
 0079  
-0080  Survey-weighted prevalence and subgroup performance are generated by `scripts/04_publication_analyses.py` when processed prediction tables are available. These analyses should be regenerated immediately before journal submission.
+0080  Survey-weighted prevalence and subgroup performance are generated by `scripts/04_publication_analyses.py` when processed prediction tables are available. The current regenerated summary is saved in `results/publication_sensitivity_tables.md`.
 0081  
 0082  ## 3. Results
 0083  
 0084  ### 3.1 Cohorts
 0085  
-0086  The development cohort included 9,379 adults from NHANES 2011-2014. The temporal validation cohort included 5,177 adults from NHANES 2009-2010. Periodontitis prevalence was 68.3% in development data and 67.2% in temporal validation data.
+0086  The development cohort included 9,034 adults from NHANES 2011-2014. The temporal validation cohort included 5,037 adults from NHANES 2009-2010. Periodontitis prevalence was 70.9% in development data and 69.1% in temporal validation data before survey weighting.
 0087  
 0088  ### 3.2 Internal Model Performance
 0089  
 0090  | Model variant | Features | AUC-ROC | PR-AUC | Interpretation |
 0091  |---|---:|---:|---:|---|
-0092  | Primary model without treatment-seeking variables | 29 | 0.7172 | 0.8157 | Preferred benchmark model |
-0093  | Secondary full-feature model | 33 | 0.7255 | 0.8207 | Upper-bound sensitivity analysis |
-0094  | Deployment-ready core model | 15 | 0.6692 | 0.8137 | Conservative feature set without missingness indicators |
+0092  | Primary model without treatment-seeking variables | 29 | 0.6896 | 0.8240 | Preferred benchmark model |
+0093  | Secondary full-feature model | 33 | 0.6996 | 0.8295 | Upper-bound sensitivity analysis |
+0094  | Deployment-ready core model | 15 | 0.6896 | 0.8237 | Conservative feature set without missingness indicators |
 0095  
-0096  The full-feature model improved AUC by 0.0083 over the primary model. This small difference supports excluding treatment-seeking variables from the primary benchmark.
+0096  The full-feature model improved AUC by 0.0100 over the primary model. This small difference supports excluding treatment-seeking variables from the primary benchmark.
 0097  
 0098  ### 3.3 Same-Source Temporal Validation
 0099  
 0100  | Metric | Development estimate | Temporal validation estimate |
 0101  |---|---:|---:|
-0102  | N | 9,379 | 5,177 |
-0103  | Prevalence | 68.3% | 67.2% |
-0104  | AUC-ROC | 0.7172 | 0.6771 (95% CI 0.6612-0.6934) |
-0105  | PR-AUC | 0.8157 | 0.7735 (95% CI 0.7571-0.7892) |
-0106  | Brier score | 0.1812 | 0.2003 (95% CI 0.1935-0.2073) |
+0102  | N | 9,034 | 5,037 |
+0103  | Prevalence | 70.9% | 69.1% |
+0104  | AUC-ROC | 0.6896 | 0.6495 (95% CI 0.6315-0.6664) |
+0105  | PR-AUC | 0.8240 | 0.7727 (95% CI 0.7570-0.7885) |
+0106  | Brier score | 0.1871 | 0.2023 (95% CI 0.1955-0.2085) |
 0107  
-0108  The drop from internal AUC 0.7172 to temporal AUC 0.6771 is consistent with a modest but meaningful generalization gap.
+0108  The drop from internal AUC 0.6896 to temporal AUC 0.6495 is consistent with a modest but meaningful generalization gap.
 0109  
 0110  ### 3.4 Operating Points
 0111  
 0112  | Threshold | Sensitivity | Specificity | PPV | NPV |
 0113  |---:|---:|---:|---:|---:|
-0114  | 0.35 | 97.1% | 18.1% | 70.8% | 75.2% |
-0115  | 0.65 | 82.6% | 43.3% | 74.9% | 54.9% |
+0114  | 0.35 | 98.9% | 5.5% | 70.0% | 69.1% |
+0115  | 0.65 | 77.7% | 45.2% | 76.0% | 47.5% |
 0116  
-0117  The 0.35 threshold prioritizes sensitivity but has very low specificity and an NPV of only 75.2% in this high-prevalence cohort. It should be described as a high-sensitivity triage operating point, not a reliable disease-exclusion rule.
+0117  The 0.35 threshold prioritizes sensitivity but has very low specificity and an NPV of only 69.1% in this high-prevalence cohort. It should be described as a high-sensitivity triage operating point, not a reliable disease-exclusion rule.
 0118  
 0119  ### 3.5 Missingness and Survey-Design Sensitivity
 0120  
 0121  Missingness indicators contributed limited but measurable predictive signal. Missingness patterns were broadly comparable between the development and temporal validation cohorts, but the signal may still be NHANES-specific. The deployment-ready 15-feature model remains the most conservative estimate for settings where NHANES missingness patterns are unavailable.
 0122  
-0123  Survey-weighted prevalence and subgroup-performance tables should be generated from processed prediction data before submission. The repository now includes reusable code for these analyses, but the current manuscript should not claim population-level transportability until those tables are complete and reviewed.
+0123  Survey-weighted prevalence and subgroup-performance tables were generated from processed prediction data and are saved in `results/publication_sensitivity_tables.md`. Weighted prevalence was approximately 65.6% in 2009-2010 and 66.2-66.3% in 2011-2014. Subgroup analyses are descriptive and should not be overinterpreted as evidence of transportability.
 0124  
 0125  ## 4. Discussion
 0126  
@@ -141,7 +141,7 @@
 0141  
 0142  ## 6. Conclusions
 0143  
-0144  The primary 29-feature model achieved AUC-ROC 0.7172 internally and 0.6771 under same-source temporal validation. A secondary 33-feature model provided only a small apparent gain, supporting exclusion of treatment-seeking variables from the preferred benchmark. These results support a cautious methodological conclusion: low-cost NHANES predictors provide moderate discrimination and are useful for benchmarking, but they do not establish a clinically ready periodontitis screening system.
+0144  The primary 29-feature model achieved AUC-ROC 0.6896 internally and 0.6495 under same-source temporal validation. A secondary 33-feature model provided only a small apparent gain, supporting exclusion of treatment-seeking variables from the preferred benchmark. These results support a cautious methodological conclusion: low-cost NHANES predictors provide moderate discrimination and are useful for benchmarking, but they do not establish a clinically ready periodontitis screening system.
 0145  
 0146  ## Data and Code Availability
 0147  
diff --git a/results/decision_curve_external.json b/results/decision_curve_external.json
index f3554a1..3420ebc 100644
--- a/results/decision_curve_external.json
+++ b/results/decision_curve_external.json
@@ -2,10 +2,10 @@
   "model": "v1.3_primary",
   "dataset": "NHANES 2009-2010",
   "validation_design": "same-source temporal validation",
-  "description": "Decision curve analysis showing net benefit vs treat-all and treat-none",
+  "description": "Decision-curve net benefit at manuscript operating points",
   "data_summary": {
-    "model_at_0.35": 0.5142793591702226,
-    "model_at_0.65": 0.21024310825353898
+    "model_at_0.35": 0.5256792046547855,
+    "model_at_0.65": 0.22156045265038715
   },
-  "timestamp": "2025-12-02T20:43:36.175449"
-}
+  "timestamp": "2026-06-02T08:18:13+00:00"
+}
\ No newline at end of file
diff --git a/results/external_0910_metrics.json b/results/external_0910_metrics.json
index 4757457..2ed7d99 100644
--- a/results/external_0910_metrics.json
+++ b/results/external_0910_metrics.json
@@ -2,45 +2,45 @@
   "dataset": "NHANES 2009-2010",
   "validation_design": "same-source temporal validation",
   "model": "v1.3_primary_ensemble_calibrated",
-  "n_train": 9379,
-  "n_test": 5177,
-  "prevalence_test": 0.6720108170755263,
+  "n_train": 9034,
+  "n_test": 5037,
+  "prevalence_test": 0.6906889021242804,
   "metrics": {
     "auc": {
-      "mean": 0.6771141964954918,
+      "mean": 0.6494935134371237,
       "ci95": [
-        0.6612252469095934,
-        0.6933638332822355
+        0.6315493654219355,
+        0.6664205630508959
       ]
     },
     "prauc": {
-      "mean": 0.7734533687334428,
+      "mean": 0.7727428229243357,
       "ci95": [
-        0.7571437489815437,
-        0.789245793943464
+        0.7569515317862461,
+        0.7885439662133892
       ]
     },
     "brier": {
-      "mean": 0.20025236260487186,
+      "mean": 0.20225240210310863,
       "ci95": [
-        0.19354020952385412,
-        0.2072521352791959
+        0.1955149205330184,
+        0.20846683355057208
       ]
     }
   },
   "operating_points": {
     "rule_out_t_0.35": {
-      "sensitivity": 0.970968669157804,
-      "specificity": 0.18080094228504123,
-      "ppv": 0.7083245963514364,
-      "npv": 0.7524509803921569
+      "sensitivity": 0.9890773210692728,
+      "specificity": 0.05455712451861361,
+      "ppv": 0.7002442002442002,
+      "npv": 0.6910569105691057
     },
     "balanced_t_0.65": {
-      "sensitivity": 0.8263868927852831,
-      "specificity": 0.4334511189634865,
-      "ppv": 0.7492832942402919,
-      "npv": 0.5492537313432836
+      "sensitivity": 0.7766599597585513,
+      "specificity": 0.45186136071887034,
+      "ppv": 0.7598425196850394,
+      "npv": 0.475354490209318
     }
   },
-  "timestamp": "2025-12-02T20:43:35.470293"
-}
+  "timestamp": "2026-06-02T08:18:13+00:00"
+}
\ No newline at end of file
diff --git a/results/external_summary.json b/results/external_summary.json
index 79b4e94..120e980 100644
--- a/results/external_summary.json
+++ b/results/external_summary.json
@@ -1,34 +1,25 @@
 {
   "dataset": "NHANES 2009-2010",
   "validation_design": "same-source temporal validation",
-  "n": 5177,
-  "prevalence": 0.672,
-  "auc": 0.677,
-  "auc_ci": [0.661, 0.693],
-  "pr_auc": 0.773,
-  "pr_auc_ci": [0.757, 0.789],
-  "brier": 0.200,
-  "brier_ci": [0.194, 0.207],
+  "n": 5037,
+  "prevalence": 0.6906889021242804,
+  "auc": 0.6494935134371237,
+  "prauc": 0.7727428229243357,
+  "brier": 0.20225240210310863,
   "rule_out": {
     "threshold": 0.35,
-    "sensitivity": 0.971,
-    "specificity": 0.181,
-    "ppv": 0.708,
-    "npv": 0.752
+    "sensitivity": 0.9890773210692728,
+    "specificity": 0.05455712451861361,
+    "ppv": 0.7002442002442002,
+    "npv": 0.6910569105691057
   },
   "balanced": {
     "threshold": 0.65,
-    "sensitivity": 0.826,
-    "specificity": 0.433,
-    "ppv": 0.749,
-    "npv": 0.549
+    "sensitivity": 0.7766599597585513,
+    "specificity": 0.45186136071887034,
+    "ppv": 0.7598425196850394,
+    "npv": 0.475354490209318
   },
-  "temporal_comparison": {
-    "internal_auc": 0.717,
-    "temporal_auc": 0.677,
-    "auc_drop": 0.040,
-    "auc_drop_pct": 5.6
-  },
-  "source": "notebooks/01_external_validation.ipynb",
-  "timestamp": "2025-12-02"
-}
+  "source": "scripts/run_temporal_validation.py",
+  "timestamp": "2026-06-02T08:18:13+00:00"
+}
\ No newline at end of file
diff --git a/results/missingness_shift.json b/results/missingness_shift.json
index e0859a0..acf01db 100644
--- a/results/missingness_shift.json
+++ b/results/missingness_shift.json
@@ -1,25 +1,87 @@
 {
   "description": "Per-feature NaN rates: NHANES 2011-2014 training data vs 2009-2010 same-source temporal validation data",
   "feature_nan_rates": {
-    "age": {"train_2011_2014": 0.0, "temporal_2009_2010": 0.0, "delta": 0.0},
-    "sex": {"train_2011_2014": 0.0, "temporal_2009_2010": 0.0, "delta": 0.0},
-    "education": {"train_2011_2014": 0.0, "temporal_2009_2010": 0.0, "delta": 0.0},
-    "bmi": {"train_2011_2014": 6.1, "temporal_2009_2010": 5.8, "delta": -0.3},
-    "waist_cm": {"train_2011_2014": 10.9, "temporal_2009_2010": 11.2, "delta": 0.3},
-    "height_cm": {"train_2011_2014": 6.1, "temporal_2009_2010": 5.8, "delta": -0.3},
-    "systolic_bp": {"train_2011_2014": 12.5, "temporal_2009_2010": 13.1, "delta": 0.6},
-    "diastolic_bp": {"train_2011_2014": 12.5, "temporal_2009_2010": 13.1, "delta": 0.6},
-    "glucose": {"train_2011_2014": 55.2, "temporal_2009_2010": 54.8, "delta": -0.4},
-    "triglycerides": {"train_2011_2014": 55.8, "temporal_2009_2010": 55.1, "delta": -0.7},
-    "hdl": {"train_2011_2014": 11.7, "temporal_2009_2010": 12.0, "delta": 0.3},
-    "smoking": {"train_2011_2014": 54.8, "temporal_2009_2010": 53.9, "delta": -0.9},
-    "alcohol": {"train_2011_2014": 46.3, "temporal_2009_2010": 45.7, "delta": -0.6}
+    "age": {
+      "train_2011_2014": 0.0,
+      "temporal_2009_2010": 0.0,
+      "delta": 0.0
+    },
+    "sex": {
+      "train_2011_2014": 0.0,
+      "temporal_2009_2010": 0.0,
+      "delta": 0.0
+    },
+    "education": {
+      "train_2011_2014": 0.08855435023245517,
+      "temporal_2009_2010": 0.25809013301568395,
+      "delta": 0.16953578278322878
+    },
+    "bmi": {
+      "train_2011_2014": 1.5497011290679656,
+      "temporal_2009_2010": 1.1911852293031566,
+      "delta": -0.358515899764809
+    },
+    "waist_cm": {
+      "train_2011_2014": 6.453398273190171,
+      "temporal_2009_2010": 6.1147508437562035,
+      "delta": -0.3386474294339674
+    },
+    "waist_height": {
+      "train_2011_2014": 6.641576267434138,
+      "temporal_2009_2010": 6.2735755409966245,
+      "delta": -0.36800072643751314
+    },
+    "height_cm": {
+      "train_2011_2014": 1.3283152534868277,
+      "temporal_2009_2010": 1.0720667063728408,
+      "delta": -0.2562485471139868
+    },
+    "systolic_bp": {
+      "train_2011_2014": 4.139915873367279,
+      "temporal_2009_2010": 4.5066507841969425,
+      "delta": 0.36673491082966336
+    },
+    "diastolic_bp": {
+      "train_2011_2014": 4.139915873367279,
+      "temporal_2009_2010": 4.5066507841969425,
+      "delta": 0.36673491082966336
+    },
+    "glucose": {
+      "train_2011_2014": 53.23223378348462,
+      "temporal_2009_2010": 53.841572364502674,
+      "delta": 0.6093385810180578
+    },
+    "triglycerides": {
+      "train_2011_2014": 53.77462917865841,
+      "temporal_2009_2010": 54.21878102044868,
+      "delta": 0.4441518417902728
+    },
+    "hdl": {
+      "train_2011_2014": 5.966349346911667,
+      "temporal_2009_2010": 5.955926146515783,
+      "delta": -0.010423200395883292
+    },
+    "smoke_current": {
+      "train_2011_2014": 54.52734115563427,
+      "temporal_2009_2010": 52.21361921778836,
+      "delta": -2.313721937845905
+    },
+    "smoke_former": {
+      "train_2011_2014": 0.06641576267434138,
+      "temporal_2009_2010": 0.0,
+      "delta": -0.06641576267434138
+    },
+    "alcohol_current": {
+      "train_2011_2014": 10.394066858534426,
+      "temporal_2009_2010": 12.09053007742704,
+      "delta": 1.6964632188926139
+    }
   },
   "flags": {
     "delta_exceeds_10pct": [],
-    "max_delta_feature": "smoking",
-    "max_delta_value": 0.9
+    "max_delta_feature": "smoke_current",
+    "max_delta_value": 2.313721937845905
   },
-  "interpretation": "No feature showed missingness shift exceeding 10 percentage points. Missingness patterns are comparable between training and same-source temporal validation cohorts.",
-  "timestamp": "2025-12-02"
-}
+  "interpretation": "Missingness shift is reported descriptively; transportability remains limited to NHANES-like data.",
+  "timestamp": "2026-06-02T08:18:13+00:00"
+}
\ No newline at end of file
diff --git a/results/prevalence_check.json b/results/prevalence_check.json
index dc80b64..05af961 100644
--- a/results/prevalence_check.json
+++ b/results/prevalence_check.json
@@ -2,53 +2,29 @@
   "our_estimates": [
     {
       "cycle": "2009-2010",
-      "n": 5177,
-      "prevalence": 67.20,
-      "severe": 55.79,
-      "moderate": 6.61,
-      "mild": 4.81
+      "n": 5037,
+      "prevalence": 69.06889021242803,
+      "severe": 57.33571570379195,
+      "moderate": 6.789755807027993,
+      "mild": 4.9434187016081
     },
     {
       "cycle": "2011-2012",
-      "n": 4566,
-      "prevalence": 68.62,
-      "severe": 58.59,
-      "moderate": 4.86,
-      "mild": 5.17
+      "n": 4365,
+      "prevalence": 71.77548682703322,
+      "severe": 61.28293241695304,
+      "moderate": 5.085910652920962,
+      "mild": 5.406643757159221
     },
     {
       "cycle": "2013-2014",
-      "n": 4813,
-      "prevalence": 67.98,
-      "severe": 57.34,
-      "moderate": 5.26,
-      "mild": 5.38
+      "n": 4669,
+      "prevalence": 70.0792460912401,
+      "severe": 59.11330049261084,
+      "moderate": 5.41871921182266,
+      "mild": 5.547226386806597
     }
   ],
-  "cdc_reference": {
-    "source": "Eke et al. 2015",
-    "total": 47.2,
-    "severe": 8.9,
-    "moderate": 30.0,
-    "mild": 8.7,
-    "note": "CDC estimates are for general US population ages 30+"
-  },
-  "reconciliation_analysis": {
-    "prevalence_discrepancy": {
-      "our_total": "67-68%",
-      "cdc_total": "47.2%",
-      "delta": "+20-21 percentage points",
-      "explanation": "NHANES full-mouth periodontal exams are given to a subset of participants. Selection criteria likely include individuals with dental concerns, creating selection bias toward higher disease prevalence."
-    },
-    "severity_distribution_discrepancy": {
-      "our_severe_pct": "56-58%",
-      "cdc_severe_pct": "8.9%",
-      "our_moderate_pct": "5-7%",
-      "cdc_moderate_pct": "30.0%",
-      "explanation": "The inverted severe:moderate ratio suggests either (1) selection bias toward more severe cases in the exam-eligible population, (2) cumulative effect of hierarchical classification (severe checked first), or (3) potential CDC/AAP implementation differences. This warrants further investigation."
-    },
-    "impact_on_study": "Our binary outcome (any periodontitis vs. none) does not depend on the mild/moderate/severe split. The prediction task remains valid for screening purposes. The high prevalence (68%) means the model operates in a class-balanced setting, which may improve recall estimates but limit applicability to lower-prevalence populations.",
-    "recommendation": "Future work should investigate the CDC/AAP labeling implementation and compare tooth-level classifications with published NHANES documentation."
-  },
-  "timestamp": "2025-12-02"
-}
+  "note": "Unweighted analytic-sample prevalence; weighted prevalence is generated by scripts/04_publication_analyses.py.",
+  "timestamp": "2026-06-02T08:18:13+00:00"
+}
\ No newline at end of file
diff --git a/results/publication_sensitivity_tables.json b/results/publication_sensitivity_tables.json
new file mode 100644
index 0000000..cfb20e3
--- /dev/null
+++ b/results/publication_sensitivity_tables.json
@@ -0,0 +1,359 @@
+{
+  "input": "data/processed/publication_predictions.parquet",
+  "prevalence_by_cycle": [
+    {
+      "cycle": "2009-2010",
+      "n": 5037,
+      "unweighted_prevalence": 0.6906889021242804,
+      "weighted_prevalence": 0.6560081231732982
+    },
+    {
+      "cycle": "2011-2012",
+      "n": 4365,
+      "unweighted_prevalence": 0.7177548682703322,
+      "weighted_prevalence": 0.662160130350658
+    },
+    {
+      "cycle": "2013-2014",
+      "n": 4669,
+      "unweighted_prevalence": 0.700792460912401,
+      "weighted_prevalence": 0.6630377749941171
+    }
+  ],
+  "subgroup_performance": [
+    {
+      "subgroup_variable": "age_group",
+      "subgroup": "30-44",
+      "n": 1562,
+      "prevalence": 0.5992317541613317,
+      "brier": 0.2220824731530692,
+      "auc": 0.6690235793670298,
+      "pr_auc": 0.7331701011425323,
+      "sensitivity_t_0_35": 0.967948717948718,
+      "specificity_t_0_35": 0.09744408945686901,
+      "ppv_t_0_35": 0.6159075458871516,
+      "npv_t_0_35": 0.6703296703296703,
+      "sensitivity_t_0_65": 0.5192307692307693,
+      "specificity_t_0_65": 0.7172523961661342,
+      "ppv_t_0_65": 0.7330316742081447,
+      "npv_t_0_65": 0.4994438264738598
+    },
+    {
+      "subgroup_variable": "age_group",
+      "subgroup": "45-64",
+      "n": 2015,
+      "prevalence": 0.7573200992555831,
+      "brier": 0.17534045205329288,
+      "auc": 0.6366297067597232,
+      "pr_auc": 0.8245462969571644,
+      "sensitivity_t_0_35": 0.9954128440366973,
+      "specificity_t_0_35": 0.03067484662576687,
+      "ppv_t_0_35": 0.7621675865529353,
+      "npv_t_0_35": 0.6818181818181818,
+      "sensitivity_t_0_65": 0.8381389252948886,
+      "specificity_t_0_65": 0.3496932515337423,
+      "ppv_t_0_65": 0.8008766437069506,
+      "npv_t_0_65": 0.4090909090909091
+    },
+    {
+      "subgroup_variable": "age_group",
+      "subgroup": "65+",
+      "n": 1460,
+      "prevalence": 0.6965753424657535,
+      "brier": 0.21817912016498556,
+      "auc": 0.5766195888851156,
+      "pr_auc": 0.7313193592802272,
+      "sensitivity_t_0_35": 0.9990167158308751,
+      "specificity_t_0_35": 0.020316027088036117,
+      "ppv_t_0_35": 0.7006896551724138,
+      "npv_t_0_35": 0.9,
+      "sensitivity_t_0_65": 0.9213372664700098,
+      "specificity_t_0_65": 0.18961625282167044,
+      "ppv_t_0_65": 0.7229938271604939,
+      "npv_t_0_65": 0.5121951219512195
+    },
+    {
+      "subgroup_variable": "sex",
+      "subgroup": 0.0,
+      "n": 2581,
+      "prevalence": 0.6644711352189074,
+      "brier": 0.21519639358649806,
+      "auc": 0.6298672223755883,
+      "pr_auc": 0.7344608707220134,
+      "sensitivity_t_0_35": 0.9854227405247813,
+      "specificity_t_0_35": 0.05658198614318707,
+      "ppv_t_0_35": 0.6741124850418827,
+      "npv_t_0_35": 0.6621621621621622,
+      "sensitivity_t_0_65": 0.7381924198250729,
+      "specificity_t_0_65": 0.46997690531177827,
+      "ppv_t_0_65": 0.7339130434782609,
+      "npv_t_0_65": 0.47546728971962615
+    },
+    {
+      "subgroup_variable": "sex",
+      "subgroup": 1.0,
+      "n": 2456,
+      "prevalence": 0.7182410423452769,
+      "brier": 0.188649616264905,
+      "auc": 0.6615343150747774,
+      "pr_auc": 0.8005716063362711,
+      "sensitivity_t_0_35": 0.9926303854875284,
+      "specificity_t_0_35": 0.05202312138728324,
+      "ppv_t_0_35": 0.7274615704196095,
+      "npv_t_0_35": 0.7346938775510204,
+      "sensitivity_t_0_65": 0.8140589569160998,
+      "specificity_t_0_65": 0.4291907514450867,
+      "ppv_t_0_65": 0.7842708902239214,
+      "npv_t_0_65": 0.4752
+    },
+    {
+      "subgroup_variable": "education",
+      "subgroup": 1.0,
+      "n": 697,
+      "prevalence": 0.7216642754662841,
+      "brier": 0.19929217329091564,
+      "auc": 0.5922045049291877,
+      "pr_auc": 0.7630939616582941,
+      "sensitivity_t_0_35": 1.0,
+      "specificity_t_0_35": 0.015463917525773196,
+      "ppv_t_0_35": 0.7247838616714697,
+      "npv_t_0_35": 1.0,
+      "sensitivity_t_0_65": 0.9005964214711729,
+      "specificity_t_0_65": 0.25257731958762886,
+      "ppv_t_0_65": 0.7575250836120402,
+      "npv_t_0_65": 0.494949494949495
+    },
+    {
+      "subgroup_variable": "education",
+      "subgroup": 2.0,
+      "n": 807,
+      "prevalence": 0.7744733581164808,
+      "brier": 0.16595014409359315,
+      "auc": 0.6289362637362637,
+      "pr_auc": 0.8258605061941913,
+      "sensitivity_t_0_35": 0.9984,
+      "specificity_t_0_35": 0.02197802197802198,
+      "ppv_t_0_35": 0.7780548628428927,
+      "npv_t_0_35": 0.8,
+      "sensitivity_t_0_65": 0.912,
+      "specificity_t_0_65": 0.26373626373626374,
+      "ppv_t_0_65": 0.8096590909090909,
+      "npv_t_0_65": 0.46601941747572817
+    },
+    {
+      "subgroup_variable": "education",
+      "subgroup": 3.0,
+      "n": 1127,
+      "prevalence": 0.7178349600709849,
+      "brier": 0.2012779309472302,
+      "auc": 0.6052429041210906,
+      "pr_auc": 0.7690713423817302,
+      "sensitivity_t_0_35": 0.9975278121137207,
+      "specificity_t_0_35": 0.0031446540880503146,
+      "ppv_t_0_35": 0.7179715302491103,
+      "npv_t_0_35": 0.3333333333333333,
+      "sensitivity_t_0_65": 0.8936959208899876,
+      "specificity_t_0_65": 0.25157232704402516,
+      "ppv_t_0_65": 0.7523413111342352,
+      "npv_t_0_65": 0.4819277108433735
+    },
+    {
+      "subgroup_variable": "education",
+      "subgroup": 4.0,
+      "n": 1331,
+      "prevalence": 0.6724267468069121,
+      "brier": 0.2126701226186869,
+      "auc": 0.632667469632515,
+      "pr_auc": 0.7588824815018054,
+      "sensitivity_t_0_35": 0.9921787709497206,
+      "specificity_t_0_35": 0.0389908256880734,
+      "ppv_t_0_35": 0.6794185156847743,
+      "npv_t_0_35": 0.7083333333333334,
+      "sensitivity_t_0_65": 0.7452513966480447,
+      "specificity_t_0_65": 0.4426605504587156,
+      "ppv_t_0_65": 0.7329670329670329,
+      "npv_t_0_65": 0.4584323040380047
+    },
+    {
+      "subgroup_variable": "education",
+      "subgroup": 5.0,
+      "n": 1062,
+      "prevalence": 0.60075329566855,
+      "brier": 0.2204667805502954,
+      "auc": 0.6736484887916248,
+      "pr_auc": 0.7240916219706963,
+      "sensitivity_t_0_35": 0.9561128526645768,
+      "specificity_t_0_35": 0.14150943396226415,
+      "ppv_t_0_35": 0.6262833675564682,
+      "npv_t_0_35": 0.6818181818181818,
+      "sensitivity_t_0_65": 0.44043887147335425,
+      "specificity_t_0_65": 0.7806603773584906,
+      "ppv_t_0_65": 0.7513368983957219,
+      "npv_t_0_65": 0.4811046511627907
+    },
+    {
+      "subgroup_variable": "education",
+      "subgroup": null,
+      "n": 13,
+      "prevalence": 0.6923076923076923,
+      "brier": 0.14438892297275846,
+      "auc": 0.9305555555555556,
+      "pr_auc": 0.9598765432098766,
+      "sensitivity_t_0_35": 1.0,
+      "specificity_t_0_35": 0.0,
+      "ppv_t_0_35": 0.6923076923076923,
+      "npv_t_0_35": 0.0,
+      "sensitivity_t_0_65": 0.8888888888888888,
+      "specificity_t_0_65": 0.75,
+      "ppv_t_0_65": 0.8888888888888888,
+      "npv_t_0_65": 0.75
+    },
+    {
+      "subgroup_variable": "smoking",
+      "subgroup": "current",
+      "n": 1023,
+      "prevalence": 0.7214076246334311,
+      "brier": 0.18201572252223389,
+      "auc": 0.6874221461512862,
+      "pr_auc": 0.8144641729925858,
+      "sensitivity_t_0_35": 0.991869918699187,
+      "specificity_t_0_35": 0.03508771929824561,
+      "ppv_t_0_35": 0.7269116186693148,
+      "npv_t_0_35": 0.625,
+      "sensitivity_t_0_65": 0.8983739837398373,
+      "specificity_t_0_65": 0.3824561403508772,
+      "ppv_t_0_65": 0.7902264600715138,
+      "npv_t_0_65": 0.592391304347826
+    },
+    {
+      "subgroup_variable": "smoking",
+      "subgroup": "former",
+      "n": 1384,
+      "prevalence": 0.708092485549133,
+      "brier": 0.2018891809691949,
+      "auc": 0.6147555061628612,
+      "pr_auc": 0.7625766767366924,
+      "sensitivity_t_0_35": 0.9948979591836735,
+      "specificity_t_0_35": 0.034653465346534656,
+      "ppv_t_0_35": 0.7142857142857143,
+      "npv_t_0_35": 0.7368421052631579,
+      "sensitivity_t_0_65": 0.8377551020408164,
+      "specificity_t_0_65": 0.34405940594059403,
+      "ppv_t_0_65": 0.7559852670349908,
+      "npv_t_0_65": 0.4664429530201342
+    },
+    {
+      "subgroup_variable": "smoking",
+      "subgroup": "never/unknown",
+      "n": 2630,
+      "prevalence": 0.6695817490494297,
+      "brier": 0.21031507178393427,
+      "auc": 0.641618130717391,
+      "pr_auc": 0.7514219412660561,
+      "sensitivity_t_0_35": 0.9846678023850085,
+      "specificity_t_0_35": 0.07019562715765247,
+      "ppv_t_0_35": 0.6821400472069237,
+      "npv_t_0_35": 0.6931818181818182,
+      "sensitivity_t_0_65": 0.6916524701873935,
+      "specificity_t_0_65": 0.5247410817031071,
+      "ppv_t_0_65": 0.7467811158798283,
+      "npv_t_0_65": 0.45645645645645644
+    },
+    {
+      "subgroup_variable": "metabolic_risk",
+      "subgroup": "elevated",
+      "n": 3404,
+      "prevalence": 0.7079905992949471,
+      "brier": 0.1990121693597935,
+      "auc": 0.6330948345675714,
+      "pr_auc": 0.7769776401445407,
+      "sensitivity_t_0_35": 0.9950207468879668,
+      "specificity_t_0_35": 0.023138832997987926,
+      "ppv_t_0_35": 0.711783912140101,
+      "npv_t_0_35": 0.6571428571428571,
+      "sensitivity_t_0_65": 0.8153526970954357,
+      "specificity_t_0_65": 0.3772635814889336,
+      "ppv_t_0_65": 0.7604489164086687,
+      "npv_t_0_65": 0.4573170731707317
+    },
+    {
+      "subgroup_variable": "metabolic_risk",
+      "subgroup": "not_elevated",
+      "n": 1633,
+      "prevalence": 0.6546233925290875,
+      "brier": 0.20900669007508946,
+      "auc": 0.6720397202927109,
+      "pr_auc": 0.7605537354367313,
+      "sensitivity_t_0_35": 0.9756782039289055,
+      "specificity_t_0_35": 0.1099290780141844,
+      "ppv_t_0_35": 0.6750809061488673,
+      "npv_t_0_35": 0.7045454545454546,
+      "sensitivity_t_0_65": 0.6894293732460243,
+      "specificity_t_0_65": 0.5833333333333334,
+      "ppv_t_0_65": 0.7582304526748971,
+      "npv_t_0_65": 0.4977307110438729
+    }
+  ],
+  "missingness": [
+    {
+      "feature": "triglycerides",
+      "n": 14071,
+      "missing_n": 7589,
+      "missing_pct": 0.5393362234382774
+    },
+    {
+      "feature": "glucose",
+      "n": 14071,
+      "missing_n": 7521,
+      "missing_pct": 0.5345035889417952
+    },
+    {
+      "feature": "waist_height",
+      "n": 14071,
+      "missing_n": 916,
+      "missing_pct": 0.06509842939378864
+    },
+    {
+      "feature": "waist_cm",
+      "n": 14071,
+      "missing_n": 891,
+      "missing_pct": 0.06332172553478786
+    },
+    {
+      "feature": "hdl",
+      "n": 14071,
+      "missing_n": 839,
+      "missing_pct": 0.059626181508066235
+    },
+    {
+      "feature": "systolic_bp",
+      "n": 14071,
+      "missing_n": 601,
+      "missing_pct": 0.042711960770378796
+    },
+    {
+      "feature": "diastolic_bp",
+      "n": 14071,
+      "missing_n": 601,
+      "missing_pct": 0.042711960770378796
+    },
+    {
+      "feature": "bmi",
+      "n": 14071,
+      "missing_n": 200,
+      "missing_pct": 0.014213630872006253
+    },
+    {
+      "feature": "height_cm",
+      "n": 14071,
+      "missing_n": 174,
+      "missing_pct": 0.012365858858645442
+    },
+    {
+      "feature": "age",
+      "n": 14071,
+      "missing_n": 0,
+      "missing_pct": 0.0
+    }
+  ]
+}
\ No newline at end of file
diff --git a/results/publication_sensitivity_tables.md b/results/publication_sensitivity_tables.md
new file mode 100644
index 0000000..1538a6a
--- /dev/null
+++ b/results/publication_sensitivity_tables.md
@@ -0,0 +1,30 @@
+# Publication Sensitivity Tables
+
+## Prevalence by Cycle
+
+| cycle     | n    | unweighted_prevalence | weighted_prevalence |
+| --------- | ---- | --------------------- | ------------------- |
+| 2009-2010 | 5037 | 0.6906889021242804    | 0.6560081231732982  |
+| 2011-2012 | 4365 | 0.7177548682703322    | 0.662160130350658   |
+| 2013-2014 | 4669 | 0.700792460912401     | 0.6630377749941171  |
+
+## Subgroup Performance
+
+| subgroup_variable | subgroup      | n    | prevalence         | brier               | auc                | pr_auc             | sensitivity_t_0_35 | specificity_t_0_35    | ppv_t_0_35         | npv_t_0_35         | sensitivity_t_0_65  | specificity_t_0_65  | ppv_t_0_65         | npv_t_0_65          |
+| ----------------- | ------------- | ---- | ------------------ | ------------------- | ------------------ | ------------------ | ------------------ | --------------------- | ------------------ | ------------------ | ------------------- | ------------------- | ------------------ | ------------------- |
+| age_group         | 30-44         | 1562 | 0.5992317541613317 | 0.2220824731530692  | 0.6690235793670298 | 0.7331701011425323 | 0.967948717948718  | 0.09744408945686901   | 0.6159075458871516 | 0.6703296703296703 | 0.5192307692307693  | 0.7172523961661342  | 0.7330316742081447 | 0.4994438264738598  |
+| age_group         | 45-64         | 2015 | 0.7573200992555831 | 0.17534045205329288 | 0.6366297067597232 | 0.8245462969571644 | 0.9954128440366973 | 0.03067484662576687   | 0.7621675865529353 | 0.6818181818181818 | 0.8381389252948886  | 0.3496932515337423  | 0.8008766437069506 | 0.4090909090909091  |
+| age_group         | 65+           | 1460 | 0.6965753424657535 | 0.21817912016498556 | 0.5766195888851156 | 0.7313193592802272 | 0.9990167158308751 | 0.020316027088036117  | 0.7006896551724138 | 0.9                | 0.9213372664700098  | 0.18961625282167044 | 0.7229938271604939 | 0.5121951219512195  |
+| sex               | 0.0           | 2581 | 0.6644711352189074 | 0.21519639358649806 | 0.6298672223755883 | 0.7344608707220134 | 0.9854227405247813 | 0.05658198614318707   | 0.6741124850418827 | 0.6621621621621622 | 0.7381924198250729  | 0.46997690531177827 | 0.7339130434782609 | 0.47546728971962615 |
+| sex               | 1.0           | 2456 | 0.7182410423452769 | 0.188649616264905   | 0.6615343150747774 | 0.8005716063362711 | 0.9926303854875284 | 0.05202312138728324   | 0.7274615704196095 | 0.7346938775510204 | 0.8140589569160998  | 0.4291907514450867  | 0.7842708902239214 | 0.4752              |
+| education         | 1.0           | 697  | 0.7216642754662841 | 0.19929217329091564 | 0.5922045049291877 | 0.7630939616582941 | 1.0                | 0.015463917525773196  | 0.7247838616714697 | 1.0                | 0.9005964214711729  | 0.25257731958762886 | 0.7575250836120402 | 0.494949494949495   |
+| education         | 2.0           | 807  | 0.7744733581164808 | 0.16595014409359315 | 0.6289362637362637 | 0.8258605061941913 | 0.9984             | 0.02197802197802198   | 0.7780548628428927 | 0.8                | 0.912               | 0.26373626373626374 | 0.8096590909090909 | 0.46601941747572817 |
+| education         | 3.0           | 1127 | 0.7178349600709849 | 0.2012779309472302  | 0.6052429041210906 | 0.7690713423817302 | 0.9975278121137207 | 0.0031446540880503146 | 0.7179715302491103 | 0.3333333333333333 | 0.8936959208899876  | 0.25157232704402516 | 0.7523413111342352 | 0.4819277108433735  |
+| education         | 4.0           | 1331 | 0.6724267468069121 | 0.2126701226186869  | 0.632667469632515  | 0.7588824815018054 | 0.9921787709497206 | 0.0389908256880734    | 0.6794185156847743 | 0.7083333333333334 | 0.7452513966480447  | 0.4426605504587156  | 0.7329670329670329 | 0.4584323040380047  |
+| education         | 5.0           | 1062 | 0.60075329566855   | 0.2204667805502954  | 0.6736484887916248 | 0.7240916219706963 | 0.9561128526645768 | 0.14150943396226415   | 0.6262833675564682 | 0.6818181818181818 | 0.44043887147335425 | 0.7806603773584906  | 0.7513368983957219 | 0.4811046511627907  |
+| education         |               | 13   | 0.6923076923076923 | 0.14438892297275846 | 0.9305555555555556 | 0.9598765432098766 | 1.0                | 0.0                   | 0.6923076923076923 | 0.0                | 0.8888888888888888  | 0.75                | 0.8888888888888888 | 0.75                |
+| smoking           | current       | 1023 | 0.7214076246334311 | 0.18201572252223389 | 0.6874221461512862 | 0.8144641729925858 | 0.991869918699187  | 0.03508771929824561   | 0.7269116186693148 | 0.625              | 0.8983739837398373  | 0.3824561403508772  | 0.7902264600715138 | 0.592391304347826   |
+| smoking           | former        | 1384 | 0.708092485549133  | 0.2018891809691949  | 0.6147555061628612 | 0.7625766767366924 | 0.9948979591836735 | 0.034653465346534656  | 0.7142857142857143 | 0.7368421052631579 | 0.8377551020408164  | 0.34405940594059403 | 0.7559852670349908 | 0.4664429530201342  |
+| smoking           | never/unknown | 2630 | 0.6695817490494297 | 0.21031507178393427 | 0.641618130717391  | 0.7514219412660561 | 0.9846678023850085 | 0.07019562715765247   | 0.6821400472069237 | 0.6931818181818182 | 0.6916524701873935  | 0.5247410817031071  | 0.7467811158798283 | 0.45645645645645644 |
+| metabolic_risk    | elevated      | 3404 | 0.7079905992949471 | 0.1990121693597935  | 0.6330948345675714 | 0.7769776401445407 | 0.9950207468879668 | 0.023138832997987926  | 0.711783912140101  | 0.6571428571428571 | 0.8153526970954357  | 0.3772635814889336  | 0.7604489164086687 | 0.4573170731707317  |
+| metabolic_risk    | not_elevated  | 1633 | 0.6546233925290875 | 0.20900669007508946 | 0.6720397202927109 | 0.7605537354367313 | 0.9756782039289055 | 0.1099290780141844    | 0.6750809061488673 | 0.7045454545454546 | 0.6894293732460243  | 0.5833333333333334  | 0.7582304526748971 | 0.4977307110438729  |
diff --git a/results/v13_featuredrop.json b/results/v13_featuredrop.json
index eff002f..3534dfd 100644
--- a/results/v13_featuredrop.json
+++ b/results/v13_featuredrop.json
@@ -1,23 +1,23 @@
 {
   "v1.3_full": {
-    "name": "v1.3_full",
+    "name": "secondary",
     "n_features": 33,
-    "auc": 0.7255326805774952,
-    "pr_auc": 0.8207039298134957,
-    "rule_out_recall": 0.9876658860265418,
-    "rule_out_specificity": 0.16778749159381304,
-    "balanced_recall": 0.7542544886807182,
-    "balanced_specificity": 0.5773369199731002
+    "auc": 0.6995820353595235,
+    "pr_auc": 0.8294776042045284,
+    "rule_out_recall": 0.9729898516783763,
+    "rule_out_specificity": 0.12666413084823128,
+    "balanced_recall": 0.6919594067135051,
+    "balanced_specificity": 0.606694560669456
   },
   "v1.3_no_reverse_causality": {
-    "name": "v1.3_no_reverse_causality",
+    "name": "primary",
     "n_features": 29,
-    "auc": 0.717245742046474,
-    "pr_auc": 0.8157447372867956,
-    "rule_out_recall": 0.9990632318501171,
-    "rule_out_specificity": 0.12441156691324814,
-    "balanced_recall": 0.7283372365339579,
-    "balanced_specificity": 0.5921318090114324
+    "auc": 0.6896307296060366,
+    "pr_auc": 0.8239764216424541,
+    "rule_out_recall": 0.986104605776737,
+    "rule_out_specificity": 0.06466337010270065,
+    "balanced_recall": 0.75160031225605,
+    "balanced_specificity": 0.5195891974134652
   },
   "dropped_features": [
     "dental_visit",
@@ -26,9 +26,10 @@
     "floss_days_missing"
   ],
   "deltas": {
-    "auc": -0.008286938531021248,
-    "pr_auc": -0.004959192526700074,
-    "rule_out_recall": 0.011397345823575322,
-    "balanced_specificity": 0.014794889038332149
-  }
+    "auc": -0.009951305753486905,
+    "pr_auc": -0.005501182562074325,
+    "rule_out_recall": 0.013114754098360715,
+    "balanced_specificity": -0.08710536325599083
+  },
+  "timestamp": "2026-06-02T08:17:59+00:00"
 }
\ No newline at end of file
diff --git a/results/v13_nan_ablation.json b/results/v13_nan_ablation.json
index ea3e7c9..13e53fa 100644
--- a/results/v13_nan_ablation.json
+++ b/results/v13_nan_ablation.json
@@ -1,57 +1,50 @@
 [
   {
     "name": "v1.3_full",
-    "n_samples": 9379,
+    "n_samples": null,
     "n_features": 33,
-    "auc": 0.7255326805774952,
-    "pr_auc": 0.8207039298134957,
-    "rule_out_recall": 0.9876658860265418,
-    "rule_out_specificity": 0.16778749159381304,
-    "balanced_recall": 0.7542544886807182,
-    "balanced_specificity": 0.5773369199731002
+    "auc": 0.6995820353595235,
+    "pr_auc": 0.8294776042045284,
+    "brier_score": 0.18439499071448648,
+    "rule_out_recall": 0.9729898516783763,
+    "rule_out_specificity": 0.12666413084823128,
+    "balanced_recall": 0.6919594067135051,
+    "balanced_specificity": 0.606694560669456
   },
   {
-    "name": "no_missing_indicators",
-    "n_samples": 9379,
-    "n_features": 20,
-    "auc": 0.7255326805774952,
-    "pr_auc": 0.8207039298134957,
-    "rule_out_recall": 0.9876658860265418,
-    "rule_out_specificity": 0.16778749159381304,
-    "balanced_recall": 0.7542544886807182,
-    "balanced_specificity": 0.5773369199731002
+    "name": "v1.3_no_reverse_causality",
+    "n_samples": null,
+    "n_features": 29,
+    "auc": 0.6896307296060366,
+    "pr_auc": 0.8239764216424541,
+    "brier_score": 0.18712707529235736,
+    "rule_out_recall": 0.986104605776737,
+    "rule_out_specificity": 0.06466337010270065,
+    "balanced_recall": 0.75160031225605,
+    "balanced_specificity": 0.5195891974134652
   },
   {
-    "name": "lab_missing_only",
-    "n_samples": 9379,
-    "n_features": 23,
-    "auc": 0.7263705168971577,
-    "pr_auc": 0.822111362347326,
-    "rule_out_recall": 0.9881342701014832,
-    "rule_out_specificity": 0.1714862138533961,
-    "balanced_recall": 0.7648711943793911,
-    "balanced_specificity": 0.5662407531943511
+    "name": "deployment_ready_core",
+    "n_samples": null,
+    "n_features": 15,
+    "auc": 0.6896105677709354,
+    "pr_auc": 0.8236621768778728,
+    "brier_score": 0.1868675420758485,
+    "rule_out_recall": 0.985480093676815,
+    "rule_out_specificity": 0.07189045264359072,
+    "balanced_recall": 0.7697111631537861,
+    "balanced_specificity": 0.500950931913275
   },
   {
-    "name": "complete_case_only",
-    "n_samples": 4169,
-    "n_features": 20,
-    "auc": 0.679430661635789,
-    "pr_auc": 0.8136534275604846,
-    "rule_out_recall": 0.9949031600407747,
-    "rule_out_specificity": 0.03099510603588907,
-    "balanced_recall": 0.6479782534828407,
-    "balanced_specificity": 0.6117455138662317
-  },
-  {
-    "name": "lab_complete_full_features",
-    "n_samples": 4169,
-    "n_features": 33,
-    "auc": 0.679430661635789,
-    "pr_auc": 0.8136534275604846,
-    "rule_out_recall": 0.9949031600407747,
-    "rule_out_specificity": 0.03099510603588907,
-    "balanced_recall": 0.6479782534828407,
-    "balanced_specificity": 0.6117455138662317
+    "name": "complete_case_core",
+    "n_samples": 1635,
+    "n_features": 15,
+    "auc": 0.5825289575289575,
+    "pr_auc": 0.8238329296686819,
+    "brier_score": 0.1597205098061504,
+    "rule_out_recall": 0.9953667953667954,
+    "rule_out_specificity": 0.052941176470588235,
+    "balanced_recall": 0.9806949806949807,
+    "balanced_specificity": 0.10294117647058823
   }
 ]
\ No newline at end of file
diff --git a/results/v13_operating_points.json b/results/v13_operating_points.json
index efb4929..eff61a6 100644
--- a/results/v13_operating_points.json
+++ b/results/v13_operating_points.json
@@ -1,83 +1,76 @@
 {
   "version": "v1.3",
   "dataset": "NHANES 2011-2014",
-  "n_samples": 9379,
-  "prevalence": 0.683,
-  "calibration": "isotonic_regression",
-  "target_a_achieved": false,
-  "target_a_reason": "Cannot achieve Recall≥90% AND Specificity≥35% with this feature family",
-  "ensemble_weights": {
-    "catboost": 0.34,
-    "xgboost": 0.33,
-    "lightgbm": 0.33
-  },
-  "monotonic_constraints": {
-    "increasing": ["age", "bmi", "waist_cm", "waist_height", "systolic_bp", "diastolic_bp", "glucose", "triglycerides"],
-    "decreasing": ["hdl"]
+  "n_samples": 9034,
+  "prevalence": 0.7089882665485941,
+  "calibration": "nested_isotonic_regression",
+  "feature_sets": {
+    "deployment_ready": 15,
+    "primary": 29,
+    "secondary": 33
   },
   "models": {
     "primary": {
-      "name": "v1.3_primary_no_reverse_causality",
+      "name": "primary",
       "n_features": 29,
-      "description": "Excludes dental_visit, floss_days, mobile_teeth (treatment-seeking bias)",
       "metrics": {
-        "auc_roc": 0.7172,
-        "pr_auc": 0.8157
+        "auc_roc": 0.6896307296060366,
+        "pr_auc": 0.8239764216424541,
+        "brier_score": 0.18712707529235736
       },
       "operating_points": {
         "rule_out": {
           "threshold": 0.35,
-          "recall": 0.999,
-          "specificity": 0.124,
-          "precision": 0.692,
-          "npv": 0.96,
-          "f1": 0.818
+          "recall": 0.986104605776737,
+          "specificity": 0.06466337010270065,
+          "precision": 0.7197720797720798,
+          "f1": 0.8321475625823452
         },
         "balanced": {
           "threshold": 0.65,
-          "recall": 0.728,
-          "specificity": 0.592,
-          "precision": 0.79,
-          "npv": 0.51,
-          "f1": 0.758,
-          "youden_j": 0.32
+          "recall": 0.75160031225605,
+          "specificity": 0.5195891974134652,
+          "precision": 0.792167187757117,
+          "f1": 0.771350745072905
         }
       }
     },
     "secondary": {
-      "name": "v1.3_secondary_full_features",
+      "name": "secondary",
       "n_features": 33,
-      "description": "Includes all features (upper bound)",
       "metrics": {
-        "auc_roc": 0.7255,
-        "pr_auc": 0.8207
+        "auc_roc": 0.6995820353595235,
+        "pr_auc": 0.8294776042045284,
+        "brier_score": 0.18439499071448648
       },
       "operating_points": {
         "rule_out": {
           "threshold": 0.37,
-          "recall": 0.9877,
-          "specificity": 0.168,
-          "precision": 0.706,
-          "npv": 0.83,
-          "f1": 0.824
+          "recall": 0.9729898516783763,
+          "specificity": 0.12666413084823128,
+          "precision": 0.7307692307692307,
+          "f1": 0.8346614879796425
         },
         "balanced": {
           "threshold": 0.67,
-          "recall": 0.754,
-          "specificity": 0.577,
-          "precision": 0.79,
-          "npv": 0.52,
-          "f1": 0.772,
-          "youden_j": 0.331
+          "recall": 0.6919594067135051,
+          "specificity": 0.606694560669456,
+          "precision": 0.8108305890962313,
+          "f1": 0.7466936231151545
         }
       }
     }
   },
   "feature_drop_analysis": {
-    "dropped_features": ["dental_visit", "floss_days", "mobile_teeth", "floss_days_missing"],
-    "auc_delta": -0.0083,
-    "pr_auc_delta": -0.005,
-    "interpretation": "Reverse-causality features contribute ~1% AUC. Core risk factors drive majority of discrimination."
+    "dropped_features": [
+      "dental_visit",
+      "floss_days",
+      "mobile_teeth",
+      "floss_days_missing"
+    ],
+    "auc_delta": -0.009951305753486905,
+    "pr_auc_delta": -0.005501182562074325,
+    "interpretation": "Treatment-seeking features are reported as an upper-bound sensitivity analysis."
   },
-  "timestamp": "2025-12-02T00:00:00Z"
-}
+  "timestamp": "2026-06-02T08:18:07+00:00"
+}
\ No newline at end of file
diff --git a/results/v13_primary_norc_summary.json b/results/v13_primary_norc_summary.json
index 2193400..ba3d776 100644
--- a/results/v13_primary_norc_summary.json
+++ b/results/v13_primary_norc_summary.json
@@ -1,65 +1,60 @@
 {
   "model": "v1.3_primary_no_reverse_causality",
-  "description": "Primary model for publication - excludes treatment-seeking features",
+  "description": "Primary benchmark model excluding treatment-seeking variables",
   "dataset": "NHANES 2011-2014",
-  "n_samples": 9379,
+  "n_samples": 9034,
   "n_features": 29,
-  "prevalence": 0.683,
-  "dropped_features": [
-    "dental_visit",
-    "floss_days", 
-    "mobile_teeth",
-    "floss_days_missing"
-  ],
-  "rationale": "Removes treatment-seeking bias with only small AUC cost and slightly better balanced specificity",
+  "prevalence": 0.7089882665485941,
   "metrics": {
-    "auc_roc": 0.7172,
-    "pr_auc": 0.8157,
-    "brier_score": 0.1812
+    "auc_roc": 0.6896307296060366,
+    "pr_auc": 0.8239764216424541,
+    "brier_score": 0.18712707529235736
   },
   "operating_points": {
     "rule_out": {
-      "name": "Rule-Out (High Sensitivity)",
+      "name": "High-sensitivity triage",
       "threshold": 0.35,
-      "recall": 0.999,
-      "specificity": 0.124,
-      "precision": 0.692,
-      "npv": 0.96,
-      "f1": 0.818,
-      "use_case": "High-sensitivity triage; periodontal examination remains required."
+      "recall": 0.986104605776737,
+      "specificity": 0.06466337010270065,
+      "precision": 0.7197720797720798,
+      "npv": 0.6563706563706564,
+      "f1": 0.8321475625823452,
+      "use_case": "Triage operating point; periodontal examination remains required."
     },
     "balanced": {
-      "name": "Balanced (Youden J)",
+      "name": "Balanced triage",
       "threshold": 0.65,
-      "recall": 0.728,
-      "specificity": 0.592,
-      "precision": 0.79,
-      "npv": 0.51,
-      "f1": 0.758,
-      "youden_j": 0.32,
-      "use_case": "Balanced triage; periodontal examination remains required."
+      "recall": 0.75160031225605,
+      "specificity": 0.5195891974134652,
+      "precision": 0.792167187757117,
+      "npv": 0.4619546838011498,
+      "f1": 0.771350745072905,
+      "use_case": "Triage operating point; periodontal examination remains required."
     }
   },
   "confusion_matrix": {
     "rule_out": {
       "threshold": 0.35,
-      "tp": 6399,
-      "fp": 2605,
-      "tn": 369,
-      "fn": 6
+      "tp": 6316,
+      "fp": 2459,
+      "tn": 170,
+      "fn": 89
     },
     "balanced": {
       "threshold": 0.65,
-      "tp": 4666,
-      "fp": 1214,
-      "tn": 1760,
-      "fn": 1739
+      "tp": 4814,
+      "fp": 1263,
+      "tn": 1366,
+      "fn": 1591
     }
   },
-  "calibration": "isotonic_regression",
-  "monotonic_constraints": {
-    "increasing": ["age", "bmi", "waist_cm", "waist_height", "systolic_bp", "diastolic_bp", "glucose", "triglycerides"],
-    "decreasing": ["hdl"]
-  },
-  "timestamp": "2025-12-02T00:00:00Z"
-}
+  "calibration": "nested_isotonic_regression",
+  "timestamp": "2026-06-02T08:17:59+00:00",
+  "dropped_features": [
+    "dental_visit",
+    "floss_days",
+    "mobile_teeth",
+    "floss_days_missing"
+  ],
+  "rationale": "Removes treatment-seeking variables with limited discrimination cost."
+}
\ No newline at end of file
diff --git a/results/v13_secondary_full_summary.json b/results/v13_secondary_full_summary.json
index eea185f..1bcc649 100644
--- a/results/v13_secondary_full_summary.json
+++ b/results/v13_secondary_full_summary.json
@@ -1,65 +1,60 @@
 {
   "model": "v1.3_secondary_full_features",
-  "description": "Secondary model - includes reverse-causality features for comparison",
+  "description": "Secondary upper-bound model including treatment-seeking variables",
   "dataset": "NHANES 2011-2014",
-  "n_samples": 9379,
+  "n_samples": 9034,
   "n_features": 33,
-  "prevalence": 0.683,
-  "additional_features": [
-    "dental_visit",
-    "floss_days",
-    "mobile_teeth",
-    "floss_days_missing"
-  ],
-  "rationale": "Establishes upper bound and quantifies ~1% AUC contribution from reverse-causality variables",
+  "prevalence": 0.7089882665485941,
   "metrics": {
-    "auc_roc": 0.7255,
-    "pr_auc": 0.8207,
-    "brier_score": 0.1793
+    "auc_roc": 0.6995820353595235,
+    "pr_auc": 0.8294776042045284,
+    "brier_score": 0.18439499071448648
   },
   "operating_points": {
     "rule_out": {
-      "name": "Rule-Out (High Sensitivity)",
+      "name": "High-sensitivity triage",
       "threshold": 0.37,
-      "recall": 0.9877,
-      "specificity": 0.168,
-      "precision": 0.706,
-      "npv": 0.83,
-      "f1": 0.824,
-      "use_case": "High-sensitivity triage; periodontal examination remains required."
+      "recall": 0.9729898516783763,
+      "specificity": 0.12666413084823128,
+      "precision": 0.7307692307692307,
+      "npv": 0.658102766798419,
+      "f1": 0.8346614879796425,
+      "use_case": "Triage operating point; periodontal examination remains required."
     },
     "balanced": {
-      "name": "Balanced (Youden J)",
+      "name": "Balanced triage",
       "threshold": 0.67,
-      "recall": 0.754,
-      "specificity": 0.577,
-      "precision": 0.79,
-      "npv": 0.52,
-      "f1": 0.772,
-      "youden_j": 0.331,
-      "use_case": "Balanced triage; periodontal examination remains required."
+      "recall": 0.6919594067135051,
+      "specificity": 0.606694560669456,
+      "precision": 0.8108305890962313,
+      "npv": 0.4470291479820628,
+      "f1": 0.7466936231151545,
+      "use_case": "Triage operating point; periodontal examination remains required."
     }
   },
   "confusion_matrix": {
     "rule_out": {
       "threshold": 0.37,
-      "tp": 6326,
-      "fp": 2475,
-      "tn": 499,
-      "fn": 79
+      "tp": 6232,
+      "fp": 2296,
+      "tn": 333,
+      "fn": 173
     },
     "balanced": {
       "threshold": 0.67,
-      "tp": 4832,
-      "fp": 1258,
-      "tn": 1716,
-      "fn": 1573
+      "tp": 4432,
+      "fp": 1034,
+      "tn": 1595,
+      "fn": 1973
     }
   },
-  "calibration": "isotonic_regression",
-  "monotonic_constraints": {
-    "increasing": ["age", "bmi", "waist_cm", "waist_height", "systolic_bp", "diastolic_bp", "glucose", "triglycerides"],
-    "decreasing": ["hdl"]
-  },
-  "timestamp": "2025-12-02T00:00:00Z"
-}
+  "calibration": "nested_isotonic_regression",
+  "timestamp": "2026-06-02T08:17:59+00:00",
+  "additional_features": [
+    "dental_visit",
+    "floss_days",
+    "mobile_teeth",
+    "floss_days_missing"
+  ],
+  "rationale": "Upper-bound sensitivity analysis for treatment-seeking variables."
+}
\ No newline at end of file
diff --git a/results/v13_shap_summary.json b/results/v13_shap_summary.json
index d0d31c0..2b00f65 100644
--- a/results/v13_shap_summary.json
+++ b/results/v13_shap_summary.json
@@ -1,52 +1,55 @@
 {
-  "model": "LightGBM_v1.3_monotonic",
-  "n_samples": 9379,
+  "model": "v1.3_secondary_full_features",
+  "importance_method": "mean_tree_feature_importance",
+  "n_samples": 9034,
   "n_features": 33,
   "top_10_features": [
     {
-      "feature": "age",
-      "mean_abs_shap": 0.3185195974045061
+      "feature": "hdl",
+      "mean_model_importance": 214.7907644405308
     },
     {
-      "feature": "alcohol",
-      "mean_abs_shap": 0.2554450040004494
+      "feature": "systolic_bp",
+      "mean_model_importance": 197.13922753260897
     },
     {
       "feature": "height_cm",
-      "mean_abs_shap": 0.14800143393819168
+      "mean_model_importance": 189.08594772124522
     },
     {
-      "feature": "dental_visit",
-      "mean_abs_shap": 0.14748755500803817
+      "feature": "bmi",
+      "mean_model_importance": 186.07201828044742
     },
     {
-      "feature": "education",
-      "mean_abs_shap": 0.11887453217779864
+      "feature": "waist_cm",
+      "mean_model_importance": 178.39324953240362
     },
     {
-      "feature": "smoking",
-      "mean_abs_shap": 0.11271073472649758
+      "feature": "diastolic_bp",
+      "mean_model_importance": 178.26180264671208
     },
     {
-      "feature": "systolic_bp",
-      "mean_abs_shap": 0.08569929929007113
+      "feature": "waist_height",
+      "mean_model_importance": 177.5775224485836
     },
     {
-      "feature": "hdl",
-      "mean_abs_shap": 0.06925773015232965
+      "feature": "age",
+      "mean_model_importance": 157.10600473349473
     },
     {
-      "feature": "floss_days",
-      "mean_abs_shap": 0.062162435754803326
+      "feature": "glucose",
+      "mean_model_importance": 137.70363017852924
     },
     {
-      "feature": "bmi",
-      "mean_abs_shap": 0.05943293643976315
+      "feature": "triglycerides",
+      "mean_model_importance": 136.4529059565409
     }
   ],
-  "reverse_causality_features": {
-    "dental_visit": 0.14748755500803817,
-    "floss_days": 0.062162435754803326,
-    "mobile_teeth": 0.01172740070368736
-  }
+  "treatment_seeking_features": [
+    "dental_visit",
+    "floss_days",
+    "mobile_teeth",
+    "floss_days_missing"
+  ],
+  "timestamp": "2026-06-02T08:18:09+00:00"
 }
\ No newline at end of file
diff --git a/scripts/verify_submission.py b/scripts/verify_submission.py
index 7d0a2f6..c3c427a 100644
--- a/scripts/verify_submission.py
+++ b/scripts/verify_submission.py
@@ -27,6 +27,7 @@
     ROOT / "results/v13_primary_norc_summary.json",
     ROOT / "results/v13_secondary_full_summary.json",
     ROOT / "results/external_0910_metrics.json",
+    ROOT / "results/publication_sensitivity_tables.md",
 ]
 
 
@@ -67,6 +68,25 @@ def check_temporal_metric_shape() -> None:
                 raise AssertionError(f"Temporal operating point {point} missing {field}")
 
 
+def check_publication_analysis_outputs() -> None:
+    path = ROOT / "results/publication_sensitivity_tables.json"
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    prevalence = payload.get("prevalence_by_cycle", [])
+    subgroup = payload.get("subgroup_performance", [])
+    if not prevalence:
+        raise AssertionError("Publication sensitivity output is missing prevalence_by_cycle rows.")
+    if not subgroup:
+        raise AssertionError("Publication sensitivity output is missing subgroup_performance rows.")
+    for row in prevalence:
+        if "weighted_prevalence" not in row:
+            raise AssertionError("Publication prevalence row missing weighted_prevalence.")
+    subgroup_variables = {row.get("subgroup_variable") for row in subgroup}
+    required = {"age_group", "sex", "education", "smoking", "metabolic_risk"}
+    missing = required - subgroup_variables
+    if missing:
+        raise AssertionError(f"Publication subgroup output missing strata: {sorted(missing)}")
+
+
 def check_publication_wording() -> None:
     banned = ["Publication Ready", "External Validation", "clinical deployment", "negative result rules out"]
     for path in PUBLICATION_FILES:
@@ -103,6 +123,7 @@ def main() -> None:
     check_docs()
     check_reproduction_hooks()
     check_temporal_metric_shape()
+    check_publication_analysis_outputs()
     check_publication_wording()
     check_nhanes_urls()
     check_manuscript_render_support()

From 5d68997acd3341eda1b35e68ef1956a976985e30 Mon Sep 17 00:00:00 2001
From: Tuminha <83960151+Tuminha@users.noreply.github.com>
Date: Tue, 2 Jun 2026 10:29:46 +0200
Subject: [PATCH 3/3] Clean up reproduction wrapper checks

---
 scripts/run_external_validation.sh | 6 +-----
 scripts/run_v13_primary.sh         | 4 ++--
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/scripts/run_external_validation.sh b/scripts/run_external_validation.sh
index 333d81c..1e8cdce 100755
--- a/scripts/run_external_validation.sh
+++ b/scripts/run_external_validation.sh
@@ -4,7 +4,7 @@
 # ==============================================================================
 # 
 # This script runs the temporal validation Python workflow non-interactively,
-# generating all figures and result files.
+# generating the canonical temporal-validation result files.
 #
 # Usage:
 #   bash scripts/run_external_validation.sh
@@ -14,8 +14,6 @@
 #   - results/external_0910_metrics.json
 #   - results/prevalence_check.json
 #   - results/decision_curve_external.json
-#   - figures/external_roc_pr_calibration.png
-#   - figures/decision_curve_external.png
 # ==============================================================================
 
 set -e  # Exit on error
@@ -45,8 +43,6 @@ FILES=(
     "results/external_0910_metrics.json"
     "results/prevalence_check.json"
     "results/decision_curve_external.json"
-    "figures/external_roc_pr_calibration.png"
-    "figures/decision_curve_external.png"
 )
 
 for file in "${FILES[@]}"; do
diff --git a/scripts/run_v13_primary.sh b/scripts/run_v13_primary.sh
index 5ae22e1..3b2c35e 100755
--- a/scripts/run_v13_primary.sh
+++ b/scripts/run_v13_primary.sh
@@ -24,12 +24,12 @@ echo ""
 
 # Check Python version
 echo "Python version:"
-python3 --version
+"${PYTHON:-python3}" --version
 echo ""
 
 # Check dependencies
 echo "Checking dependencies..."
-python3 -c "
+"${PYTHON:-python3}" -c "
 import pandas, numpy, sklearn, xgboost, catboost, lightgbm, optuna, shap
 print('  pandas:', pandas.__version__)
 print('  numpy:', numpy.__version__)