ChatGPU · ChatGPU · May 29, 2026 · May 29, 2026
diff --git a/docs/data/lens_fragments/wave2_rl.json b/docs/data/lens_fragments/wave2_rl.json
diff --git a/docs/data/research_lens.json b/docs/data/research_lens.json
diff --git a/labs/research/exp_overestimation_bias.py b/labs/research/exp_overestimation_bias.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+r"""Code-verify the lens of `insight:q_learning_max_is_optimistically_biased`.
+
+The lens claims: taking $\max_a$ over noisy action-value estimates is
+systematically optimistic, because $\mathbb{E}[\max_a \hat Q(s,a)] \ge
+\max_a \mathbb{E}[\hat Q(s,a)]$ (Jensen, since $\max$ is convex), and the bias
+grows with the number of actions and the estimation noise; decoupling
+"which action" from "its value" (the double estimator) removes it.
+
+Falsifier (from the lens): if the single-estimator bias does NOT grow with the
+action count, or the double estimator is not materially less biased, the claim
+is wrong.
+
+Setup (deliberately minimal and exactly analysable): all true action values are
+equal, $q^\*(a)=0$, so $\max_a \mathbb{E}[\hat Q(a)] = 0$ and any positive
+expected estimate is pure overestimation. Each action's value is estimated from
+`m` noisy samples ~ N(0, sigma^2). We sweep the action count `n` and report each
+estimator's mean estimate over many trials. This is the driving-relevant micro-version of
+why vanilla DQN over-values actions in large/branchy action spaces and why
+Double-DQN was introduced.
+
+Pure standard library; deterministic; CPU-only. Run:
+    python3 labs/research/exp_overestimation_bias.py
+"""
+from __future__ import annotations
+
+import random
+from pathlib import Path
+
+from harness import Hypothesis, Study, mean
+
+# The hypothesis under test, keyed by its atlas node id. The text fields are
+# runtime strings: the harness passes them through to the results table, the
+# figure axis labels, and the findings note.
+HYP = Hypothesis(
+    node="insight:q_learning_max_is_optimistically_biased",
+    claim="对等价动作的噪声估计取 max 会产生正偏差，且偏差随动作数 n 增长；把“选动作”与“估其值”解耦的双估计器几乎消除该偏差。",
+    independent="动作数 n (number of equally-valued actions)",
+    metric="期望估计值 E[V]（真值为 0，越大=越乐观）",
+    falsifier="若单估计器偏差不随 n 增长，或双估计器偏差未显著小于单估计器，则该洞察被推翻。",
+)
+
+
+def single_estimator(n: int, m: int, sigma: float, rng: random.Random) -> float:
+    """max_a mean of m noisy samples of a zero-mean value."""
+    return max(mean(rng.gauss(0.0, sigma) for _ in range(m)) for _ in range(n))
+
+
+def double_estimator(n: int, m: int, sigma: float, rng: random.Random) -> float:
+    """Split samples: pick argmax on set A, evaluate that action on set B."""
+    half = max(1, m // 2)
+    means_a, means_b = [], []
+    for _ in range(n):
+        means_a.append(mean(rng.gauss(0.0, sigma) for _ in range(half)))
+        means_b.append(mean(rng.gauss(0.0, sigma) for _ in range(half)))
+    a_star = max(range(n), key=lambda i: means_a[i])
+    return means_b[a_star]
+
+
+def run(seed: int = 0, trials: int = 4000, m: int = 8, sigma: float = 1.0) -> Study:
+    study = Study(hypothesis=HYP, seed=seed)
+    rng = random.Random(seed)
+    for n in (2, 4, 8, 16, 32, 64):
+        s = mean(single_estimator(n, m, sigma, rng) for _ in range(trials))
+        d = mean(double_estimator(n, m, sigma, rng) for _ in range(trials))
+        study.record("single max (vanilla Q)", n, s)
+        study.record("double estimator (Double-Q)", n, d)
+    return study
+
+
+def main() -> int:
+    study = run()
+    single = dict(study.series["single max (vanilla Q)"])
+    double = dict(study.series["double estimator (Double-Q)"])
+    ns = sorted(single)
+    grows = single[ns[-1]] > single[ns[0]] + 1e-3
+    double_smaller = all(abs(double[n]) < single[n] for n in ns if single[n] > 1e-3)
+    survived = grows and double_smaller
+    verdict = ("洞察成立：单估计器偏差随动作数单调上升，双估计器把它压到接近 0。"
+               if survived else "洞察未通过本次检验（见下）。")
+    detail = (
+        f"- 单估计器偏差从 n={ns[0]} 的 {single[ns[0]]:.3f} 升到 n={ns[-1]} 的 {single[ns[-1]]:.3f}"
+        f"（真值为 0，全部是乐观高估）。\n"
+        f"- 双估计器在同一范围内保持在 [{min(double.values()):.3f}, {max(double.values()):.3f}]，"
+        f"几乎无偏。\n"
+        f"- 自变量随动作数增长印证了 Jensen 间隙 $\\mathbb{{E}}[\\max]\\ge\\max\\mathbb{{E}}$ 的方向；"
+        f"对自动驾驶里动作空间大、Q 估计有噪声的价值方法，这正是 Double-DQN 一类解耦估计的动机。\n"
+        f"- 复现：`python3 labs/research/exp_overestimation_bias.py`（seed={study.seed}，确定性）。"
+    )
+    out = Path(__file__).resolve().parent / "results" / "overestimation_bias"
+    path = study.write(out, verdict, detail)
+    print(study.results_table())
+    print(verdict)
+    print(f"wrote {path.relative_to(Path(__file__).resolve().parents[2])} + figure.svg")
+    return 0 if survived else 1
+
+
+# Script entry point: main()'s return value becomes the process exit status
+# (0 = hypothesis survived, 1 = it did not).
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/labs/research/harness.py b/labs/research/harness.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""A tiny, dependency-free research harness.
+
+The atlas points every core node at a *falsifiable* next experiment (the
+"研究透镜 / Research lens"). This harness is the other half: it turns such a
+hypothesis into a runnable study that sweeps one independent variable, records a
+metric, decides whether the hypothesis survived its own falsifier, and emits a
+results table, an SVG figure, and a findings note.
+
+Design choices:
+  * Pure standard library (random/math/statistics) — runs on a free CPU, in CI,
+    or in Colab with no install, matching the rest of labs/.
+  * Deterministic: every study fixes a seed, so a result is reproducible and a
+    reviewer can re-run it byte-for-byte.
+  * The unit of work is a Hypothesis tied to a graph node id, so a study is
+    traceable back to the exact place in the atlas it came from.
+"""
+from __future__ import annotations
+
+import math
+import statistics
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable
+
+
+@dataclass
+class Hypothesis:
+    """A falsifiable claim attached to a node in the atlas."""
+    node: str            # graph node id this experiment tests, e.g. "insight:..."
+    claim: str           # the prediction in one sentence
+    independent: str     # the variable being swept (with units)
+    metric: str          # what is measured
+    falsifier: str       # the condition that, if observed, refutes the claim
+
+
+@dataclass
+class Study:
+    hypothesis: Hypothesis
+    seed: int = 0
+    # series name -> list of (x, y) points, filled by run()
+    series: dict[str, list[tuple[float, float]]] = field(default_factory=dict)
+    notes: list[str] = field(default_factory=list)
+
+    def record(self, series_name: str, x: float, y: float) -> None:
+        self.series.setdefault(series_name, []).append((float(x), float(y)))
+
+    # ---- reporting -------------------------------------------------------
+    def results_table(self) -> str:
+        xs = sorted({x for pts in self.series.values() for x, _ in pts})
+        names = list(self.series)
+        head = "| " + " | ".join([self.hypothesis.independent] + names) + " |\n"
+        head += "|" + "|".join(["---"] * (len(names) + 1)) + "|\n"
+        lookup = {n: dict(pts) for n, pts in self.series.items()}
+        rows = []
+        for x in xs:
+            cells = [f"{x:g}"] + [
+                (f"{lookup[n].get(x):.4f}" if x in lookup[n] else "—") for n in names
+            ]
+            rows.append("| " + " | ".join(cells) + " |")
+        return head + "\n".join(rows) + "\n"
+
+    def svg(self, width: int = 640, height: int = 360, pad: int = 48) -> str:
+        pts_all = [(x, y) for pts in self.series.values() for x, y in pts]
+        if not pts_all:
+            return "<svg/>"
+        xmin = min(x for x, _ in pts_all); xmax = max(x for x, _ in pts_all)
+        ymin = min(y for _, y in pts_all); ymax = max(y for _, y in pts_all)
+        ymin = min(ymin, 0.0)
+        if xmax == xmin: xmax = xmin + 1
+        if ymax == ymin: ymax = ymin + 1
+        def sx(x): return pad + (x - xmin) / (xmax - xmin) * (width - 2 * pad)
+        def sy(y): return height - pad - (y - ymin) / (ymax - ymin) * (height - 2 * pad)
+        palette = ["#6cb1ff", "#f97316", "#a78bfa", "#5fd38d", "#fcd34d"]
+        out = [f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {width} {height}" font-family="system-ui,sans-serif" font-size="12">']
+        out.append(f'<rect width="{width}" height="{height}" fill="#0b1020"/>')
+        # zero line + axes
+        y0 = sy(0.0)
+        out.append(f'<line x1="{pad}" y1="{y0:.1f}" x2="{width-pad}" y2="{y0:.1f}" stroke="#33415c" stroke-dasharray="4 4"/>')
+        out.append(f'<line x1="{pad}" y1="{pad}" x2="{pad}" y2="{height-pad}" stroke="#475569"/>')
+        out.append(f'<line x1="{pad}" y1="{height-pad}" x2="{width-pad}" y2="{height-pad}" stroke="#475569"/>')
+        out.append(f'<text x="{width/2:.0f}" y="{height-12}" fill="#9aa6b8" text-anchor="middle">{_esc(self.hypothesis.independent)}</text>')
+        out.append(f'<text x="14" y="{height/2:.0f}" fill="#9aa6b8" text-anchor="middle" transform="rotate(-90 14 {height/2:.0f})">{_esc(self.hypothesis.metric)}</text>')
+        for i, (name, pts) in enumerate(self.series.items()):
+            color = palette[i % len(palette)]
+            pts_sorted = sorted(pts)
+            d = " ".join(f"{'M' if j==0 else 'L'}{sx(x):.1f},{sy(y):.1f}" for j, (x, y) in enumerate(pts_sorted))
+            out.append(f'<path d="{d}" fill="none" stroke="{color}" stroke-width="2"/>')
+            for x, y in pts_sorted:
+                out.append(f'<circle cx="{sx(x):.1f}" cy="{sy(y):.1f}" r="2.5" fill="{color}"/>')
+            ly = pad + 14 + i * 18
+            out.append(f'<rect x="{pad+10}" y="{ly-9}" width="14" height="3" fill="{color}"/>')
+            out.append(f'<text x="{pad+30}" y="{ly}" fill="{color}">{_esc(name)}</text>')
+        out.append("</svg>")
+        return "\n".join(out)
+
+    def findings(self, verdict: str, detail: str) -> str:
+        h = self.hypothesis
+        return (
+            f"# Findings — {h.node}\n\n"
+            f"> 这是由 [研究透镜] 中的可证伪实验自动跑出的结果，可被任何人原样复现"
+            f"（固定随机种子 seed={self.seed}）。\n\n"
+            f"**被检验的论断 / Claim.** {h.claim}\n\n"
+            f"**自变量 / Independent variable.** {h.independent}\n\n"
+            f"**度量 / Metric.** {h.metric}\n\n"
+            f"**证伪条件 / Falsifier.** {h.falsifier}\n\n"
+            f"## 结果 / Results\n\n{self.results_table()}\n"
+            f"![figure](figure.svg)\n\n"
+            f"## 结论 / Verdict\n\n**{verdict}**\n\n{detail}\n"
+        )
+
+    def write(self, out_dir: str | Path, verdict: str, detail: str) -> Path:
+        d = Path(out_dir)
+        d.mkdir(parents=True, exist_ok=True)
+        (d / "figure.svg").write_text(self.svg(), encoding="utf-8")
+        path = d / "findings.md"
+        path.write_text(self.findings(verdict, detail), encoding="utf-8")
+        return path
+
+
+def _esc(s: str) -> str:
+    return (str(s).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;"))
+
+
+def mean(xs) -> float:
+    return statistics.fmean(xs)
diff --git a/labs/research/results/overestimation_bias/figure.svg b/labs/research/results/overestimation_bias/figure.svg
diff --git a/labs/research/results/overestimation_bias/findings.md b/labs/research/results/overestimation_bias/findings.md
@@ -0,0 +1,33 @@
+# Findings — insight:q_learning_max_is_optimistically_biased
+
+> 这是由 [研究透镜] 中的可证伪实验自动跑出的结果，可被任何人原样复现（固定随机种子 seed=0）。
+
+**被检验的论断 / Claim.** 对等价动作的噪声估计取 max 会产生正偏差，且偏差随动作数 n 增长；把“选动作”与“估其值”解耦的双估计器几乎消除该偏差。
+
+**自变量 / Independent variable.** 动作数 n (number of equally-valued actions)
+
+**度量 / Metric.** 期望估计值 E[V]（真值为 0，越大=越乐观）
+
+**证伪条件 / Falsifier.** 若单估计器偏差不随 n 增长，或双估计器偏差未显著小于单估计器，则该洞察被推翻。
+
+## 结果 / Results
+
+| 动作数 n (number of equally-valued actions) | single max (vanilla Q) | double estimator (Double-Q) |
+|---|---|---|
+| 2 | 0.2046 | -0.0082 |
+| 4 | 0.3700 | 0.0024 |
+| 8 | 0.4995 | -0.0030 |
+| 16 | 0.6229 | 0.0005 |
+| 32 | 0.7329 | 0.0017 |
+| 64 | 0.8281 | -0.0037 |
+
+![figure](figure.svg)
+
+## 结论 / Verdict
+
+**洞察成立：单估计器偏差随动作数单调上升，双估计器把它压到接近 0。**
+
+- 单估计器偏差从 n=2 的 0.205 升到 n=64 的 0.828（真值为 0，全部是乐观高估）。
+- 双估计器在同一范围内保持在 [-0.008, 0.002]，几乎无偏。
+- 单估计器偏差随动作数增长印证了 Jensen 间隙 $\mathbb{E}[\max]\ge\max\mathbb{E}$ 的方向；对自动驾驶里动作空间大、Q 估计有噪声的价值方法，这正是 Double-DQN 一类解耦估计的动机。
+- 复现：`python3 labs/research/exp_overestimation_bias.py`（seed=0，确定性）。
diff --git a/research/drafts/insight_offline_rl_is_actually_constrained_dynamic_programming.md b/research/drafts/insight_offline_rl_is_actually_constrained_dynamic_programming.md
@@ -0,0 +1,42 @@
+# 论文骨架 · 由「离线 RL 本质上是带约束的动态规划」生成
+
+> 这是一份**可编辑的论文草稿骨架**，由图谱节点 `insight:offline_rl_is_actually_constrained_dynamic_programming` 的研究透镜与其关系网络自动组装。每一节都被钉在一个具体、可证伪的论断与它的上下文文献上——用来消除空白页，而不是替代你的研究。
+
+## 摘要骨架 / Abstract skeleton
+- **问题**：当最优动作根本不在支撑内（日志里从未出现"紧急左打方向"），没有任何离线算法能学出它——这是信息缺失，不是优化失败。另一端，惩罚强度 $\alpha$ 过大时连支撑内的 Bellman 备份也被压平，策略退化为行为克隆；$\alpha$ 的可用区间通常很窄，需要在留出集上扫调。
+- **关键观察**：数据集 $\mathcal{D}$ 的状态-动作支撑 $\mathrm{supp}(\pi_\beta)$ 已经覆盖了通向高回报所必需的关键动作；惩罚项只压制支撑外的乐观，而不伤害支撑内的价值传播。换句话说，约束动态规划能达到的上界，被数据覆盖范围而非算法本身钉死。
+- **做法**：（填：你用下方“方法骨架”中的原语如何组合出新方法）
+- **可证伪的主张**：固定一份驾驶日志，按动作类型系统性"挖洞"，剔除 5% / 10% / 20% 的支撑，测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线，再与数据总量做对照回归。可证伪的预测：闭环性能由覆盖率单调决定、与总量近似无关；若总量能补偿覆盖缺失，则本洞察被推翻。
+
+## 1. 研究问题 / Research question
+固定一份驾驶日志，按动作类型系统性"挖洞"，剔除 5% / 10% / 20% 的支撑，测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线，再与数据总量做对照回归。可证伪的预测：闭环性能由覆盖率单调决定、与总量近似无关；若总量能补偿覆盖缺失，则本洞察被推翻。
+
+## 2. 背景与定位 / Related work（来自关系网络）
+（该节点邻接稀疏；先在图谱里补边，定位会更准。）
+
+## 3. 关键假设 / Load-bearing assumption
+数据集 $\mathcal{D}$ 的状态-动作支撑 $\mathrm{supp}(\pi_\beta)$ 已经覆盖了通向高回报所必需的关键动作；惩罚项只压制支撑外的乐观，而不伤害支撑内的价值传播。换句话说，约束动态规划能达到的上界，被数据覆盖范围而非算法本身钉死。
+
+## 4. 现有方法的失效边界 / The gap
+当最优动作根本不在支撑内（日志里从未出现"紧急左打方向"），没有任何离线算法能学出它——这是信息缺失，不是优化失败。另一端，惩罚强度 $\alpha$ 过大时连支撑内的 Bellman 备份也被压平，策略退化为行为克隆；$\alpha$ 的可用区间通常很窄，需要在留出集上扫调。
+
+## 5. 方法骨架 / Method（可复用的研究原语）
+（该节点未直接连到方法学原语 `move:*`；从第 2 节的先修工作里挑组件，或在图谱里补 `composes` 边。）
+
+## 6. 跨域先验 / Cross-domain prior
+同一结构出现在监督学习的"经验风险 $\le$ 真实风险 + 复杂度项"、离策略评估的 importance-sampling 权重截断、模型预测控制的可行域约束 $u\in\mathcal{U}$、推荐系统的 propensity clipping——都是"在可信区域内最优化、对区域外保持悲观"。
+
+## 7. 实验设计 / Experiment plan
+固定一份驾驶日志，按动作类型系统性"挖洞"，剔除 5% / 10% / 20% 的支撑，测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线，再与数据总量做对照回归。可证伪的预测：闭环性能由覆盖率单调决定、与总量近似无关；若总量能补偿覆盖缺失，则本洞察被推翻。
+
+可复现脚手架：`labs/research/harness.py` 提供固定随机种子的扫描-度量-出图-下结论流水线；参见已跑通的范例 `labs/research/exp_overestimation_bias.py`（它把一条研究透镜的论断用代码验了一遍）。
+
+## 8. 可复现性 / Reproducibility
+- 暂无 `implements` 边指向现成代码；用第 7 节的 harness 起一个最小可复现实验。
+
+## 9. 证伪判据 / Falsification criteria
+明确写下“看到什么结果就说明本主张错了”。一个无法被证伪的主张不值得投稿——
+固定一份驾驶日志，按动作类型系统性"挖洞"，剔除 5% / 10% / 20% 的支撑，测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线，再与数据总量做对照回归。可证伪的预测：闭环性能由覆盖率单调决定、与总量近似无关；若总量能补偿覆盖缺失，则本洞察被推翻。
+
+---
+<sub>seeded from atlas node `insight:offline_rl_is_actually_constrained_dynamic_programming` · 关系网络 + 研究透镜自动组装 · 由 tools/scaffold_paper.py 生成</sub>