Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions docs/data/lens_fragments/wave2_rl.json

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions docs/data/research_lens.json

Large diffs are not rendered by default.

96 changes: 96 additions & 0 deletions labs/research/exp_overestimation_bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
r"""Code-verify the lens of `insight:q_learning_max_is_optimistically_biased`.

The lens claims: taking $\max_a$ over noisy action-value estimates is
systematically optimistic, because $\mathbb{E}[\max_a \hat Q(s,a)] \ge
\max_a \mathbb{E}[\hat Q(s,a)]$ (Jensen, since $\max$ is convex), and the bias
grows with the number of actions and the estimation noise; decoupling
"which action" from "its value" (the double estimator) removes it.

Falsifier (from the lens): if the single-estimator bias does NOT grow with the
action count, or the double estimator is not materially less biased, the claim
is wrong.

Setup (deliberately minimal and exactly analysable): all true action values are
equal, $q^\*(a)=0$, so $\max_a \mathbb{E}[\hat Q(a)] = 0$ and any positive
expected estimate is pure overestimation. Each action's value is estimated from
`m` noisy samples ~ N(0, sigma^2). We sweep the action count `n` and report the
mean estimate of each estimator over many trials. This is the driving-relevant micro-version of
why vanilla DQN over-values actions in large/branchy action spaces and why
Double-DQN was introduced.

Pure standard library; deterministic; CPU-only. Run:
python3 labs/research/exp_overestimation_bias.py
"""
from __future__ import annotations

import random
from pathlib import Path

from harness import Hypothesis, Study, mean

# The hypothesis this experiment tests. run() attaches it to the Study it
# returns; the fields are user-facing strings and are kept verbatim.
HYP = Hypothesis(
    node="insight:q_learning_max_is_optimistically_biased",
    claim="对等价动作的噪声估计取 max 会产生正偏差,且偏差随动作数 n 增长;把“选动作”与“估其值”解耦的双估计器几乎消除该偏差。",
    independent="动作数 n (number of equally-valued actions)",
    metric="期望估计值 E[V](真值为 0,越大=越乐观)",
    falsifier="若单估计器偏差不随 n 增长,或双估计器偏差未显著小于单估计器,则该洞察被推翻。",
)


def single_estimator(n: int, m: int, sigma: float, rng: random.Random) -> float:
    """Return the largest of ``n`` sample means of zero-mean Gaussian noise.

    Each of the ``n`` actions is estimated as the mean of ``m`` draws from
    ``rng.gauss(0.0, sigma)``; the same estimates are used both to pick the
    best action and to report its value.
    """
    action_means = []
    for _ in range(n):
        draws = [rng.gauss(0.0, sigma) for _ in range(m)]
        action_means.append(mean(draws))
    return max(action_means)


def double_estimator(n: int, m: int, sigma: float, rng: random.Random) -> float:
    """Return the double-estimator value over ``n`` zero-mean noisy actions.

    Every action gets two independent sample means of ``max(1, m // 2)``
    draws each. The action is chosen by the first set of means and the value
    returned is that action's mean from the second set.
    """
    per_set = m // 2 if m >= 2 else 1

    def sample_mean() -> float:
        return mean([rng.gauss(0.0, sigma) for _ in range(per_set)])

    selection, evaluation = [], []
    for _ in range(n):
        # Draw order matters for reproducibility: set A first, then set B.
        selection.append(sample_mean())
        evaluation.append(sample_mean())
    # index() returns the first position of the maximum, i.e. the earliest
    # action among ties.
    winner = selection.index(max(selection))
    return evaluation[winner]


def run(seed: int = 0, trials: int = 4000, m: int = 8, sigma: float = 1.0) -> Study:
    """Sweep the action count and record both estimators' mean value.

    For each action count in (2, 4, 8, 16, 32, 64) the single and then the
    double estimator are averaged over ``trials`` runs, all drawing from one
    ``random.Random(seed)`` stream, and recorded as two series on the Study.
    """
    rng = random.Random(seed)
    study = Study(hypothesis=HYP, seed=seed)
    action_counts = (2, 4, 8, 16, 32, 64)
    for n in action_counts:
        # Single first, then double: both share the stream, so order is part
        # of the reproducible result.
        single_bias = mean([single_estimator(n, m, sigma, rng) for _ in range(trials)])
        double_bias = mean([double_estimator(n, m, sigma, rng) for _ in range(trials)])
        study.record("single max (vanilla Q)", n, single_bias)
        study.record("double estimator (Double-Q)", n, double_bias)
    return study


def main() -> int:
    """Run the study, check it against its falsifier, and write the findings.

    The claim survives when the single-estimator bias rises strictly at every
    step of the sweep (and by more than 1e-3 overall), and the double
    estimator's absolute bias is below the single estimator's at every swept
    point where the latter exceeds 1e-3.

    Returns:
        0 if the claim survived, 1 otherwise (used as the process exit code).
    """
    study = run()
    single = dict(study.series["single max (vanilla Q)"])
    double = dict(study.series["double estimator (Double-Q)"])
    ns = sorted(single)
    # The verdict text asserts a monotone rise, so verify every consecutive
    # pair instead of only comparing the two endpoints.
    monotone = all(single[lo] < single[hi] for lo, hi in zip(ns, ns[1:]))
    grows = monotone and single[ns[-1]] > single[ns[0]] + 1e-3
    double_smaller = all(abs(double[n]) < single[n] for n in ns if single[n] > 1e-3)
    survived = grows and double_smaller
    verdict = ("洞察成立:单估计器偏差随动作数单调上升,双估计器把它压到接近 0。"
               if survived else "洞察未通过本次检验(见下)。")
    # Study.record stores x as float, so format the action counts with :g to
    # print "n=2" rather than "n=2.0".
    detail = (
        f"- 单估计器偏差从 n={ns[0]:g} 的 {single[ns[0]]:.3f} 升到 n={ns[-1]:g} 的 {single[ns[-1]]:.3f}"
        f"(真值为 0,全部是乐观高估)。\n"
        f"- 双估计器在同一范围内保持在 [{min(double.values()):.3f}, {max(double.values()):.3f}],"
        f"几乎无偏。\n"
        f"- 偏差随动作数增长印证了 Jensen 间隙 $\\mathbb{{E}}[\\max]\\ge\\max\\mathbb{{E}}$ 的方向;"
        f"对自动驾驶里动作空间大、Q 估计有噪声的价值方法,这正是 Double-DQN 一类解耦估计的动机。\n"
        f"- 复现:`python3 labs/research/exp_overestimation_bias.py`(seed={study.seed},确定性)。"
    )
    out = Path(__file__).resolve().parent / "results" / "overestimation_bias"
    path = study.write(out, verdict, detail)
    print(study.results_table())
    print(verdict)
    # Report the output path relative to the directory two levels above this
    # script's directory.
    print(f"wrote {path.relative_to(Path(__file__).resolve().parents[2])} + figure.svg")
    return 0 if survived else 1


# Script entry point: main()'s return value (0 = claim survived, 1 = not)
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
126 changes: 126 additions & 0 deletions labs/research/harness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""A tiny, dependency-free research harness.

The atlas points every core node at a *falsifiable* next experiment (the
"研究透镜 / Research lens"). This harness is the other half: it turns such a
hypothesis into a runnable study that sweeps one independent variable, records a
metric, decides whether the hypothesis survived its own falsifier, and emits a
results table, an SVG figure, and a findings note.

Design choices:
* Pure standard library (random/math/statistics) — runs on a free CPU, in CI,
or in Colab with no install, matching the rest of labs/.
* Deterministic: every study fixes a seed, so a result is reproducible and a
reviewer can re-run it byte-for-byte.
* The unit of work is a Hypothesis tied to a graph node id, so a study is
traceable back to the exact place in the atlas it came from.
"""
from __future__ import annotations

import math
import statistics
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable


@dataclass
class Hypothesis:
    """One falsifiable claim, anchored to the atlas node it tests.

    Attributes:
        node: Graph node id this experiment tests, e.g. "insight:...".
        claim: The prediction, stated in one sentence.
        independent: The variable being swept (with units).
        metric: What is measured.
        falsifier: The condition that, if observed, refutes the claim.
    """

    node: str
    claim: str
    independent: str
    metric: str
    falsifier: str


@dataclass
class Study:
    """Recorded results of one sweep for a Hypothesis, plus its reports.

    Points are collected per named series through record(). The reporting
    methods render them as a Markdown table, an SVG line chart and a findings
    note; write() saves the chart and the note to a directory.
    """

    hypothesis: Hypothesis
    # Echoed in the findings note; this class itself draws no random numbers.
    seed: int = 0
    # series name -> list of (x, y) points, appended to by record()
    series: dict[str, list[tuple[float, float]]] = field(default_factory=dict)
    # Free-form notes; not read by any method of this class.
    notes: list[str] = field(default_factory=list)

    def record(self, series_name: str, x: float, y: float) -> None:
        """Append the point (x, y), both cast to float, to ``series_name``."""
        self.series.setdefault(series_name, []).append((float(x), float(y)))

    # ---- reporting -------------------------------------------------------
    def results_table(self) -> str:
        """Return the series as a Markdown table with one row per distinct x.

        The first column is the independent variable; each further column is
        one series, in insertion order. A series with no point at a given x
        shows an em dash in that row.
        """
        xs = sorted({x for pts in self.series.values() for x, _ in pts})
        names = list(self.series)
        head = "| " + " | ".join([self.hypothesis.independent] + names) + " |\n"
        head += "|" + "|".join(["---"] * (len(names) + 1)) + "|\n"
        lookup = {n: dict(pts) for n, pts in self.series.items()}
        rows = []
        for x in xs:
            cells = [f"{x:g}"] + [
                (f"{lookup[n][x]:.4f}" if x in lookup[n] else "—") for n in names
            ]
            rows.append("| " + " | ".join(cells) + " |")
        return head + "\n".join(rows) + "\n"

    def svg(self, width: int = 640, height: int = 360, pad: int = 48) -> str:
        """Return an SVG line chart of every series, or "<svg/>" if empty.

        Args:
            width: viewBox width.
            height: viewBox height.
            pad: margin between the viewBox edge and the plot area.
        """
        pts_all = [(x, y) for pts in self.series.values() for x, y in pts]
        if not pts_all:
            return "<svg/>"
        xs = [x for x, _ in pts_all]
        ys = [y for _, y in pts_all]
        xmin, xmax = min(xs), max(xs)
        # Keep y = 0 inside the plotted range on BOTH sides. Clamping only
        # ymin put the dashed zero line outside the viewBox whenever every
        # recorded value was negative.
        ymin = min(min(ys), 0.0)
        ymax = max(max(ys), 0.0)
        # Avoid a zero-width range (division by zero in the scalers below).
        if xmax == xmin:
            xmax = xmin + 1
        if ymax == ymin:
            ymax = ymin + 1

        def sx(x):
            return pad + (x - xmin) / (xmax - xmin) * (width - 2 * pad)

        def sy(y):
            # SVG y grows downwards, so larger values map to smaller y.
            return height - pad - (y - ymin) / (ymax - ymin) * (height - 2 * pad)

        palette = ["#6cb1ff", "#f97316", "#a78bfa", "#5fd38d", "#fcd34d"]
        out = [f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {width} {height}" font-family="system-ui,sans-serif" font-size="12">']
        out.append(f'<rect width="{width}" height="{height}" fill="#0b1020"/>')
        # zero line + axes
        y0 = sy(0.0)
        out.append(f'<line x1="{pad}" y1="{y0:.1f}" x2="{width-pad}" y2="{y0:.1f}" stroke="#33415c" stroke-dasharray="4 4"/>')
        out.append(f'<line x1="{pad}" y1="{pad}" x2="{pad}" y2="{height-pad}" stroke="#475569"/>')
        out.append(f'<line x1="{pad}" y1="{height-pad}" x2="{width-pad}" y2="{height-pad}" stroke="#475569"/>')
        out.append(f'<text x="{width/2:.0f}" y="{height-12}" fill="#9aa6b8" text-anchor="middle">{_esc(self.hypothesis.independent)}</text>')
        out.append(f'<text x="14" y="{height/2:.0f}" fill="#9aa6b8" text-anchor="middle" transform="rotate(-90 14 {height/2:.0f})">{_esc(self.hypothesis.metric)}</text>')
        for i, (name, pts) in enumerate(self.series.items()):
            # Colours repeat once there are more series than palette entries.
            color = palette[i % len(palette)]
            pts_sorted = sorted(pts)
            d = " ".join(f"{'M' if j==0 else 'L'}{sx(x):.1f},{sy(y):.1f}" for j, (x, y) in enumerate(pts_sorted))
            out.append(f'<path d="{d}" fill="none" stroke="{color}" stroke-width="2"/>')
            for x, y in pts_sorted:
                out.append(f'<circle cx="{sx(x):.1f}" cy="{sy(y):.1f}" r="2.5" fill="{color}"/>')
            # Legend entry: one row per series, stacked from the top-left.
            ly = pad + 14 + i * 18
            out.append(f'<rect x="{pad+10}" y="{ly-9}" width="14" height="3" fill="{color}"/>')
            out.append(f'<text x="{pad+30}" y="{ly}" fill="{color}">{_esc(name)}</text>')
        out.append("</svg>")
        return "\n".join(out)

    def findings(self, verdict: str, detail: str) -> str:
        """Return the Markdown findings note.

        The note lists the hypothesis fields, embeds results_table(), links
        ``figure.svg``, and ends with ``verdict`` (in bold) and ``detail``.
        """
        h = self.hypothesis
        return (
            f"# Findings — {h.node}\n\n"
            f"> 这是由 [研究透镜] 中的可证伪实验自动跑出的结果,可被任何人原样复现"
            f"(固定随机种子 seed={self.seed})。\n\n"
            f"**被检验的论断 / Claim.** {h.claim}\n\n"
            f"**自变量 / Independent variable.** {h.independent}\n\n"
            f"**度量 / Metric.** {h.metric}\n\n"
            f"**证伪条件 / Falsifier.** {h.falsifier}\n\n"
            f"## 结果 / Results\n\n{self.results_table()}\n"
            f"![figure](figure.svg)\n\n"
            f"## 结论 / Verdict\n\n**{verdict}**\n\n{detail}\n"
        )

    def write(self, out_dir: str | Path, verdict: str, detail: str) -> Path:
        """Write ``figure.svg`` and ``findings.md`` into ``out_dir``.

        The directory is created if missing. Returns the path of
        ``findings.md``.
        """
        d = Path(out_dir)
        d.mkdir(parents=True, exist_ok=True)
        (d / "figure.svg").write_text(self.svg(), encoding="utf-8")
        path = d / "findings.md"
        path.write_text(self.findings(verdict, detail), encoding="utf-8")
        return path


def _esc(s: str) -> str:
    """Return ``str(s)`` with ``&``, ``<`` and ``>`` replaced by XML entities."""
    text = str(s)
    # "&" must go first so the entities added afterwards are not re-escaped.
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
        text = text.replace(raw, entity)
    return text


def mean(xs) -> float:
    """Return the arithmetic mean of ``xs`` as a float via ``statistics.fmean``."""
    average = statistics.fmean(xs)
    return average
26 changes: 26 additions & 0 deletions labs/research/results/overestimation_bias/figure.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
33 changes: 33 additions & 0 deletions labs/research/results/overestimation_bias/findings.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Findings — insight:q_learning_max_is_optimistically_biased

> 这是由 [研究透镜] 中的可证伪实验自动跑出的结果,可被任何人原样复现(固定随机种子 seed=0)。

**被检验的论断 / Claim.** 对等价动作的噪声估计取 max 会产生正偏差,且偏差随动作数 n 增长;把“选动作”与“估其值”解耦的双估计器几乎消除该偏差。

**自变量 / Independent variable.** 动作数 n (number of equally-valued actions)

**度量 / Metric.** 期望估计值 E[V](真值为 0,越大=越乐观)

**证伪条件 / Falsifier.** 若单估计器偏差不随 n 增长,或双估计器偏差未显著小于单估计器,则该洞察被推翻。

## 结果 / Results

| 动作数 n (number of equally-valued actions) | single max (vanilla Q) | double estimator (Double-Q) |
|---|---|---|
| 2 | 0.2046 | -0.0082 |
| 4 | 0.3700 | 0.0024 |
| 8 | 0.4995 | -0.0030 |
| 16 | 0.6229 | 0.0005 |
| 32 | 0.7329 | 0.0017 |
| 64 | 0.8281 | -0.0037 |

![figure](figure.svg)

## 结论 / Verdict

**洞察成立:单估计器偏差随动作数单调上升,双估计器把它压到接近 0。**

- 单估计器偏差从 n=2 的 0.205 升到 n=64 的 0.828(真值为 0,全部是乐观高估)。
- 双估计器在同一范围内保持在 [-0.008, 0.002],几乎无偏。
- 单估计器的偏差随动作数增长,印证了 Jensen 间隙 $\mathbb{E}[\max]\ge\max\mathbb{E}$ 的方向;对自动驾驶里动作空间大、Q 估计有噪声的价值方法,这正是 Double-DQN 一类解耦估计的动机。
- 复现:`python3 labs/research/exp_overestimation_bias.py`(seed=0,确定性)。
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# 论文骨架 · 由「离线 RL 本质上是带约束的动态规划」生成

> 这是一份**可编辑的论文草稿骨架**,由图谱节点 `insight:offline_rl_is_actually_constrained_dynamic_programming` 的研究透镜与其关系网络自动组装。每一节都被钉在一个具体、可证伪的论断与它的上下文文献上——用来消除空白页,而不是替代你的研究。

## 摘要骨架 / Abstract skeleton
- **问题**:当最优动作根本不在支撑内(日志里从未出现"紧急左打方向"),没有任何离线算法能学出它——这是信息缺失,不是优化失败。另一端,惩罚强度 $\alpha$ 过大时连支撑内的 Bellman 备份也被压平,策略退化为行为克隆;$\alpha$ 的可用区间通常很窄,需要在留出集上扫调。
- **关键观察**:数据集 $\mathcal{D}$ 的状态-动作支撑 $\mathrm{supp}(\pi_\beta)$ 已经覆盖了通向高回报所必需的关键动作;惩罚项只压制支撑外的乐观,而不伤害支撑内的价值传播。换句话说,约束动态规划能达到的上界,被数据覆盖范围而非算法本身钉死。
- **做法**:(填:你用下方“方法骨架”中的原语如何组合出新方法)
- **可证伪的主张**:固定一份驾驶日志,按动作类型系统性"挖洞",剔除 5% / 10% / 20% 的支撑,测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线,再与数据总量做对照回归。可证伪的预测:闭环性能由覆盖率单调决定、与总量近似无关;若总量能补偿覆盖缺失,则本洞察被推翻。

## 1. 研究问题 / Research question
固定一份驾驶日志,按动作类型系统性"挖洞",剔除 5% / 10% / 20% 的支撑,测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线,再与数据总量做对照回归。可证伪的预测:闭环性能由覆盖率单调决定、与总量近似无关;若总量能补偿覆盖缺失,则本洞察被推翻。

## 2. 背景与定位 / Related work(来自关系网络)
(该节点邻接稀疏;先在图谱里补边,定位会更准。)

## 3. 关键假设 / Load-bearing assumption
数据集 $\mathcal{D}$ 的状态-动作支撑 $\mathrm{supp}(\pi_\beta)$ 已经覆盖了通向高回报所必需的关键动作;惩罚项只压制支撑外的乐观,而不伤害支撑内的价值传播。换句话说,约束动态规划能达到的上界,被数据覆盖范围而非算法本身钉死。

## 4. 现有方法的失效边界 / The gap
当最优动作根本不在支撑内(日志里从未出现"紧急左打方向"),没有任何离线算法能学出它——这是信息缺失,不是优化失败。另一端,惩罚强度 $\alpha$ 过大时连支撑内的 Bellman 备份也被压平,策略退化为行为克隆;$\alpha$ 的可用区间通常很窄,需要在留出集上扫调。

## 5. 方法骨架 / Method(可复用的研究原语)
(该节点未直接连到方法学原语 `move:*`;从第 2 节的先修工作里挑组件,或在图谱里补 `composes` 边。)

## 6. 跨域先验 / Cross-domain prior
同一结构出现在监督学习的"经验风险 $\le$ 真实风险 + 复杂度项"、离策略评估的 importance-sampling 权重截断、模型预测控制的可行域约束 $u\in\mathcal{U}$、推荐系统的 propensity clipping——都是"在可信区域内最优化、对区域外保持悲观"。

## 7. 实验设计 / Experiment plan
固定一份驾驶日志,按动作类型系统性"挖洞",剔除 5% / 10% / 20% 的支撑,测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线,再与数据总量做对照回归。可证伪的预测:闭环性能由覆盖率单调决定、与总量近似无关;若总量能补偿覆盖缺失,则本洞察被推翻。

可复现脚手架:`labs/research/harness.py` 提供固定随机种子的扫描-度量-出图-下结论流水线;参见已跑通的范例 `labs/research/exp_overestimation_bias.py`(它把一条研究透镜的论断用代码验了一遍)。

## 8. 可复现性 / Reproducibility
- 暂无 `implements` 边指向现成代码;用第 7 节的 harness 起一个最小可复现实验。

## 9. 证伪判据 / Falsification criteria
明确写下“看到什么结果就说明本主张错了”。一个无法被证伪的主张不值得投稿——
固定一份驾驶日志,按动作类型系统性"挖洞",剔除 5% / 10% / 20% 的支撑,测 [CQL](paper_cql.md) 与 [IQL](paper_iql.md) 的闭环成功率随支撑覆盖率的衰减曲线,再与数据总量做对照回归。可证伪的预测:闭环性能由覆盖率单调决定、与总量近似无关;若总量能补偿覆盖缺失,则本洞察被推翻。

---
<sub>seeded from atlas node `insight:offline_rl_is_actually_constrained_dynamic_programming` · 关系网络 + 研究透镜自动组装 · 由 tools/scaffold_paper.py 生成</sub>
Loading
Loading