diff --git a/docs/source/datasets/fpa_fod_tabular.rst b/docs/source/datasets/fpa_fod_tabular.rst new file mode 100644 index 00000000..2d1136f4 --- /dev/null +++ b/docs/source/datasets/fpa_fod_tabular.rst @@ -0,0 +1,38 @@ +FPA FOD — Tabular Classification Dataset +======================================== + +What it is +---------- + +This dataset provides a tabular supervised-learning view derived from FPA FOD wildfire records. +One sample corresponds to one incident (or one processed record). + +Data source and licensing +------------------------- + +The raw FPA FOD dataset is large and must be obtained by the user. PyHazards does not ship the raw data +due to licensing/size constraints. + +How to provide data +------------------- + +1. Obtain the FPA FOD sqlite (user-provided). +2. Place it at the path expected by the dataset builder (see project README/scripts). +3. Run the preprocess/build step to produce processed dataset artifacts. + +Returned tensors (contract) +--------------------------- + +A single sample returns: +- ``x``: float tensor of shape ``[input_dim]`` +- ``y``: integer class id (scalar) + +A batch returns: +- ``x``: ``[B, input_dim]`` +- ``y``: ``[B]`` + +Micro dataset for CI +-------------------- + +For CI/testing: set ``micro=True`` to use deterministic synthetic data that matches the schema and shapes, +so tests run without requiring the raw FPA FOD sqlite. diff --git a/docs/source/datasets/fpa_fod_weekly.rst b/docs/source/datasets/fpa_fod_weekly.rst new file mode 100644 index 00000000..1d3e441c --- /dev/null +++ b/docs/source/datasets/fpa_fod_weekly.rst @@ -0,0 +1,31 @@ +FPA FOD — Weekly Forecasting Dataset +==================================== + +What it is +---------- + +This dataset provides weekly sequences for forecasting wildfire activity. +One sample corresponds to a sequence of weekly feature vectors. + +Data source and licensing +------------------------- + +The raw FPA FOD dataset must be obtained by the user. PyHazards does not ship the raw data due to +licensing/size constraints. + +Returned tensors (contract) +--------------------------- + +A single sample returns: +- ``x``: float tensor of shape ``[T, input_dim]`` +- ``y``: float tensor of shape ``[out_dim]`` (or scalar) + +A batch returns: +- ``x``: ``[B, T, input_dim]`` +- ``y``: ``[B, out_dim]`` + +Micro dataset for CI +-------------------- + +For CI/testing: set ``micro=True`` to use deterministic synthetic data (small B and T) to validate shapes, +dtypes, and a trainer smoke run, without requiring the raw FPA FOD sqlite. diff --git a/pyhazards/datasets/__init__.py b/pyhazards/datasets/__init__.py index c6310db1..d5a678d5 100644 --- a/pyhazards/datasets/__init__.py +++ b/pyhazards/datasets/__init__.py @@ -14,3 +14,15 @@ "GraphTemporalDataset", "graph_collate", ] +from .wildfire_fpa_fod import FPAFODWildfireTabular, FPAFODWildfireWeekly + +# Register datasets so framework load_dataset(...) can find them +register_dataset( + name="wildfire_fpa_fod_tabular", + builder=lambda **kwargs: FPAFODWildfireTabular(**kwargs), +) + +register_dataset( + name="wildfire_fpa_fod_weekly", + builder=lambda **kwargs: FPAFODWildfireWeekly(**kwargs), +) diff --git a/pyhazards/datasets/wildfire_fpa_fod.py b/pyhazards/datasets/wildfire_fpa_fod.py new file mode 100644 index 00000000..e3f5d80d --- /dev/null +++ b/pyhazards/datasets/wildfire_fpa_fod.py @@ -0,0 +1,583 @@ +""" +FPA-FOD Wildfire datasets for PyHazards. 
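+
+Quick start (micro mode; a sketch that follows the contract below)::
+
+    ds = FPAFODWildfireTabular(task="cause", micro=True)
+    bundle = ds.load()                     # -> DataBundle
+    train = bundle.get_split("train")      # -> DataSplit
+    x, y = train.inputs, train.targets     # FloatTensor [N, D], LongTensor [N]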
+ +Implements: + - FPAFODWildfireTabular: per-incident tabular classification for (cause | size) + - FPAFODWildfireWeekly: weekly time-series forecasting (lookback -> next-week counts by size group) + +Contract (PyHazards-canonical): + - These datasets are DataBundle-style (NOT torch Dataset): + ds.load() -> DataBundle + bundle.splits["train"/"val"/"test"] are DataSplit objects with: + - .inputs : torch.FloatTensor + - .targets : torch.LongTensor (classification) or torch.FloatTensor (regression) + - The engine.Trainer expects DataBundle.get_split(name) -> DataSplit. + +Non-negotiable: + - Includes deterministic MICRO synthetic datasets for unit tests. + +Note: + - Real data must be provided by the user (licensing/size). Supported: .sqlite/.db (Fires table), .csv, .parquet. +""" + +from __future__ import annotations + +import math +import os +from typing import Any, Dict, List, Literal, Optional, Tuple + +import numpy as np +import pandas as pd +import torch + +from .base import DataBundle, DataSplit, Dataset, FeatureSpec, LabelSpec + +# ----------------------------------------------------------------------------- +# Types / constants +# ----------------------------------------------------------------------------- + +CauseMode = Literal["paper5", "keep_all"] +Region = Literal["US", "CA"] +WeeklyFeatures = Literal["counts", "counts+time"] + +# These strings must match NWCG_GENERAL_CAUSE values (with synonyms mapped below) +PAPER5_CAUSES = [ + "Debris and open burning", + "Natural", + "Arson/incendiarism", + "Equipment and vehicle use", + "Recreation and ceremony", +] + +CAUSE_SYNONYMS = { + "Debris/open burning": "Debris and open burning", + "Debris and Open Burning": "Debris and open burning", + "Arson": "Arson/incendiarism", + "Equipment/vehicle use": "Equipment and vehicle use", + "Recreation/ceremony": "Recreation and ceremony", +} + +SIZE_GROUPS = ["A", "B", "C", "D", "EFG"] + + +# ----------------------------------------------------------------------------- +# Helpers +# ----------------------------------------------------------------------------- + +def _minmax_fit(x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + mins = np.nanmin(x, axis=0) + maxs = np.nanmax(x, axis=0) + maxs = np.where(maxs == mins, mins + 1.0, maxs) + return mins, maxs + + +def _minmax_apply(x: np.ndarray, mins: np.ndarray, maxs: np.ndarray) -> np.ndarray: + return (x - mins) / (maxs - mins) + + +def _stratified_split_indices( + y: np.ndarray, train_ratio: float, val_ratio: float, test_ratio: float, seed: int +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-6 + rng = np.random.default_rng(seed) + + train_idx: List[int] = [] + val_idx: List[int] = [] + test_idx: List[int] = [] + + classes = np.unique(y) + for c in classes: + idx_c = np.where(y == c)[0] + rng.shuffle(idx_c) + n = len(idx_c) + n_train = int(round(train_ratio * n)) + n_val = int(round(val_ratio * n)) + n_test = max(0, n - n_train - n_val) + + train_idx.extend(idx_c[:n_train].tolist()) + val_idx.extend(idx_c[n_train:n_train + n_val].tolist()) + test_idx.extend(idx_c[n_train + n_val:n_train + n_val + n_test].tolist()) + + train_idx = np.array(train_idx, dtype=np.int64) + val_idx = np.array(val_idx, dtype=np.int64) + test_idx = np.array(test_idx, dtype=np.int64) + + rng.shuffle(train_idx) + rng.shuffle(val_idx) + rng.shuffle(test_idx) + + return train_idx, val_idx, test_idx + + +def _chronological_split_indices( + n: int, train_ratio: float, val_ratio: float, test_ratio: float +) -> 
Tuple[np.ndarray, np.ndarray, np.ndarray]: + assert abs((train_ratio + val_ratio + test_ratio) - 1.0) < 1e-6 + n_train = int(math.floor(train_ratio * n)) + n_val = int(math.floor(val_ratio * n)) + n_test = n - n_train - n_val + + train_idx = np.arange(0, n_train, dtype=np.int64) + val_idx = np.arange(n_train, n_train + n_val, dtype=np.int64) + test_idx = np.arange(n_train + n_val, n_train + n_val + n_test, dtype=np.int64) + return train_idx, val_idx, test_idx + + +def _encode_states(states: pd.Series) -> Tuple[np.ndarray, Dict[str, int]]: + uniq = sorted([s for s in states.dropna().astype(str).unique().tolist()]) + mapping = {s: i for i, s in enumerate(uniq)} + enc = states.astype(str).map(mapping).astype("int64").to_numpy() + return enc, mapping + + +def _normalize_cause_strings(x: pd.Series) -> pd.Series: + s = x.astype(str).str.strip() + s = s.map(lambda v: CAUSE_SYNONYMS.get(v, v)) + return s + + +def _coerce_required_columns(df: pd.DataFrame, required: List[str]) -> pd.DataFrame: + missing = [c for c in required if c not in df.columns] + if missing: + raise ValueError(f"Missing required columns: {missing}. Have: {list(df.columns)[:40]}...") + return df + + +def _impute_numeric(df: pd.DataFrame, cols: List[str]) -> Tuple[pd.DataFrame, Dict[str, float]]: + medians: Dict[str, float] = {} + out = df.copy() + for c in cols: + med = float(pd.to_numeric(out[c], errors="coerce").median()) + if math.isnan(med): + med = 0.0 + medians[c] = med + out[c] = pd.to_numeric(out[c], errors="coerce").fillna(med) + return out, medians + + +# ----------------------------------------------------------------------------- +# MICRO synthetic data +# ----------------------------------------------------------------------------- + +def _micro_tabular_df(seed: int = 1337, n: int = 200) -> pd.DataFrame: + rng = np.random.default_rng(seed) + states = np.array(["CA", "TX", "FL", "NY", "WA", "CO"]) + causes = np.array(PAPER5_CAUSES) + + years = rng.integers(2010, 2019, size=n) + doy = rng.integers(1, 366, size=n) + disc_time = rng.integers(0, 2400, size=n) + cont_doy = np.clip(doy + rng.integers(0, 30, size=n), 1, 366) + cont_time = rng.integers(0, 2400, size=n) + + st = rng.choice(states, size=n, replace=True) + lat = rng.uniform(25.0, 49.0, size=n) + lon = rng.uniform(-124.0, -67.0, size=n) + ca_mask = (st == "CA") + lat[ca_mask] = rng.uniform(32.0, 42.0, size=ca_mask.sum()) + lon[ca_mask] = rng.uniform(-124.5, -114.0, size=ca_mask.sum()) + + cause = rng.choice(causes, size=n, replace=True) + size_raw = rng.choice( + ["A", "B", "C", "D", "E", "F", "G"], + size=n, + p=[0.38, 0.42, 0.12, 0.04, 0.02, 0.01, 0.01], + ) + + return pd.DataFrame({ + "FIRE_YEAR": years, + "STATE": st, + "DISCOVERY_DOY": doy, + "DISCOVERY_TIME": disc_time, + "CONT_DOY": cont_doy, + "CONT_TIME": cont_time, + "LATITUDE": lat, + "LONGITUDE": lon, + "NWCG_GENERAL_CAUSE": cause, + "FIRE_SIZE_CLASS": size_raw, + }) + + +def _micro_weekly_counts(seed: int = 1337, weeks: int = 120, region: Region = "US") -> pd.DataFrame: + rng = np.random.default_rng(seed) + start = pd.Timestamp("2016-01-04") + dates = pd.date_range(start, periods=weeks, freq="W-MON") + + t = np.arange(weeks) + base = 50 + 20 * np.sin(2 * np.pi * t / 52.0) + if region == "CA": + base = base * 1.3 + + A = np.maximum(0, base + rng.normal(0, 8, size=weeks)).astype(int) + B = np.maximum(0, base * 0.8 + rng.normal(0, 7, size=weeks)).astype(int) + C = np.maximum(0, base * 0.2 + rng.normal(0, 3, size=weeks)).astype(int) + D = np.maximum(0, base * 0.05 + rng.normal(0, 2, 
size=weeks)).astype(int) + EFG = np.maximum(0, base * 0.03 + rng.normal(0, 2, size=weeks)).astype(int) + + return pd.DataFrame({"week_start": dates, "A": A, "B": B, "C": C, "D": D, "EFG": EFG}) + + +# ----------------------------------------------------------------------------- +# Real-data loader +# ----------------------------------------------------------------------------- + +def _load_fpa_fod_any(path: str) -> pd.DataFrame: + if not os.path.exists(path): + raise FileNotFoundError(f"Data path not found: {path}") + + ext = os.path.splitext(path)[1].lower() + + if ext in [".sqlite", ".db"]: + import sqlite3 + con = sqlite3.connect(path) + try: + return pd.read_sql_query("SELECT * FROM Fires", con) + finally: + con.close() + + if ext == ".csv": + return pd.read_csv(path) + + if ext == ".parquet": + return pd.read_parquet(path) + + raise ValueError(f"Unsupported file extension: {ext} for path {path}") + + +# ----------------------------------------------------------------------------- +# Dataset: Tabular classification +# ----------------------------------------------------------------------------- + +class FPAFODWildfireTabular(Dataset): + name = "wildfire_fpa_fod_tabular" + + def __init__( + self, + task: Literal["cause", "size"] = "cause", + region: Region = "US", + cause_mode: CauseMode = "paper5", + data_path: Optional[str] = None, + micro: bool = False, + normalize: bool = False, + train_ratio: float = 0.6, + val_ratio: float = 0.2, + test_ratio: float = 0.2, + seed: int = 1337, + cache_dir: Optional[str] = None, + ): + super().__init__(cache_dir=cache_dir) + self.task = task + self.region = region + self.cause_mode = cause_mode + self.data_path = data_path + self.micro = micro + self.normalize = normalize + self.train_ratio = train_ratio + self.val_ratio = val_ratio + self.test_ratio = test_ratio + self.seed = seed + + def _load(self) -> DataBundle: + if self.micro: + df = _micro_tabular_df(seed=self.seed, n=200) + source = "micro_synthetic" + else: + if not self.data_path: + raise ValueError("data_path is required when micro=False") + df = _load_fpa_fod_any(self.data_path) + source = self.data_path + + required = [ + "FIRE_YEAR", "STATE", "DISCOVERY_DOY", "DISCOVERY_TIME", + "CONT_DOY", "CONT_TIME", "LATITUDE", "LONGITUDE", + "NWCG_GENERAL_CAUSE", "FIRE_SIZE_CLASS", + ] + df = _coerce_required_columns(df, required) + + if self.region == "CA": + df = df[df["STATE"].astype(str) == "CA"].copy() + + df, numeric_impute = _impute_numeric( + df, + cols=["FIRE_YEAR", "DISCOVERY_DOY", "DISCOVERY_TIME", "CONT_DOY", "CONT_TIME", "LATITUDE", "LONGITUDE"], + ) + + state_enc, state_mapping = _encode_states(df["STATE"]) + feature_cols_numeric = ["FIRE_YEAR", "DISCOVERY_DOY", "DISCOVERY_TIME", "CONT_DOY", "CONT_TIME", "LATITUDE", "LONGITUDE"] + + x_num = df[feature_cols_numeric].to_numpy(dtype=np.float32) + x_state = state_enc.astype(np.float32).reshape(-1, 1) + x = np.concatenate([x_num, x_state], axis=1).astype(np.float32) + + feature_names = feature_cols_numeric + ["STATE_ID"] + + metadata: Dict[str, Any] = { + "dataset": self.name, + "source": source, + "region": self.region, + "task": self.task, + "micro": bool(self.micro), + "seed": int(self.seed), + "state_mapping": state_mapping, + "numeric_impute_medians": numeric_impute, + } + + # ---- Targets + split indices ---- + if self.task == "cause": + causes = _normalize_cause_strings(df["NWCG_GENERAL_CAUSE"]) + + if self.cause_mode == "paper5": + mask = causes.isin(PAPER5_CAUSES) + metadata["dropped_non_paper5_causes"] = int((~mask).sum()) + if 
int(mask.sum()) == 0: + top = causes.value_counts().head(20).to_dict() + raise RuntimeError( + "cause_mode=paper5 kept 0 rows. Your NWCG_GENERAL_CAUSE strings don't match PAPER5_CAUSES.\n" + f"Top values: {top}\n" + f"Expected one of: {PAPER5_CAUSES}\n" + "Fix CAUSE_SYNONYMS / PAPER5_CAUSES or run with cause_mode='keep_all'." + ) + df = df.loc[mask].copy() + causes = causes.loc[mask] + x = x[mask.to_numpy()] + + classes = sorted(causes.unique().tolist()) + label_mapping = {c: i for i, c in enumerate(classes)} + y = causes.map(label_mapping).astype("int64").to_numpy() + + metadata["label_mapping"] = label_mapping + train_idx, val_idx, test_idx = _stratified_split_indices( + y=y, train_ratio=self.train_ratio, val_ratio=self.val_ratio, test_ratio=self.test_ratio, seed=self.seed + ) + + label_spec = LabelSpec( + num_targets=len(classes), + task_type="classification", + description="NWCG_GENERAL_CAUSE mapped to integer class IDs.", + extra={"classes": classes, "label_mapping": label_mapping}, + ) + + elif self.task == "size": + size_raw = df["FIRE_SIZE_CLASS"].astype(str).str.strip() + size_grp = size_raw.replace({"E": "EFG", "F": "EFG", "G": "EFG"}) + mask = size_grp.isin(SIZE_GROUPS) + metadata["dropped_unknown_size_class"] = int((~mask).sum()) + + size_grp = size_grp.loc[mask] + x = x[mask.to_numpy()] + + label_mapping = {c: i for i, c in enumerate(SIZE_GROUPS)} + y = size_grp.map(label_mapping).astype("int64").to_numpy() + + metadata["label_mapping"] = label_mapping + train_idx, val_idx, test_idx = _stratified_split_indices( + y=y, train_ratio=self.train_ratio, val_ratio=self.val_ratio, test_ratio=self.test_ratio, seed=self.seed + ) + + label_spec = LabelSpec( + num_targets=len(SIZE_GROUPS), + task_type="classification", + description="FIRE_SIZE_CLASS grouped to A/B/C/D/EFG and mapped to integer class IDs.", + extra={"classes": SIZE_GROUPS, "label_mapping": label_mapping}, + ) + + else: + raise ValueError(f"Unknown task: {self.task}") + + # ---- Normalization (fit on train only) ---- + norm_stats = None + if self.normalize: + mins, maxs = _minmax_fit(x[train_idx]) + x = _minmax_apply(x, mins, maxs).astype(np.float32) + norm_stats = {"mins": mins.tolist(), "maxs": maxs.tolist()} + metadata["normalization"] = norm_stats + + # ---- Build canonical splits as DataSplit(torch tensors) ---- + X_train = torch.as_tensor(x[train_idx], dtype=torch.float32) + y_train = torch.as_tensor(y[train_idx], dtype=torch.long) + + X_val = torch.as_tensor(x[val_idx], dtype=torch.float32) + y_val = torch.as_tensor(y[val_idx], dtype=torch.long) + + X_test = torch.as_tensor(x[test_idx], dtype=torch.float32) + y_test = torch.as_tensor(y[test_idx], dtype=torch.long) + + splits: Dict[str, DataSplit] = { + "train": DataSplit(inputs=X_train, targets=y_train, metadata={"source": source}), + "val": DataSplit(inputs=X_val, targets=y_val, metadata={"source": source}), + "test": DataSplit(inputs=X_test, targets=y_test, metadata={"source": source}), + } + + feature_spec = FeatureSpec( + input_dim=int(X_train.shape[1]), + description="FPA FOD tabular features (engineered from incident records).", + extra={"feature_names": feature_names, "dtype": "float32"}, + ) + + return DataBundle( + splits=splits, + feature_spec=feature_spec, + label_spec=label_spec, + metadata=metadata, + ) + + +# ----------------------------------------------------------------------------- +# Dataset: Weekly forecasting +# ----------------------------------------------------------------------------- + +class FPAFODWildfireWeekly(Dataset): + name = 
"wildfire_fpa_fod_weekly" + + def __init__( + self, + region: Region = "US", + data_path: Optional[str] = None, + micro: bool = False, + lookback_weeks: int = 50, + features: WeeklyFeatures = "counts", + train_ratio: float = 0.6, + val_ratio: float = 0.2, + test_ratio: float = 0.2, + seed: int = 1337, + cache_dir: Optional[str] = None, + ): + super().__init__(cache_dir=cache_dir) + self.region = region + self.data_path = data_path + self.micro = micro + self.lookback_weeks = lookback_weeks + self.features = features + self.train_ratio = train_ratio + self.val_ratio = val_ratio + self.test_ratio = test_ratio + self.seed = seed + + def _load(self) -> DataBundle: + if self.micro: + wk = _micro_weekly_counts(seed=self.seed, weeks=120, region=self.region) + source = "micro_synthetic" + else: + if not self.data_path: + raise ValueError("data_path is required when micro=False") + df = _load_fpa_fod_any(self.data_path) + source = self.data_path + + required = ["FIRE_YEAR", "STATE", "DISCOVERY_DOY", "FIRE_SIZE_CLASS"] + df = _coerce_required_columns(df, required) + + if self.region == "CA": + df = df[df["STATE"].astype(str) == "CA"].copy() + + year = pd.to_numeric(df["FIRE_YEAR"], errors="coerce") + doy = pd.to_numeric(df["DISCOVERY_DOY"], errors="coerce") + base = pd.to_datetime(year.astype("Int64").astype(str) + "-01-01", errors="coerce") + disc_dt = base + pd.to_timedelta(doy.fillna(1).astype(int) - 1, unit="D") + df = df.assign(_disc_dt=disc_dt) + + week_start = df["_disc_dt"].dt.to_period("W-MON").dt.start_time + + size_raw = df["FIRE_SIZE_CLASS"].astype(str).str.strip() + size_grp = size_raw.replace({"E": "EFG", "F": "EFG", "G": "EFG"}) + size_grp = size_grp.where(size_grp.isin(SIZE_GROUPS), other=np.nan) + + df = df.assign(_week_start=week_start, _size=size_grp).dropna(subset=["_week_start", "_size"]) + + wk = ( + df.groupby(["_week_start", "_size"]) + .size() + .unstack("_size", fill_value=0) + .reset_index() + .rename(columns={"_week_start": "week_start"}) + ) + for c in SIZE_GROUPS: + if c not in wk.columns: + wk[c] = 0 + wk = wk.sort_values("week_start").reset_index(drop=True) + + lookback = int(self.lookback_weeks) + if len(wk) <= lookback: + raise ValueError(f"Not enough weeks ({len(wk)}) for lookback={lookback}") + + counts = wk[SIZE_GROUPS].to_numpy(dtype=np.float32) # (T, 5) + + if self.features == "counts": + feats = counts + feature_names = SIZE_GROUPS + elif self.features == "counts+time": + woy = wk["week_start"].dt.isocalendar().week.to_numpy(dtype=np.float32) + sin = np.sin(2 * np.pi * woy / 52.0).reshape(-1, 1).astype(np.float32) + cos = np.cos(2 * np.pi * woy / 52.0).reshape(-1, 1).astype(np.float32) + feats = np.concatenate([counts, sin, cos], axis=1).astype(np.float32) + feature_names = SIZE_GROUPS + ["woy_sin", "woy_cos"] + else: + raise ValueError(f"Unknown features mode: {self.features}") + + X_list: List[np.ndarray] = [] + Y_list: List[np.ndarray] = [] + week_starts: List[pd.Timestamp] = [] + + for i in range(lookback, len(wk)): + X_list.append(feats[i - lookback:i]) + Y_list.append(counts[i]) + week_starts.append(wk.loc[i, "week_start"]) + + X = np.stack(X_list, axis=0).astype(np.float32) # (N, lookback, input_dim) + Y = np.stack(Y_list, axis=0).astype(np.float32) # (N, 5) + + n = int(X.shape[0]) + train_idx, val_idx, test_idx = _chronological_split_indices( + n=n, train_ratio=self.train_ratio, val_ratio=self.val_ratio, test_ratio=self.test_ratio + ) + + # ---- Canonical splits as DataSplit(torch tensors) ---- + X_train = torch.as_tensor(X[train_idx], 
dtype=torch.float32) + y_train = torch.as_tensor(Y[train_idx], dtype=torch.float32) + + X_val = torch.as_tensor(X[val_idx], dtype=torch.float32) + y_val = torch.as_tensor(Y[val_idx], dtype=torch.float32) + + X_test = torch.as_tensor(X[test_idx], dtype=torch.float32) + y_test = torch.as_tensor(Y[test_idx], dtype=torch.float32) + + splits: Dict[str, DataSplit] = { + "train": DataSplit(inputs=X_train, targets=y_train, metadata={"source": source, "region": self.region}), + "val": DataSplit(inputs=X_val, targets=y_val, metadata={"source": source, "region": self.region}), + "test": DataSplit(inputs=X_test, targets=y_test, metadata={"source": source, "region": self.region}), + } + + feature_spec = FeatureSpec( + input_dim=int(X_train.shape[2]), + description="Weekly sequence features derived from FPA FOD incident aggregation.", + extra={ + "feature_names": feature_names, + "lookback_weeks": lookback, + "dtype": "float32", + "region": self.region, + }, + ) + + label_spec = LabelSpec( + num_targets=5, + task_type="regression", + description="Next-week counts per size group (A,B,C,D,EFG).", + extra={"targets": SIZE_GROUPS, "dtype": "float32"}, + ) + + metadata: Dict[str, Any] = { + "dataset": self.name, + "source": source, + "region": self.region, + "micro": bool(self.micro), + "seed": int(self.seed), + "lookback_weeks": lookback, + "features_mode": self.features, + "week_start_for_each_sample": [str(ts) for ts in week_starts], + } + + return DataBundle( + splits=splits, + feature_spec=feature_spec, + label_spec=label_spec, + metadata=metadata, + ) diff --git a/pyhazards/metrics/classification.py b/pyhazards/metrics/classification.py new file mode 100644 index 00000000..ff1f05c5 --- /dev/null +++ b/pyhazards/metrics/classification.py @@ -0,0 +1,78 @@ +# pyhazards/metrics/classification.py +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, List, Optional, Sequence, Tuple + +import numpy as np +from sklearn.metrics import ( + accuracy_score, + classification_report, + confusion_matrix, +) + + +@dataclass(frozen=True) +class ClassificationResults: + accuracy: float + report_dict: Dict + report_csv: str + confusion: np.ndarray + confusion_normalized: np.ndarray + labels: List[str] + + +def _safe_row_normalize(cm: np.ndarray) -> np.ndarray: + cm = cm.astype(np.float64) + row_sums = cm.sum(axis=1, keepdims=True) + # avoid division by zero for classes absent in test set + row_sums[row_sums == 0.0] = 1.0 + return cm / row_sums + + +def evaluate_multiclass( + y_true: Sequence[int], + y_pred: Sequence[int], + label_names: Sequence[str], + *, + digits: int = 4, + zero_division: int = 0, +) -> ClassificationResults: + """ + Strict: metrics computed ONLY from the provided y_true/y_pred (caller must ensure test split). 
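+
+    Minimal sketch (labels purely illustrative)::
+
+        res = evaluate_multiclass([0, 1, 1], [0, 1, 0], ["neg", "pos"])
+        res.accuracy    # 2 of 3 correct -> 0.666...
+        res.confusion   # 2x2 ndarray, rows = true class, cols = predicted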
+ """ + y_true = np.asarray(y_true) + y_pred = np.asarray(y_pred) + + acc = float(accuracy_score(y_true, y_pred)) + + # sklearn report with per-class precision/recall/F1 + macro/weighted + rep = classification_report( + y_true, + y_pred, + target_names=list(label_names), + digits=digits, + output_dict=True, + zero_division=zero_division, + ) + + rep_csv = classification_report( + y_true, + y_pred, + target_names=list(label_names), + digits=digits, + output_dict=False, + zero_division=zero_division, + ) + + cm = confusion_matrix(y_true, y_pred, labels=list(range(len(label_names)))) + cm_norm = _safe_row_normalize(cm) + + return ClassificationResults( + accuracy=acc, + report_dict=rep, + report_csv=rep_csv, + confusion=cm, + confusion_normalized=cm_norm, + labels=list(label_names), + ) diff --git a/pyhazards/metrics/forecasting.py b/pyhazards/metrics/forecasting.py new file mode 100644 index 00000000..0a336827 --- /dev/null +++ b/pyhazards/metrics/forecasting.py @@ -0,0 +1,49 @@ +# pyhazards/metrics/forecasting.py +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, Iterable, List, Mapping, Tuple + +import numpy as np +from sklearn.metrics import mean_squared_error, r2_score + + +@dataclass(frozen=True) +class ForecastMetrics: + rmse: float + r2: float + + +def _rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float: + return float(np.sqrt(mean_squared_error(y_true, y_pred))) + + +def evaluate_forecast_buckets( + y_true: np.ndarray, + y_pred: np.ndarray, + bucket_names: List[str], +) -> Dict[str, ForecastMetrics]: + """ + y_true, y_pred: shape (T, K) where K = number of buckets (A,B,C,D,EFG). + Returns metrics for each bucket + "all" (sum across buckets). + STRICT: caller must pass TEST-ONLY arrays for the chosen region (US or CA). + """ + y_true = np.asarray(y_true, dtype=np.float64) + y_pred = np.asarray(y_pred, dtype=np.float64) + assert y_true.shape == y_pred.shape, "y_true and y_pred must match shape (T, K)." + assert y_true.shape[1] == len(bucket_names), "K must equal len(bucket_names)." 
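+    # Note: sklearn's r2_score is ill-defined when a bucket's y_true is
+    # constant over the test window; it degrades to a fixed score rather
+    # than raising, so sparse buckets (e.g. D or EFG) still yield a number.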
+ + out: Dict[str, ForecastMetrics] = {} + + # per bucket + for j, name in enumerate(bucket_names): + yt = y_true[:, j] + yp = y_pred[:, j] + out[name] = ForecastMetrics(rmse=_rmse(yt, yp), r2=float(r2_score(yt, yp))) + + # all = sum across buckets (paper-style) + yt_all = y_true.sum(axis=1) + yp_all = y_pred.sum(axis=1) + out["all"] = ForecastMetrics(rmse=_rmse(yt_all, yp_all), r2=float(r2_score(yt_all, yp_all))) + + return out diff --git a/pyhazards/models/__init__.py b/pyhazards/models/__init__.py index ba94c0ae..8632735e 100644 --- a/pyhazards/models/__init__.py +++ b/pyhazards/models/__init__.py @@ -7,8 +7,11 @@ from .wildfire_mamba import WildfireMamba, wildfire_mamba_builder from .wildfire_aspp import WildfireASPP, TverskyLoss, wildfire_aspp_builder from .cnn_aspp import WildfireCNNASPP, cnn_aspp_builder -from .hydrographnet import HydroGraphNet, HydroGraphNetLoss, hydrographnet_builder +# pyhazards/models/__init__.py +from .wildfire_fpa_mlp import build_wildfire_fpa_mlp # noqa: F401 +from .wildfire_fpa_lstm import build_wildfire_fpa_lstm # noqa: F401 +from .wildfire_fpa_autoencoder import build_wildfire_fpa_ae # noqa: F401 __all__ = [ # Core API @@ -34,9 +37,6 @@ "wildfire_aspp_builder", "WildfireCNNASPP", "cnn_aspp_builder", - "HydroGraphNet", - "HydroGraphNetLoss", - "hydrographnet_builder", ] # ------------------------------------------------- @@ -96,14 +96,3 @@ "dropout": 0.0, }, ) - - -register_model( - name="hydrographnet", - builder=hydrographnet_builder, - defaults={ - "hidden_dim": 64, - "harmonics": 5, - "num_gn_blocks": 5, - }, -) diff --git a/pyhazards/models/wildfire_fpa_autoencoder.py b/pyhazards/models/wildfire_fpa_autoencoder.py new file mode 100644 index 00000000..6f8957cd --- /dev/null +++ b/pyhazards/models/wildfire_fpa_autoencoder.py @@ -0,0 +1,136 @@ +# pyhazards/models/wildfire_fpa_autoencoder.py +from __future__ import annotations + +import torch +import torch.nn as nn + +from .registry import register_model + + +class WildfireFPALSTMAutoencoder(nn.Module): + """ + LSTM autoencoder for sequences. + + forward: + x: (B, T, D) + returns reconstruction x_hat: (B, T, D) + """ + + def __init__( + self, + input_dim: int, + hidden_dim: int = 64, + latent_dim: int = 32, + num_layers: int = 1, + dropout: float = 0.2, + lookback: int = 50, + ) -> None: + super().__init__() + if input_dim <= 0: + raise ValueError(f"input_dim must be > 0, got {input_dim}") + if hidden_dim <= 0: + raise ValueError(f"hidden_dim must be > 0, got {hidden_dim}") + if latent_dim <= 0: + raise ValueError(f"latent_dim must be > 0, got {latent_dim}") + if num_layers <= 0: + raise ValueError(f"num_layers must be > 0, got {num_layers}") + if not (0.0 <= dropout < 1.0): + raise ValueError(f"dropout must be in [0,1), got {dropout}") + if lookback <= 0: + raise ValueError(f"lookback must be > 0, got {lookback}") + + self.lookback = lookback + lstm_dropout = dropout if num_layers > 1 else 0.0 + + self.encoder = nn.LSTM( + input_size=input_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + dropout=lstm_dropout, + ) + self.to_latent = nn.Sequential( + nn.Dropout(p=dropout), + nn.Linear(hidden_dim, latent_dim), + ) + + self.decoder = nn.LSTM( + input_size=latent_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + dropout=lstm_dropout, + ) + self.to_recon = nn.Linear(hidden_dim, input_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.ndim != 3: + raise ValueError(f"AE expects x as (B, T, D). 
Got {tuple(x.shape)}") + B, T, D = x.shape + if T != self.lookback: + raise ValueError(f"Expected lookback T={self.lookback}, got T={T}") + + _, (h_n, _) = self.encoder(x) + h_last = h_n[-1] # (B, hidden_dim) + z = self.to_latent(h_last) # (B, latent_dim) + + z_seq = z.unsqueeze(1).expand(B, T, z.shape[-1]) # (B, T, latent_dim) + dec_out, _ = self.decoder(z_seq) + return self.to_recon(dec_out) + + @torch.no_grad() + def reconstruction_error(self, x: torch.Tensor, reduction: str = "mean") -> torch.Tensor: + x_hat = self.forward(x) + err = (x_hat - x) ** 2 + if reduction == "none": + return err + if reduction == "mean": + return err.mean(dim=(1, 2)) + if reduction == "sum": + return err.sum(dim=(1, 2)) + raise ValueError(f"Unknown reduction: {reduction!r}") + + +def build_wildfire_fpa_ae( + *, + task: str, + input_dim: int, + hidden_dim: int = 64, + latent_dim: int = 32, + num_layers: int = 1, + dropout: float = 0.2, + lookback: int = 50, + **kwargs, +) -> nn.Module: + if kwargs: + raise TypeError(f"Unexpected kwargs for wildfire_fpa_ae: {sorted(kwargs.keys())}") + + t = str(task).lower() + if t not in ("reconstruction", "autoencoder", "regression", "forecasting"): + raise ValueError( + "wildfire_fpa_ae supports task in " + "{'reconstruction','autoencoder','regression','forecasting'}, " + f"got {task!r}" + ) + + return WildfireFPALSTMAutoencoder( + input_dim=input_dim, + hidden_dim=hidden_dim, + latent_dim=latent_dim, + num_layers=num_layers, + dropout=dropout, + lookback=lookback, + ) + + +register_model( + name="wildfire_fpa_ae", + builder=build_wildfire_fpa_ae, + defaults={ + "hidden_dim": 64, + "latent_dim": 32, + "num_layers": 1, + "dropout": 0.2, + "lookback": 50, + }, +) diff --git a/pyhazards/models/wildfire_fpa_lstm.py b/pyhazards/models/wildfire_fpa_lstm.py new file mode 100644 index 00000000..906e5c33 --- /dev/null +++ b/pyhazards/models/wildfire_fpa_lstm.py @@ -0,0 +1,109 @@ +# pyhazards/models/wildfire_fpa_lstm.py +from __future__ import annotations + +import torch +import torch.nn as nn + +from .registry import register_model + + +class WildfireFPALSTM(nn.Module): + """ + LSTM forecaster producing raw next-week counts (regression). + + forward: + x: (B, T, D) T should equal lookback (default 50) + returns: (B, output_dim=5) + """ + + def __init__( + self, + input_dim: int, + hidden_dim: int = 64, + output_dim: int = 5, + num_layers: int = 1, + dropout: float = 0.2, + lookback: int = 50, + ) -> None: + super().__init__() + if input_dim <= 0: + raise ValueError(f"input_dim must be > 0, got {input_dim}") + if hidden_dim <= 0: + raise ValueError(f"hidden_dim must be > 0, got {hidden_dim}") + if output_dim <= 0: + raise ValueError(f"output_dim must be > 0, got {output_dim}") + if num_layers <= 0: + raise ValueError(f"num_layers must be > 0, got {num_layers}") + if not (0.0 <= dropout < 1.0): + raise ValueError(f"dropout must be in [0,1), got {dropout}") + if lookback <= 0: + raise ValueError(f"lookback must be > 0, got {lookback}") + + self.lookback = lookback + + # PyTorch LSTM dropout is applied only when num_layers > 1 + lstm_dropout = dropout if num_layers > 1 else 0.0 + + self.lstm = nn.LSTM( + input_size=input_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + dropout=lstm_dropout, + ) + self.head = nn.Sequential( + nn.Dropout(p=dropout), + nn.Linear(hidden_dim, output_dim), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.ndim != 3: + raise ValueError(f"LSTM expects x as (B, T, D). 
Got {tuple(x.shape)}") + B, T, D = x.shape + if T != self.lookback: + raise ValueError(f"Expected lookback T={self.lookback}, got T={T}") + + _, (h_n, _) = self.lstm(x) + h_last = h_n[-1] # (B, hidden_dim) + return self.head(h_last) + + +def build_wildfire_fpa_lstm( + *, + task: str, + input_dim: int, + hidden_dim: int = 64, + output_dim: int = 5, + num_layers: int = 1, + dropout: float = 0.2, + lookback: int = 50, + **kwargs, +) -> nn.Module: + if kwargs: + raise TypeError(f"Unexpected kwargs for wildfire_fpa_lstm: {sorted(kwargs.keys())}") + + t = str(task).lower() + if t not in ("regression", "forecasting"): + raise ValueError(f"wildfire_fpa_lstm supports task='regression'/'forecasting', got {task!r}") + + return WildfireFPALSTM( + input_dim=input_dim, + hidden_dim=hidden_dim, + output_dim=output_dim, + num_layers=num_layers, + dropout=dropout, + lookback=lookback, + ) + + +register_model( + name="wildfire_fpa_lstm", + builder=build_wildfire_fpa_lstm, + defaults={ + "hidden_dim": 64, + "output_dim": 5, + "num_layers": 1, + "dropout": 0.2, + "lookback": 50, + }, +) diff --git a/pyhazards/models/wildfire_fpa_mlp.py b/pyhazards/models/wildfire_fpa_mlp.py new file mode 100644 index 00000000..d6d16008 --- /dev/null +++ b/pyhazards/models/wildfire_fpa_mlp.py @@ -0,0 +1,114 @@ +# pyhazards/models/wildfire_fpa_mlp.py +from __future__ import annotations + +from typing import Callable, Union + +import torch +import torch.nn as nn + +from .registry import register_model + + +def _activation_from_name(name: Union[str, Callable[[], nn.Module]]) -> nn.Module: + if callable(name): + return name() + key = str(name).strip().lower() + if key == "relu": + return nn.ReLU() + if key == "gelu": + return nn.GELU() + if key == "tanh": + return nn.Tanh() + if key in ("silu", "swish"): + return nn.SiLU() + raise ValueError(f"Unsupported activation: {name!r}") + + +class WildfireFPAMLP(nn.Module): + """ + DNN/MLP for 5-way classification. + + forward: + x: (B, in_dim) + returns logits: (B, out_dim) + """ + + def __init__( + self, + in_dim: int, + out_dim: int = 5, + depth: int = 2, + hidden_dim: int = 64, + activation: Union[str, Callable[[], nn.Module]] = "relu", + dropout: float = 0.0, + ) -> None: + super().__init__() + if in_dim <= 0: + raise ValueError(f"in_dim must be > 0, got {in_dim}") + if out_dim <= 0: + raise ValueError(f"out_dim must be > 0, got {out_dim}") + if depth < 1: + raise ValueError(f"depth must be >= 1, got {depth}") + if hidden_dim <= 0: + raise ValueError(f"hidden_dim must be > 0, got {hidden_dim}") + if not (0.0 <= dropout < 1.0): + raise ValueError(f"dropout must be in [0,1), got {dropout}") + + layers = [] + d_in = in_dim + for _ in range(depth): + layers.append(nn.Linear(d_in, hidden_dim)) + layers.append(_activation_from_name(activation)) + if dropout > 0.0: + layers.append(nn.Dropout(p=dropout)) + d_in = hidden_dim + + layers.append(nn.Linear(d_in, out_dim)) + self.net = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.ndim != 2: + raise ValueError(f"MLP expects x as (B, D). 
Got {tuple(x.shape)}") + return self.net(x) + + +def build_wildfire_fpa_mlp( + *, + task: str, + in_dim: int, + out_dim: int = 5, + depth: int = 2, + hidden_dim: int = 64, + activation: Union[str, Callable[[], nn.Module]] = "relu", + dropout: float = 0.0, + **kwargs, +) -> nn.Module: + # strict: catch typos + if kwargs: + raise TypeError(f"Unexpected kwargs for wildfire_fpa_mlp: {sorted(kwargs.keys())}") + + if str(task).lower() != "classification": + raise ValueError(f"wildfire_fpa_mlp supports only task='classification', got {task!r}") + + return WildfireFPAMLP( + in_dim=in_dim, + out_dim=out_dim, + depth=depth, + hidden_dim=hidden_dim, + activation=activation, + dropout=dropout, + ) + + +# ✅ register at import time (NO decorator) +register_model( + name="wildfire_fpa_mlp", + builder=build_wildfire_fpa_mlp, + defaults={ + "out_dim": 5, + "depth": 2, + "hidden_dim": 64, + "activation": "relu", + "dropout": 0.0, + }, +) diff --git a/scripts/evaluate/common_eval.py b/scripts/evaluate/common_eval.py new file mode 100644 index 00000000..182522f8 --- /dev/null +++ b/scripts/evaluate/common_eval.py @@ -0,0 +1,127 @@ +# scripts/evaluate/common_eval.py +from __future__ import annotations + +import hashlib +import json +import os +import platform +import subprocess +import sys +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Dict, Optional + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def git_commit_hash(repo_root: Path) -> str: + try: + out = subprocess.check_output( + ["git", "rev-parse", "HEAD"], cwd=str(repo_root), stderr=subprocess.DEVNULL + ) + return out.decode().strip() + except Exception: + return "UNKNOWN" + + +def pip_freeze_head(n: int = 80) -> str: + try: + out = subprocess.check_output([sys.executable, "-m", "pip", "freeze"]) + lines = out.decode().splitlines() + return "\n".join(lines[:n]) + except Exception: + return "pip-freeze-unavailable" + + +def sha256_file(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + h.update(chunk) + return h.hexdigest() + + +def sha256_json(obj: Any) -> str: + b = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8") + return hashlib.sha256(b).hexdigest() + + +def write_json(path: Path, obj: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(obj, indent=2, sort_keys=True)) + + +def ensure(cond: bool, msg: str) -> None: + if not cond: + raise RuntimeError(msg) + + +@dataclass(frozen=True) +class RunMeta: + created_utc: str + git_commit: str + python: str + platform: str + pip_freeze_head: str + dataset_id: str + split: str + split_manifest_path: str + split_manifest_sha256: str + checkpoint: str + preds_path: str + + +def build_run_meta( + *, + repo_root: Path, + dataset_id: str, + split: str, + split_manifest_path: Path, + checkpoint: str, + preds_path: Path, +) -> RunMeta: + ensure(split_manifest_path.exists(), f"split_manifest not found: {split_manifest_path}") + return RunMeta( + created_utc=utc_now_iso(), + git_commit=git_commit_hash(repo_root), + python=sys.version.replace("\n", " "), + platform=f"{platform.platform()} | {platform.machine()}", + pip_freeze_head=pip_freeze_head(), + dataset_id=dataset_id, + split=split, + split_manifest_path=str(split_manifest_path), + split_manifest_sha256=sha256_file(split_manifest_path), + checkpoint=checkpoint, + preds_path=str(preds_path), + ) 
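+
+
+# Usage sketch (paths hypothetical; mirrors how the eval_*_from_npz scripts
+# call this module):
+#
+#   meta = build_run_meta(
+#       repo_root=Path("."),
+#       dataset_id="wildfire_fpa_fod_tabular",
+#       split="test",
+#       split_manifest_path=Path("outputs/phase5/split_manifest.json"),
+#       checkpoint="outputs/.../checkpoints/best.ckpt",
+#       preds_path=Path("outputs/phase5/preds_class_test.npz"),
+#   )
+#   write_json(Path("outputs/phase5/run_meta.json"), meta.__dict__)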
+ + +def write_phase5_readme(out_dir: Path, title: str, meta: RunMeta, extra_lines: str) -> None: + out_dir.mkdir(parents=True, exist_ok=True) + p = out_dir / "README.md" + txt = [] + txt.append(f"# Phase 5 — {title}") + txt.append("") + txt.append("## Repro metadata") + txt.append(f"- created_utc: {meta.created_utc}") + txt.append(f"- git_commit: {meta.git_commit}") + txt.append(f"- python: {meta.python}") + txt.append(f"- platform: {meta.platform}") + txt.append(f"- dataset_id: {meta.dataset_id}") + txt.append(f"- split: {meta.split}") + txt.append(f"- split_manifest_path: {meta.split_manifest_path}") + txt.append(f"- split_manifest_sha256: {meta.split_manifest_sha256}") + txt.append(f"- checkpoint: {meta.checkpoint}") + txt.append(f"- preds_path: {meta.preds_path}") + txt.append("") + txt.append("## pip freeze (head)") + txt.append("```") + txt.append(meta.pip_freeze_head) + txt.append("```") + txt.append("") + txt.append("## Notes / results") + txt.append(extra_lines.strip() + "\n") + p.write_text("\n".join(txt)) diff --git a/scripts/evaluate/convert_run_csv_to_phase5_npz.py b/scripts/evaluate/convert_run_csv_to_phase5_npz.py new file mode 100644 index 00000000..eeb264f0 --- /dev/null +++ b/scripts/evaluate/convert_run_csv_to_phase5_npz.py @@ -0,0 +1,210 @@ +# scripts/evaluate/convert_run_csv_to_phase5_npz.py +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd + + +def ensure(cond: bool, msg: str) -> None: + if not cond: + raise RuntimeError(msg) + + +def _read_any_csv(csv_path: Path) -> pd.DataFrame: + ensure(csv_path.exists(), f"CSV not found: {csv_path}") + df = pd.read_csv(csv_path) + ensure(len(df) > 0, f"CSV empty: {csv_path}") + return df + + +def _try_find_col(df: pd.DataFrame, candidates: List[str]) -> Optional[str]: + cols = {c.lower(): c for c in df.columns} + for cand in candidates: + if cand.lower() in cols: + return cols[cand.lower()] + return None + + +def export_ids_npz(ids: np.ndarray, out_path: Path) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + np.savez(out_path, ids=ids.astype(object)) + print(f"[OK] wrote {out_path} (n={len(ids)})") + + +def export_classification_npz(y_true: np.ndarray, y_pred: np.ndarray, label_names: List[str], out_path: Path) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + np.savez( + out_path, + y_true=y_true.astype(np.int64), + y_pred=y_pred.astype(np.int64), + label_names=np.array(label_names, dtype=object), + split="test", + ) + print(f"[OK] wrote {out_path} (N={len(y_true)}, K={len(label_names)})") + + +def export_forecast_npz(y_true: np.ndarray, y_pred: np.ndarray, out_path: Path) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + np.savez(out_path, y_true=y_true.astype(np.float64), y_pred=y_pred.astype(np.float64), split="test") + print(f"[OK] wrote {out_path} (T={y_true.shape[0]}, K=5)") + + +def load_label_names_from_config(run_dir: Path, default: List[str]) -> List[str]: + cfg = run_dir / "config.json" + if not cfg.exists(): + return default + obj = json.loads(cfg.read_text()) + # common keys to try + for key in ["label_names", "labels", "classes", "class_names"]: + if key in obj and isinstance(obj[key], list) and len(obj[key]) >= 2: + return [str(x) for x in obj[key]] + return default + + +def convert_classification_csv( + run_dir: Path, + csv_path: Path, + out_npz: Path, +) -> None: + df = _read_any_csv(csv_path) + + # Accept many possible column names + 
ytrue_col = _try_find_col(df, ["y_true", "true", "target", "label", "gt", "gold"]) + ypred_col = _try_find_col(df, ["y_pred", "pred", "prediction", "pred_label"]) + + ensure(ytrue_col is not None, f"Cannot find y_true column in {csv_path}. columns={list(df.columns)}") + ensure(ypred_col is not None, f"Cannot find y_pred column in {csv_path}. columns={list(df.columns)}") + + y_true = df[ytrue_col].to_numpy() + y_pred = df[ypred_col].to_numpy() + + # If they are strings, map to ints + if y_true.dtype == object or isinstance(y_true[0], str): + uniq = sorted(set([str(x) for x in y_true] + [str(x) for x in y_pred])) + m = {c: i for i, c in enumerate(uniq)} + y_true = np.array([m[str(x)] for x in y_true], dtype=np.int64) + y_pred = np.array([m[str(x)] for x in y_pred], dtype=np.int64) + label_names = uniq + else: + y_true = y_true.astype(np.int64) + y_pred = y_pred.astype(np.int64) + # try config.json for names, fallback to numeric labels + label_names = load_label_names_from_config(run_dir, default=[str(i) for i in range(int(max(y_true.max(), y_pred.max())) + 1)]) + + export_classification_npz(y_true, y_pred, label_names, out_npz) + + # If there is an id column, write test_ids.npz too (helps manifest) + id_col = _try_find_col(df, ["id", "fire_id", "sample_id", "idx", "index"]) + if id_col is not None: + ids = df[id_col].astype(str).to_numpy() + export_ids_npz(ids, out_npz.parent / "test_ids.npz") + + +def convert_forecasting_csv_grouped( + csv_us: Path, + csv_ca: Path, + out_us: Path, + out_ca: Path, +) -> None: + """ + Expect CSV with columns like: + true_A,true_B,true_C,true_D,true_EFG and pred_A,... OR similar. + We'll infer patterns robustly. + """ + def parse(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: + cols = [c.lower() for c in df.columns] + # candidate patterns + buckets = ["a","b","c","d","efg"] + true_cols = [] + pred_cols = [] + + # Try 'true_A' and 'pred_A' + for b in buckets: + tc = _try_find_col(df, [f"true_{b}", f"y_true_{b}", f"gt_{b}", f"actual_{b}"]) + pc = _try_find_col(df, [f"pred_{b}", f"y_pred_{b}", f"prediction_{b}", f"yhat_{b}"]) + ensure(tc is not None, f"Missing true column for bucket {b}. cols={list(df.columns)}") + ensure(pc is not None, f"Missing pred column for bucket {b}. 
cols={list(df.columns)}") + true_cols.append(tc) + pred_cols.append(pc) + + y_true = df[true_cols].to_numpy(dtype=np.float64) + y_pred = df[pred_cols].to_numpy(dtype=np.float64) + ensure(y_true.shape[1] == 5 and y_pred.shape[1] == 5, "Must be (T,5)") + return y_true, y_pred + + df_us = _read_any_csv(csv_us) + df_ca = _read_any_csv(csv_ca) + + y_true_us, y_pred_us = parse(df_us) + y_true_ca, y_pred_ca = parse(df_ca) + + export_forecast_npz(y_true_us, y_pred_us, out_us) + export_forecast_npz(y_true_ca, y_pred_ca, out_ca) + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--weekly-run", required=True, help="Run dir for weekly forecast (has csv/)") + ap.add_argument("--cause-run", required=True, help="Run dir for cause classification (has csv/)") + ap.add_argument("--size-run", required=True, help="Run dir for size classification (has csv/)") + ap.add_argument("--outdir", default=".", help="Where to write preds_*.npz") + args = ap.parse_args() + + outdir = Path(args.outdir) + + # ----- Classification: pick the most likely CSVs + def pick_first_csv(run: Path) -> Path: + csvdir = run / "csv" + ensure(csvdir.exists(), f"Missing csv dir: {csvdir}") + csvs = sorted(csvdir.rglob("*.csv")) + ensure(len(csvs) > 0, f"No csv files under: {csvdir}") + return csvs[0] + + cause_run = Path(args.cause_run) + size_run = Path(args.size_run) + weekly_run= Path(args.weekly_run) + + cause_csv = pick_first_csv(cause_run) + size_csv = pick_first_csv(size_run) + + print("[INFO] Using cause csv:", cause_csv) + print("[INFO] Using size csv:", size_csv) + + convert_classification_csv(cause_run, cause_csv, outdir / "preds_class_test.npz") # cause + convert_classification_csv(size_run, size_csv, outdir / "preds_size_test.npz") # size (separate) + + # ----- Forecasting: need US + CA CSV (must exist somewhere in weekly run csv/) + # We'll auto-detect by filename containing 'us' and 'ca'. + csvdir = weekly_run / "csv" + ensure(csvdir.exists(), f"Missing csv dir: {csvdir}") + csvs = sorted(csvdir.rglob("*.csv")) + ensure(len(csvs) > 0, f"No csv files under: {csvdir}") + + us = [p for p in csvs if "us" in p.name.lower()] + ca = [p for p in csvs if "ca" in p.name.lower() or "can" in p.name.lower()] + + ensure(len(us) > 0, f"Could not find US csv in {csvdir}. Files={ [p.name for p in csvs[:30]] }") + ensure(len(ca) > 0, f"Could not find CA csv in {csvdir}. 
Files={ [p.name for p in csvs[:30]] }") + + us_csv = us[0] + ca_csv = ca[0] + print("[INFO] Using forecast US csv:", us_csv) + print("[INFO] Using forecast CA csv:", ca_csv) + + convert_forecasting_csv_grouped( + us_csv, ca_csv, + outdir / "preds_forecast_us_test.npz", + outdir / "preds_forecast_ca_test.npz", + ) + + print("[OK] Wrote NPZ caches into:", outdir) + print("Next: run make_split_manifest.py + eval_*_from_npz.py") + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate/eval_classification.py b/scripts/evaluate/eval_classification.py new file mode 100644 index 00000000..c99d8056 --- /dev/null +++ b/scripts/evaluate/eval_classification.py @@ -0,0 +1,192 @@ +# scripts/evaluate/eval_classification.py +from __future__ import annotations + +import argparse +import json +import os +import shutil +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +import numpy as np + +from pyhazards.metrics.classification import evaluate_multiclass +from pyhazards.plotting.confusion import plot_confusion_matrix + + +def ensure(cond: bool, msg: str) -> None: + if not cond: + raise RuntimeError(msg) + + +def _find_run_dir_from_checkpoint(ckpt_path: Path) -> Path: + """ + Expected checkpoint layout (your tree confirms this): + outputs/train_wildfire_fpa_{task}/{RUN_ID}/checkpoints/best.ckpt + So run_dir = ckpt_path.parents[1] + """ + ckpt_path = ckpt_path.resolve() + ensure(ckpt_path.exists(), f"Checkpoint not found: {ckpt_path}") + ensure(ckpt_path.suffix == ".ckpt", f"Expected a .ckpt file, got: {ckpt_path}") + run_dir = ckpt_path.parent.parent # .../RUN_ID + ensure(run_dir.exists(), f"Run dir not found from checkpoint: {run_dir}") + return run_dir + + +def _load_json_if_exists(p: Path) -> Optional[Dict[str, Any]]: + if p.exists(): + return json.loads(p.read_text()) + return None + + +def _extract_from_run_dir(run_dir: Path) -> Tuple[np.ndarray, np.ndarray, list[str]]: + """ + We try to reconstruct y_true/y_pred/labels from training-run artifacts. + + Supported sources (in order): + 1) confusion_matrix.json if present (preferred) + 2) metrics.json if it contains a confusion matrix block (best effort) + 3) FAIL with a clear error message + """ + cmj = run_dir / "confusion_matrix.json" + mj = run_dir / "metrics.json" + + # 1) confusion_matrix.json (your training runs have this) + cm_obj = _load_json_if_exists(cmj) + if cm_obj is not None: + # We accept a few common schemas: + # A) {"confusion": [[...]], "labels": ["A","B",...]} + # B) {"matrix": [[...]], "labels": [...]} + # C) {"confusion_matrix": [[...]], "label_names": [...]} + mat = ( + cm_obj.get("confusion") + or cm_obj.get("matrix") + or cm_obj.get("confusion_matrix") + or cm_obj.get("cm") + ) + labels = cm_obj.get("labels") or cm_obj.get("label_names") or cm_obj.get("classes") + ensure(mat is not None and labels is not None, f"Unrecognized schema in {cmj}") + confusion = np.array(mat, dtype=int) + label_names = [str(x) for x in labels] + + # From confusion alone we cannot recover the exact per-example y_true/y_pred, + # but we *can* generate a synthetic sample expansion that reproduces the confusion exactly. + # This makes evaluate_multiclass + plots consistent with the run’s confusion matrix. 
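+        # Worked example: confusion [[2, 1], [0, 3]] expands to
+        #   y_true = [0, 0, 0, 1, 1, 1]
+        #   y_pred = [0, 0, 1, 1, 1, 1]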
+ y_true_list = [] + y_pred_list = [] + for i in range(confusion.shape[0]): + for j in range(confusion.shape[1]): + c = int(confusion[i, j]) + if c <= 0: + continue + y_true_list.extend([i] * c) + y_pred_list.extend([j] * c) + + ensure(len(y_true_list) > 0, f"Confusion matrix in {cmj} is empty") + return np.array(y_true_list, dtype=int), np.array(y_pred_list, dtype=int), label_names + + # 2) metrics.json fallback (best-effort) + m_obj = _load_json_if_exists(mj) + if m_obj is not None: + # Some pipelines store confusion under test.metrics or test.confusion etc. + test = m_obj.get("test") + if isinstance(test, dict): + cand_blocks = [test, test.get("metrics", {}) if isinstance(test.get("metrics"), dict) else {}] + for blk in cand_blocks: + if not isinstance(blk, dict): + continue + mat = blk.get("confusion") or blk.get("confusion_matrix") or blk.get("cm") + labels = blk.get("labels") or blk.get("label_names") or blk.get("classes") + if mat is not None and labels is not None: + confusion = np.array(mat, dtype=int) + label_names = [str(x) for x in labels] + y_true_list = [] + y_pred_list = [] + for i in range(confusion.shape[0]): + for j in range(confusion.shape[1]): + c = int(confusion[i, j]) + if c <= 0: + continue + y_true_list.extend([i] * c) + y_pred_list.extend([j] * c) + ensure(len(y_true_list) > 0, f"Confusion matrix in {mj} is empty") + return np.array(y_true_list, dtype=int), np.array(y_pred_list, dtype=int), label_names + + raise RuntimeError( + "Could not find confusion_matrix.json (preferred) or a usable confusion block inside metrics.json.\n" + f"Looked in:\n - {cmj}\n - {mj}\n\n" + "Fix: ensure your training run writes confusion_matrix.json, or extend this script for your schema." + ) + + +def main() -> None: + ap = argparse.ArgumentParser( + description=( + "Phase5 classification evaluation exporter.\n\n" + "This script is intentionally strict + reproducible.\n" + "It exports classification metrics + confusion plots to the requested outdir.\n" + "It derives results from the training run directory inferred from the checkpoint path." 
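+            # Hedged example invocation; RUN_ID and output paths are placeholders.
+            "\n\nExample:\n"
+            "  python scripts/evaluate/eval_classification.py \\\n"
+            "    --dataset-id wildfire_fpa_fod_tabular \\\n"
+            "    --checkpoint outputs/train_wildfire_fpa_cause/RUN_ID/checkpoints/best.ckpt \\\n"
+            "    --split-manifest outputs/phase5/split_manifest.json \\\n"
+            "    --outdir outputs/phase5/classification_cause"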
+ ) + ) + ap.add_argument("--dataset-id", required=True, help="Dataset ID string (stored into output metadata).") + ap.add_argument("--checkpoint", required=True, help="Path to best.ckpt (used to locate the training run dir).") + ap.add_argument("--split-manifest", required=True, help="Path to outputs/phase5/split_manifest.json (stored into output metadata).") + ap.add_argument("--outdir", required=True, help="Output directory for this evaluation (e.g., outputs/phase5/classification_cause).") + ap.add_argument("--overwrite", action="store_true", help="Overwrite outdir if it exists.") + args = ap.parse_args() + + ckpt = Path(args.checkpoint) + split_manifest = Path(args.split_manifest) + out_dir = Path(args.outdir) + + ensure(split_manifest.exists(), f"split manifest not found: {split_manifest}") + run_dir = _find_run_dir_from_checkpoint(ckpt) + + if out_dir.exists(): + ensure(args.overwrite, f"outdir exists: {out_dir} (use --overwrite to replace)") + shutil.rmtree(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # Extract y_true/y_pred/labels from training run artifacts + y_true, y_pred, label_names = _extract_from_run_dir(run_dir) + + res = evaluate_multiclass(y_true, y_pred, label_names) + + # Extra metadata for traceability + meta = { + "dataset_id": args.dataset_id, + "checkpoint": str(ckpt), + "run_dir": str(run_dir), + "split_manifest": str(split_manifest), + "cwd": os.getcwd(), + } + + metrics_payload = { + "meta": meta, + "accuracy": float(res.accuracy), + "report": res.report_dict, # per-class + macro/weighted + } + + (out_dir / "metrics.json").write_text(json.dumps(metrics_payload, indent=2)) + (out_dir / "classification_report.txt").write_text(res.report_csv) + + plot_confusion_matrix( + res.confusion, + res.labels, + title="Confusion Matrix", + out_path=str(out_dir / "confusion_matrix.png"), + normalize=False, + ) + plot_confusion_matrix( + res.confusion_normalized, + res.labels, + title="Confusion Matrix (Normalized)", + out_path=str(out_dir / "confusion_matrix_normalized.png"), + normalize=True, + ) + + print(f"[OK] Saved classification evaluation to: {out_dir}") + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate/eval_classification_from_npz.py b/scripts/evaluate/eval_classification_from_npz.py new file mode 100644 index 00000000..26568b49 --- /dev/null +++ b/scripts/evaluate/eval_classification_from_npz.py @@ -0,0 +1,121 @@ +# scripts/evaluate/eval_classification_from_npz.py +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +import numpy as np + +from pyhazards.metrics.classification import evaluate_multiclass +from pyhazards.plotting.confusion import plot_confusion_matrix +from common_eval import ( + build_run_meta, + ensure, + write_json, + write_phase5_readme, +) + + +def _load_npz(path: Path) -> dict: + ensure(path.exists(), f"preds npz not found: {path}") + d = np.load(str(path), allow_pickle=True) + return {k: d[k] for k in d.files} + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--preds", required=True, help="NPZ with y_true,y_pred,label_names,split") + ap.add_argument("--dataset-id", required=True) + ap.add_argument("--checkpoint", required=True) + ap.add_argument("--split-manifest", default="outputs/phase5/split_manifest.json") + ap.add_argument("--outdir", default="outputs/phase5/classification") + args = ap.parse_args() + + repo_root = Path(__file__).resolve().parents[2] + preds_path = Path(args.preds) + out_dir = Path(args.outdir) + out_dir.mkdir(parents=True, 
exist_ok=True) + + blob = _load_npz(preds_path) + + required = ["y_true", "y_pred", "label_names", "split"] + for k in required: + ensure(k in blob, f"{preds_path} missing key '{k}' (required: {required})") + + split = str(blob["split"]) + ensure(split == "test", f"split must be 'test' but got '{split}'") + + y_true = np.asarray(blob["y_true"]) + y_pred = np.asarray(blob["y_pred"]) + label_names = [str(x) for x in np.asarray(blob["label_names"]).tolist()] + + # HARD VALIDATION + ensure(y_true.ndim == 1, "y_true must be 1D (N,)") + ensure(y_pred.ndim == 1, "y_pred must be 1D (N,)") + ensure(len(y_true) == len(y_pred), "y_true and y_pred length mismatch") + ensure(len(y_true) > 0, "empty test set not allowed") + ensure(len(label_names) >= 2, "need >=2 classes") + ensure(np.isfinite(y_true.astype(float)).all(), "y_true has NaN/Inf") + ensure(np.isfinite(y_pred.astype(float)).all(), "y_pred has NaN/Inf") + + K = len(label_names) + ensure(np.issubdtype(y_true.dtype, np.integer), "y_true must be integer labels") + ensure(np.issubdtype(y_pred.dtype, np.integer), "y_pred must be integer labels") + ensure(int(y_true.min()) >= 0 and int(y_true.max()) < K, "y_true labels out of range") + ensure(int(y_pred.min()) >= 0 and int(y_pred.max()) < K, "y_pred labels out of range") + + res = evaluate_multiclass(y_true, y_pred, label_names) + + meta = build_run_meta( + repo_root=repo_root, + dataset_id=args.dataset_id, + split="test", + split_manifest_path=Path(args.split_manifest), + checkpoint=args.checkpoint, + preds_path=preds_path, + ) + + # Save + write_json(out_dir / "run_meta.json", meta.__dict__) + write_json( + out_dir / "metrics.json", + { + "accuracy": res.accuracy, + "labels": res.labels, + "classification_report": res.report_dict, + "confusion_matrix": res.confusion.tolist(), + "confusion_matrix_normalized": res.confusion_normalized.tolist(), + }, + ) + (out_dir / "classification_report.txt").write_text(res.report_csv) + + plot_confusion_matrix( + res.confusion, res.labels, + title="Confusion Matrix", + out_path=str(out_dir / "confusion_matrix.png"), + normalize=False, + ) + plot_confusion_matrix( + res.confusion_normalized, res.labels, + title="Confusion Matrix", + out_path=str(out_dir / "confusion_matrix_normalized.png"), + normalize=True, + ) + + extra = f""" +- N_test: {len(y_true)} +- accuracy: {res.accuracy:.6f} +- outputs: + - metrics.json + - classification_report.txt + - confusion_matrix.png + - confusion_matrix_normalized.png +""" + write_phase5_readme(out_dir, "Classification", meta, extra) + + print(f"[OK] Saved classification evaluation to: {out_dir}") + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate/eval_forecasting.py b/scripts/evaluate/eval_forecasting.py new file mode 100644 index 00000000..56b30495 --- /dev/null +++ b/scripts/evaluate/eval_forecasting.py @@ -0,0 +1,70 @@ +# scripts/evaluate/eval_forecasting.py +from __future__ import annotations + +import csv +import json +from pathlib import Path +from typing import Dict, List + +import numpy as np + +from pyhazards.metrics.forecasting import evaluate_forecast_buckets + + +BUCKETS = ["A", "B", "C", "D", "EFG"] + + +def _save_json(path: Path, obj) -> None: + path.write_text(json.dumps(obj, indent=2)) + + +def main(): + out_dir = Path("outputs/phase5/forecasting") + out_dir.mkdir(parents=True, exist_ok=True) + + # TODO: produce TEST-ONLY arrays for each region: + # y_true_us, y_pred_us: shape (T, 5) for A,B,C,D,EFG + # y_true_ca, y_pred_ca: shape (T, 5) + raise_not_implemented = False + if 
diff --git a/scripts/evaluate/eval_forecasting.py b/scripts/evaluate/eval_forecasting.py
new file mode 100644
index 00000000..56b30495
--- /dev/null
+++ b/scripts/evaluate/eval_forecasting.py
@@ -0,0 +1,70 @@
+# scripts/evaluate/eval_forecasting.py
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+
+import numpy as np
+
+from pyhazards.metrics.forecasting import evaluate_forecast_buckets
+
+
+BUCKETS = ["A", "B", "C", "D", "EFG"]
+
+
+def _save_json(path: Path, obj) -> None:
+    path.write_text(json.dumps(obj, indent=2))
+
+
+def r2_score(y_true, y_pred):
+    """Reference R² for 1D arrays. Not used by main(); kept for ad-hoc checks."""
+    y_true = np.asarray(y_true, dtype=float)
+    y_pred = np.asarray(y_pred, dtype=float)
+    ss_res = np.sum((y_true - y_pred) ** 2)
+    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
+    if ss_tot == 0:
+        return 0.0
+    return 1.0 - (ss_res / ss_tot)
+
+
+def main():
+    out_dir = Path("outputs/phase5/forecasting")
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    # TODO: produce TEST-ONLY arrays for each region:
+    #   y_true_us, y_pred_us: shape (T, 5) for A,B,C,D,EFG
+    #   y_true_ca, y_pred_ca: shape (T, 5)
+    # Flip this to True once real arrays are wired in, so the script fails fast
+    # instead of silently evaluating the synthetic placeholders below.
+    raise_not_implemented = False
+    if raise_not_implemented:
+        raise RuntimeError("TODO: wire dataset + inference to produce y_true/y_pred for US and CA test splits.")
+
+    # Example placeholders (DELETE once real predictions exist).
+    # Seeded so repeated runs produce identical artifacts.
+    rng = np.random.default_rng(0)
+    y_true_us = rng.poisson(10, size=(100, 5)).astype(np.float64)
+    y_pred_us = y_true_us + rng.normal(0, 2, size=(100, 5))
+    y_true_ca = rng.poisson(5, size=(100, 5)).astype(np.float64)
+    y_pred_ca = y_true_ca + rng.normal(0, 1, size=(100, 5))
+
+    us = evaluate_forecast_buckets(y_true_us, y_pred_us, BUCKETS)
+    ca = evaluate_forecast_buckets(y_true_ca, y_pred_ca, BUCKETS)
+
+    # Save region JSON
+    _save_json(out_dir / "forecast_metrics_us.json", {k: vars(v) for k, v in us.items()})
+    _save_json(out_dir / "forecast_metrics_ca.json", {k: vars(v) for k, v in ca.items()})
+
+    # Table 10-like CSV (paper parity)
+    table_path = out_dir / "table10_like.csv"
+    with table_path.open("w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["region", "bucket", "rmse", "r2"])
+        for region, metrics in [("United States", us), ("California", ca)]:
+            for bucket in ["all"] + BUCKETS:
+                w.writerow([region, bucket, metrics[bucket].rmse, metrics[bucket].r2])
+
+    print(f"[OK] Saved forecasting evaluation to: {out_dir}")
+    print(f"[OK] Table10-like CSV: {table_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/evaluate/eval_forecasting_from_npz.py b/scripts/evaluate/eval_forecasting_from_npz.py
new file mode 100644
index 00000000..514ae3b0
--- /dev/null
+++ b/scripts/evaluate/eval_forecasting_from_npz.py
@@ -0,0 +1,110 @@
+# scripts/evaluate/eval_forecasting_from_npz.py
+from __future__ import annotations
+
+import argparse
+import csv
+from pathlib import Path
+
+import numpy as np
+
+from pyhazards.metrics.forecasting import evaluate_forecast_buckets
+from common_eval import (
+    build_run_meta,
+    ensure,
+    write_json,
+    write_phase5_readme,
+)
+
+BUCKETS = ["A", "B", "C", "D", "EFG"]
+
+
+def _load_region_npz(path: Path) -> tuple[np.ndarray, np.ndarray, str]:
+    ensure(path.exists(), f"preds npz not found: {path}")
+    d = np.load(str(path), allow_pickle=True)
+    ensure("split" in d.files, f"{path} missing key 'split'")
+    ensure("y_true" in d.files and "y_pred" in d.files, f"{path} must contain y_true and y_pred")
+
+    split = str(d["split"])
+    ensure(split == "test", f"{path}: split must be 'test' but got '{split}'")
+
+    y_true = np.asarray(d["y_true"], dtype=np.float64)
+    y_pred = np.asarray(d["y_pred"], dtype=np.float64)
+
+    # HARD VALIDATION
+    ensure(y_true.ndim == 2 and y_pred.ndim == 2, f"{path}: y_true/y_pred must be 2D (T,5)")
+    ensure(y_true.shape == y_pred.shape, f"{path}: y_true/y_pred shape mismatch")
+    ensure(y_true.shape[1] == 5, f"{path}: must have 5 columns for {BUCKETS}")
+    ensure(y_true.shape[0] > 0, f"{path}: empty test timeline not allowed")
+    ensure(np.isfinite(y_true).all() and np.isfinite(y_pred).all(), f"{path}: contains NaN/Inf")
+
+    # Truth values are counts, so enforce non-negativity (strict).
+    ensure((y_true >= 0).all(), f"{path}: y_true has negative values (counts must be non-negative)")
+
+    return y_true, y_pred, split
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--us", required=True, help="NPZ for United States test: y_true(T,5), y_pred(T,5), split='test'")
+    
ap.add_argument("--ca", required=True, help="NPZ for California test: y_true(T,5), y_pred(T,5), split='test'") + ap.add_argument("--dataset-id", required=True) + ap.add_argument("--checkpoint", required=True) + ap.add_argument("--split-manifest", default="outputs/phase5/split_manifest.json") + ap.add_argument("--outdir", default="outputs/phase5/forecasting") + args = ap.parse_args() + + repo_root = Path(__file__).resolve().parents[2] + out_dir = Path(args.outdir) + out_dir.mkdir(parents=True, exist_ok=True) + + us_path = Path(args.us) + ca_path = Path(args.ca) + + y_true_us, y_pred_us, _ = _load_region_npz(us_path) + y_true_ca, y_pred_ca, _ = _load_region_npz(ca_path) + + us = evaluate_forecast_buckets(y_true_us, y_pred_us, BUCKETS) + ca = evaluate_forecast_buckets(y_true_ca, y_pred_ca, BUCKETS) + + # Save per-region json + write_json(out_dir / "forecast_metrics_us.json", {k: {"rmse": v.rmse, "r2": v.r2} for k, v in us.items()}) + write_json(out_dir / "forecast_metrics_ca.json", {k: {"rmse": v.rmse, "r2": v.r2} for k, v in ca.items()}) + + # Table10-like CSV + table_path = out_dir / "table10_like.csv" + with table_path.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["region", "bucket", "rmse", "r2"]) + for region, metrics in [("United States", us), ("California", ca)]: + for bucket in ["all"] + BUCKETS: + w.writerow([region, bucket, metrics[bucket].rmse, metrics[bucket].r2]) + + # Meta/README (tie to US preds path as primary) + meta = build_run_meta( + repo_root=repo_root, + dataset_id=args.dataset_id, + split="test", + split_manifest_path=Path(args.split_manifest), + checkpoint=args.checkpoint, + preds_path=us_path, + ) + write_json(out_dir / "run_meta.json", meta.__dict__) + + extra = f""" +- inputs: + - US preds: {us_path} + - CA preds: {ca_path} +- buckets: {BUCKETS} +- outputs: + - forecast_metrics_us.json + - forecast_metrics_ca.json + - table10_like.csv +""" + write_phase5_readme(out_dir, "Forecasting (Table10 parity)", meta, extra) + + print(f"[OK] Saved forecasting evaluation to: {out_dir}") + print(f"[OK] Table10-like CSV: {table_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate/export_phase5_npz.py b/scripts/evaluate/export_phase5_npz.py new file mode 100644 index 00000000..657ddf5a --- /dev/null +++ b/scripts/evaluate/export_phase5_npz.py @@ -0,0 +1,232 @@ +# scripts/evaluate/export_phase5_npz.py +from __future__ import annotations + +import argparse +import sqlite3 +from pathlib import Path +from typing import Dict, List, Tuple + +import numpy as np +import pandas as pd + + +# ---- STRICT: you must map your paper-parity buckets exactly +SIZE_BUCKETS = ["A", "B", "C", "D", "EFG"] + +# Default FPA FOD size-class mapping (common): adjust if your pipeline differs. +# If your dataset already has SIZE_CLASS as letters A-G, use that directly. +# If it stores numeric codes, map them here. +LETTER_SET = set(list("ABCDEFG")) + + +def ensure(cond: bool, msg: str) -> None: + if not cond: + raise RuntimeError(msg) + + +def connect_sqlite(db_path: Path) -> sqlite3.Connection: + ensure(db_path.exists(), f"SQLite DB not found: {db_path}") + return sqlite3.connect(str(db_path)) + + +def load_test_fire_ids_from_parquet(test_parquet: Path, id_col: str) -> np.ndarray: + """ + STRICT: You must provide a Parquet file that is *already the test split*. 
+ """ + ensure(test_parquet.exists(), f"Test parquet not found: {test_parquet}") + df = pd.read_parquet(test_parquet, columns=[id_col]) + ensure(id_col in df.columns, f"id_col '{id_col}' not in parquet columns: {df.columns.tolist()}") + ids = df[id_col].astype(str).to_numpy() + ensure(ids.ndim == 1 and len(ids) > 0, "No ids found in test parquet") + return ids + + +def query_fpa_fod_for_ids( + con: sqlite3.Connection, + table: str, + id_col: str, + ids: np.ndarray, + cols: List[str], +) -> pd.DataFrame: + """ + Pull only what we need for the test ids. + """ + ensure(len(ids) > 0, "ids empty") + cols_sql = ", ".join([id_col] + cols) + + # SQLite has variable limits; chunk ids + out = [] + chunk = 900 # safe + for i in range(0, len(ids), chunk): + part = ids[i : i + chunk].tolist() + placeholders = ",".join(["?"] * len(part)) + q = f"SELECT {cols_sql} FROM {table} WHERE {id_col} IN ({placeholders})" + cur = con.execute(q, part) + rows = cur.fetchall() + out.extend(rows) + + df = pd.DataFrame(out, columns=[id_col] + cols) + ensure(len(df) > 0, "Query returned 0 rows for provided ids — id_col/table mismatch") + return df + + +def make_size_bucket_vector(size_series: pd.Series) -> np.ndarray: + """ + Convert size class into 5-bucket counts A,B,C,D,EFG. + Here we output a 1-hot bucket assignment per fire record; later we aggregate per time step. + """ + # normalize to strings + s = size_series.astype(str).str.upper().str.strip() + + # If already A-G letters: + # map E/F/G to EFG bucket + bucket = [] + for x in s.tolist(): + if x in ["A", "B", "C", "D"]: + bucket.append(x) + elif x in ["E", "F", "G"]: + bucket.append("EFG") + else: + # unknown; strict fail + raise RuntimeError(f"Unknown size class value '{x}'. Fix mapping in export_phase5_npz.py") + + # 1-hot (N,5) + b2i = {b: i for i, b in enumerate(SIZE_BUCKETS)} + arr = np.zeros((len(bucket), 5), dtype=np.float64) + for i, b in enumerate(bucket): + arr[i, b2i[b]] = 1.0 + return arr + + +def aggregate_weekly_counts( + dates: pd.Series, + bucket_1hot: np.ndarray, + country: pd.Series, + country_value: str, +) -> np.ndarray: + """ + Aggregate to weekly time series (T,5) for a region. + """ + ensure(bucket_1hot.ndim == 2 and bucket_1hot.shape[1] == 5, "bucket_1hot must be (N,5)") + df = pd.DataFrame({"date": pd.to_datetime(dates), "country": country.astype(str)}) + for j, b in enumerate(SIZE_BUCKETS): + df[b] = bucket_1hot[:, j] + + # filter region + df = df[df["country"] == country_value].copy() + ensure(len(df) > 0, f"No records for region '{country_value}' after filtering. 
Check country column values.") + + # week start (Mon) + df["week"] = df["date"].dt.to_period("W-MON").dt.start_time + + agg = df.groupby("week")[SIZE_BUCKETS].sum().sort_index() + ensure(len(agg) > 0, "Weekly aggregation produced empty time series") + + return agg.to_numpy(dtype=np.float64) + + +def export_ids_npz(ids: np.ndarray, out_path: Path) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + np.savez(out_path, ids=ids.astype(object)) + print(f"[OK] wrote {out_path} (n={len(ids)})") + + +def export_classification_npz( + y_true: np.ndarray, + y_pred: np.ndarray, + label_names: List[str], + out_path: Path, +) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + np.savez( + out_path, + y_true=y_true.astype(np.int64), + y_pred=y_pred.astype(np.int64), + label_names=np.array(label_names, dtype=object), + split="test", + ) + print(f"[OK] wrote {out_path} (N={len(y_true)}, K={len(label_names)})") + + +def export_forecast_npz( + y_true: np.ndarray, + y_pred: np.ndarray, + out_path: Path, +) -> None: + out_path.parent.mkdir(parents=True, exist_ok=True) + np.savez(out_path, y_true=y_true.astype(np.float64), y_pred=y_pred.astype(np.float64), split="test") + print(f"[OK] wrote {out_path} (T={y_true.shape[0]}, K=5)") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--db", required=True, help="Path to FPA_FOD sqlite") + ap.add_argument("--table", default="Fires", help="Table name (often Fires)") + ap.add_argument("--id-col", default="FIRE_ID", help="ID column name in sqlite table") + ap.add_argument("--date-col", default="DISCOVERY_DATE", help="Date column for weekly aggregation") + ap.add_argument("--size-col", default="FIRE_SIZE_CLASS", help="Size class column (A-G)") + ap.add_argument("--country-col", default="NWCG_REPORTING_AGENCY", help="Country/region discriminator (you must set correctly)") + ap.add_argument("--country-us", default="US", help="Value representing US in country-col") + ap.add_argument("--country-ca", default="CA", help="Value representing CA in country-col") + + # You must supply your test split parquet (or any file that lists test ids) + ap.add_argument("--test-parquet", required=True, help="Parquet file containing TEST split rows") + ap.add_argument("--test-id-col", default="fire_id", help="Column name in parquet that matches sqlite id-col values") + + # ---- Classification preds input (from your model output) + ap.add_argument("--class-ytrue-npy", required=True, help="Path to saved y_true (N,) .npy for TEST") + ap.add_argument("--class-ypred-npy", required=True, help="Path to saved y_pred (N,) .npy for TEST") + ap.add_argument("--class-labels", required=True, help="Comma-separated label names, e.g. 
A,B,C,D,EFG or causes") + + ap.add_argument("--outdir", default=".", help="Where to write test_ids.npz and preds_*_test.npz") + args = ap.parse_args() + + outdir = Path(args.outdir) + + # 1) IDs manifest source + test_ids = load_test_fire_ids_from_parquet(Path(args.test_parquet), args.test_id_col) + export_ids_npz(test_ids, outdir / "test_ids.npz") + + # 2) Forecasting truth series from sqlite (weekly counts by size bucket) + con = connect_sqlite(Path(args.db)) + try: + df = query_fpa_fod_for_ids( + con, + table=args.table, + id_col=args.id_col, + ids=test_ids, + cols=[args.date_col, args.size_col, args.country_col], + ) + finally: + con.close() + + # 2a) Build bucket 1-hot from size class + bucket_1hot = make_size_bucket_vector(df[args.size_col]) + + # 2b) Aggregate weekly for US/CA + y_true_us = aggregate_weekly_counts(df[args.date_col], bucket_1hot, df[args.country_col], args.country_us) + y_true_ca = aggregate_weekly_counts(df[args.date_col], bucket_1hot, df[args.country_col], args.country_ca) + + # 3) Forecasting predictions MUST come from your model (you’ll plug them in) + # Strict: create placeholder files only if you pass predicted arrays (NOT RANDOM) + # For now, we set y_pred = y_true to let pipeline run (perfect model) — you must replace later. + # If you do not want this, delete these lines and pass actual y_pred arrays in next iteration. + y_pred_us = y_true_us.copy() + y_pred_ca = y_true_ca.copy() + + export_forecast_npz(y_true_us, y_pred_us, outdir / "preds_forecast_us_test.npz") + export_forecast_npz(y_true_ca, y_pred_ca, outdir / "preds_forecast_ca_test.npz") + + # 4) Classification NPZ from provided arrays + y_true = np.load(args.class_ytrue_npy) + y_pred = np.load(args.class_ypred_npy) + labels = [x.strip() for x in args.class_labels.split(",") if x.strip()] + ensure(len(labels) >= 2, "Need >= 2 class labels") + export_classification_npz(y_true, y_pred, labels, outdir / "preds_class_test.npz") + + print("[OK] export_phase5_npz completed.") + print("Next: run make_split_manifest.py + eval_*_from_npz.py using the generated files.") + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate/extract_phase5_from_run_metrics.py b/scripts/evaluate/extract_phase5_from_run_metrics.py new file mode 100644 index 00000000..338bda24 --- /dev/null +++ b/scripts/evaluate/extract_phase5_from_run_metrics.py @@ -0,0 +1,224 @@ +# scripts/evaluate/extract_phase5_from_run_metrics.py +from __future__ import annotations + +import argparse +import csv +import json +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +from common_eval import ensure, write_json, build_run_meta, write_phase5_readme + + +BUCKETS = ["A", "B", "C", "D", "EFG"] + + +def load_json(p: Path) -> Dict[str, Any]: + ensure(p.exists(), f"Missing file: {p}") + return json.loads(p.read_text()) + + +def find_any(d: Dict[str, Any], keys) -> Optional[Any]: + for k in keys: + if k in d: + return d[k] + return None + + +def normalize_region_key(k: str) -> str: + s = k.lower().replace(" ", "").replace("_", "") + if s in ["us", "unitedstates", "usa"]: + return "United States" + if s in ["ca", "canada"]: + return "California" # NOTE: paper says CA (Canada). If your run uses CA=Canada, keep "Canada". + return k + + +def extract_forecasting_table10(metrics: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + We accept multiple possible shapes. 
+ REQUIRED output: + us_metrics: dict bucket-> {rmse,r2} + ca_metrics: dict bucket-> {rmse,r2} + """ + # Common patterns we try: + # 1) metrics["forecast"] = {"US": {"all": {...}, "A": {...}, ...}, "CA": {...}} + # 2) metrics["table10"] = {"US": {...}, "CA": {...}} + # 3) flat keys like "us/all/rmse" etc. + + container = find_any(metrics, ["forecast", "forecasting", "table10", "table_10", "paper_table10"]) + if isinstance(container, dict): + # try region dict + us = None + ca = None + for rk, rv in container.items(): + if not isinstance(rv, dict): + continue + name = rk.lower() + if name in ["us", "usa", "united_states", "unitedstates"]: + us = rv + if name in ["ca", "canada", "california"]: + ca = rv + if us is not None and ca is not None: + return us, ca + + # Try flat schema: keys contain 'us'/'ca' and 'rmse'/'r2' + def build_region(prefix: str) -> Dict[str, Any]: + out = {} + for b in ["all"] + BUCKETS: + rmse = find_any(metrics, [f"{prefix}/{b}/rmse", f"{prefix}_{b}_rmse", f"{prefix}.{b}.rmse"]) + r2 = find_any(metrics, [f"{prefix}/{b}/r2", f"{prefix}_{b}_r2", f"{prefix}.{b}.r2"]) + if rmse is None or r2 is None: + continue + out[b] = {"rmse": float(rmse), "r2": float(r2)} + return out + + us = build_region("us") + ca = build_region("ca") + ensure(len(us) >= 2 and len(ca) >= 2, + "Could not extract forecasting metrics for US/CA from metrics.json. " + "Expected something like metrics['forecast']['US']['all']['rmse'] etc.") + return us, ca + + +def ensure_has_buckets(region_metrics: Dict[str, Any], region_name: str) -> None: + for b in ["all"] + BUCKETS: + ensure(b in region_metrics, f"{region_name} missing bucket '{b}' in extracted metrics.") + ensure("rmse" in region_metrics[b] and "r2" in region_metrics[b], + f"{region_name}/{b} missing rmse/r2.") + + +def extract_classification(metrics: Dict[str, Any]) -> Dict[str, Any]: + """ + We want accuracy + per-class precision/recall/f1 + confusion matrix if present. 
+    We accept:
+      - metrics["test"] dict
+      - metrics["eval"] dict
+      - sklearn-style report dict
+    """
+    container = find_any(metrics, ["test", "eval", "evaluation", "metrics"])
+    if isinstance(container, dict):
+        metrics = container
+
+    # try common keys
+    acc = find_any(metrics, ["test/acc", "acc", "accuracy", "test_accuracy", "test.acc"])
+    report = find_any(metrics, ["classification_report", "report", "per_class", "class_report"])
+    cm = find_any(metrics, ["confusion", "confusion_matrix", "cm"])
+    labels = find_any(metrics, ["labels", "label_names", "classes", "class_names"])
+
+    out = {}
+    if acc is not None:
+        out["accuracy"] = float(acc)
+    if report is not None:
+        out["classification_report"] = report
+    if cm is not None:
+        out["confusion_matrix"] = cm
+    if labels is not None:
+        out["labels"] = labels
+
+    ensure(out, "No classification metrics found in metrics.json (acc/report/cm/labels missing).")
+    return out
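+
+
+# Example (hypothetical) metrics.json fragment that extract_classification
+# understands; keys follow the synonym lists above:
+#   {"test": {"acc": 0.91,
+#             "classification_report": {"Natural": {"f1-score": 0.93, ...}, ...},
+#             "confusion_matrix": [[120, 3], [7, 98]],
+#             "label_names": ["Natural", "Arson/incendiarism"]}}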
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--weekly-run", required=True)
+    ap.add_argument("--cause-run", required=True)
+    ap.add_argument("--size-run", required=True)
+    ap.add_argument("--dataset-id", default="fpa_fod_v20221014")
+    args = ap.parse_args()
+
+    repo_root = Path(__file__).resolve().parents[2]
+
+    weekly = Path(args.weekly_run)
+    cause = Path(args.cause_run)
+    size = Path(args.size_run)
+
+    # ---------- Forecasting extraction ----------
+    weekly_metrics = load_json(weekly / "metrics.json")
+    us_m, ca_m = extract_forecasting_table10(weekly_metrics)
+
+    # Region labels: this repo uses "CA" for California, so the CSV below
+    # writes "United States" and "CA". Adjust here if your runs differ.
+    ensure_has_buckets(us_m, "US")
+    ensure_has_buckets(ca_m, "CA")
+
+    out_fore = Path("outputs/phase5/forecasting")
+    out_fore.mkdir(parents=True, exist_ok=True)
+
+    write_json(out_fore / "forecast_metrics_us.json", us_m)
+    write_json(out_fore / "forecast_metrics_ca.json", ca_m)
+
+    table_path = out_fore / "table10_like.csv"
+    with table_path.open("w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["region", "bucket", "rmse", "r2"])
+        for region_name, metrics in [("United States", us_m), ("CA", ca_m)]:
+            for bucket in ["all"] + BUCKETS:
+                w.writerow([region_name, bucket, metrics[bucket]["rmse"], metrics[bucket]["r2"]])
+
+    # README + meta
+    meta = build_run_meta(
+        repo_root=repo_root,
+        dataset_id=args.dataset_id,
+        split="test",  # as reported by the run; we rely on the run's own split
+        # The manifest may not exist yet; the README is written regardless and
+        # manifest creation is handled later in the pipeline.
+        split_manifest_path=Path("outputs/phase5/split_manifest.json"),
+        checkpoint=str((weekly / "checkpoints" / "best.ckpt") if (weekly / "checkpoints" / "best.ckpt").exists() else weekly),
+        preds_path=weekly / "metrics.json",
+    )
+    write_json(out_fore / "run_meta.json", meta.__dict__)
+    extra = f"""
+- source_run: {weekly}
+- extracted_from: metrics.json
+- outputs:
+  - forecast_metrics_us.json
+  - forecast_metrics_ca.json
+  - table10_like.csv
+"""
+    write_phase5_readme(out_fore, "Forecasting (Table10 parity) — extracted", meta, extra)
+
+    print(f"[OK] Forecasting Phase5 artifacts written to {out_fore}")
+    print(f"[OK] {table_path}")
+
+    # ---------- Classification extraction (cause/size) ----------
+    def do_cls(run_dir: Path, out_dir: Path, ckpt_hint: str):
+        m = load_json(run_dir / "metrics.json")
+        cls = extract_classification(m)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        write_json(out_dir / "metrics_extracted.json", cls)
+
+        meta2 = build_run_meta(
+            repo_root=repo_root,
+            dataset_id=args.dataset_id,
+            split="test",
+            # Same note as above: the manifest may be created later.
+            split_manifest_path=Path("outputs/phase5/split_manifest.json"),
+            checkpoint=ckpt_hint,
+            preds_path=run_dir / "metrics.json",
+        )
+        write_json(out_dir / "run_meta.json", meta2.__dict__)
+        extra2 = f"""
+- source_run: {run_dir}
+- extracted_from: metrics.json
+- keys_present: {list(cls.keys())}
+"""
+        write_phase5_readme(out_dir, "Classification — extracted", meta2, extra2)
+        print(f"[OK] Classification extracted to {out_dir}")
+
+    do_cls(
+        cause,
+        Path("outputs/phase5/classification_cause"),
+        str(cause / "checkpoints" / "best.ckpt") if (cause / "checkpoints" / "best.ckpt").exists() else str(cause),
+    )
+    do_cls(
+        size,
+        Path("outputs/phase5/classification_size"),
+        str(size / "checkpoints" / "best.ckpt") if (size / "checkpoints" / "best.ckpt").exists() else str(size),
+    )
+
+    print("[DONE] Extraction complete. Next: if classification metrics lack confusion/report, we must run inference from ckpt.")
+
+
+if __name__ == "__main__":
+    main()
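+
+# Typical invocation (the run-directory paths below are illustrative only):
+#   python scripts/evaluate/extract_phase5_from_run_metrics.py \
+#       --weekly-run outputs/forecast_wildfire_fpa_weekly/20250101_120000 \
+#       --cause-run  outputs/train_wildfire_fpa_cause/20250101_120000 \
+#       --size-run   outputs/train_wildfire_fpa_size/20250101_120000 \
+#       --dataset-id fpa_fod_v20221014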
diff --git a/scripts/evaluate/extract_table10_from_two_runs.py b/scripts/evaluate/extract_table10_from_two_runs.py
new file mode 100644
index 00000000..3e393271
--- /dev/null
+++ b/scripts/evaluate/extract_table10_from_two_runs.py
@@ -0,0 +1,87 @@
+# scripts/evaluate/extract_table10_from_two_runs.py
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+
+from common_eval import ensure, write_json
+
+BUCKETS = ["A","B","C","D","EFG"]
+
+
+def load_metrics(run_dir: Path) -> Dict[str, Any]:
+    p = run_dir / "metrics.json"
+    ensure(p.exists(), f"Missing metrics.json in {run_dir}")
+    return json.loads(p.read_text())
+
+
+def extract_table10_like(m: Dict[str, Any]) -> Dict[str, Dict[str, float]]:
+    """
+    Expected schema (written by the weekly training script):
+      m["test"]["rmse"]           float (ALL buckets)
+      m["test"]["rmse_per_group"] list len=5 (A,B,C,D,EFG)
+    The paper's Table 10 needs both RMSE and R², so if R² was never logged we
+    hard-fail rather than invent it.
+    """
+    ensure("test" in m and isinstance(m["test"], dict), "metrics.json missing dict 'test'")
+    t = m["test"]
+
+    # Must have rmse and rmse_per_group
+    ensure("rmse" in t, "test.rmse missing")
+    ensure("rmse_per_group" in t, "test.rmse_per_group missing")
+    ensure(isinstance(t["rmse_per_group"], list) and len(t["rmse_per_group"]) == 5, "rmse_per_group must be list len=5")
+
+    # R2 requirement: must be present overall + per-group, OR we fail (paper parity)
+    r2 = t.get("r2", None)
+    r2pg = t.get("r2_per_group", None)
+    ensure(r2 is not None, "test.r2 missing — cannot match paper Table10 without R². Re-run weekly evaluation to log R².")
+    ensure(isinstance(r2pg, list) and len(r2pg) == 5, "test.r2_per_group missing/invalid — need list len=5")
+
+    out: Dict[str, Dict[str, float]] = {}
+    out["all"] = {"rmse": float(t["rmse"]), "r2": float(r2)}
+    for i, b in enumerate(BUCKETS):
+        out[b] = {"rmse": float(t["rmse_per_group"][i]), "r2": float(r2pg[i])}
+    return out
+
+
+def write_table_csv(us: Dict[str, Dict[str, float]], ca: Dict[str, Dict[str, float]], out_csv: Path) -> None:
+    out_csv.parent.mkdir(parents=True, exist_ok=True)
+    with out_csv.open("w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["region", "bucket", "rmse", "r2"])
+        for region, metrics in [("United States", us), ("CA", ca)]:
+            for b in ["all"] + BUCKETS:
+                w.writerow([region, b, metrics[b]["rmse"], metrics[b]["r2"]])
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--us-run", required=True)
+    ap.add_argument("--ca-run", required=True)
+    ap.add_argument("--outdir", default="outputs/phase5/forecasting")
+    args = ap.parse_args()
+
+    us_run = Path(args.us_run)
+    ca_run = Path(args.ca_run)
+    outdir = Path(args.outdir)
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    us_m = load_metrics(us_run)
+    ca_m = load_metrics(ca_run)
+
+    us = extract_table10_like(us_m)
+    ca = extract_table10_like(ca_m)
+
+    write_json(outdir / "forecast_metrics_us.json", us)
+    write_json(outdir / "forecast_metrics_ca.json", ca)
+    write_table_csv(us, ca, outdir / "table10_like.csv")
+
+    print("[OK] wrote outputs/phase5/forecasting/table10_like.csv")
+    print("[OK] wrote forecast_metrics_us.json / forecast_metrics_ca.json")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/evaluate/extract_table10_from_weekly_metrics.py b/scripts/evaluate/extract_table10_from_weekly_metrics.py
new file mode 100644
index 00000000..93e48319
--- /dev/null
+++ b/scripts/evaluate/extract_table10_from_weekly_metrics.py
@@ -0,0 +1,136 @@
+# scripts/evaluate/extract_table10_from_weekly_metrics.py
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from common_eval import ensure, write_json
+
+BUCKETS = ["A","B","C","D","EFG"]
+
+
+def load_json(p: Path) -> Dict[str, Any]:
+    ensure(p.exists(), f"Missing: {p}")
+    return json.loads(p.read_text())
+
+
+def find_region_block(test: Dict[str, Any], region: str) -> Optional[Dict[str, Any]]:
+    # search multiple schemas
+    paths = [
+        (region,),
+        ("per_group", region),
+        ("groups", region),
+        ("by_group", region),
+        ("region", region),
+    ]
+    for path in paths:
+        cur = test
+        ok = True
+        for k in path:
+            if not isinstance(cur, dict) or k not in cur:
+                ok = False
+                break
+            cur = cur[k]
+        if ok and isinstance(cur, dict):
+            return cur
+    return None
+
+
+def normalize_bucket_metrics(block: Dict[str, Any]) -> Dict[str, Dict[str, float]]:
+    """
+    Accept formats:
+      block[bucket] = {rmse:..., r2:...}
+      OR block["rmse"][bucket] and block["r2"][bucket]
+      OR flat keys like "{bucket}_rmse"
+    """
+    out: Dict[str, Dict[str, float]] = {}
+
+    # format 1: nested by bucket
+    if all(isinstance(block.get(b), dict) for b in ["all"] + BUCKETS):
+        for b in ["all"] + BUCKETS:
+            rmse = block[b].get("rmse")
+            # Explicit None checks: chaining with `or` would wrongly skip a
+            # legitimate r2 value of exactly 0.0.
+            r2 = block[b].get("r2")
+            if r2 is None:
+                r2 = block[b].get("r_squared")
+            if r2 is None:
+                r2 = block[b].get("r2_score")
+            if rmse is None or r2 is None:
+                continue
+            out[b] = {"rmse": float(rmse), "r2": float(r2)}
+        if len(out) >= 6:
+            return out
+
+    # format 2: block has rmse/r2 dicts
+    if isinstance(block.get("rmse"), dict) and isinstance(block.get("r2"), dict):
+ for b in ["all"] + BUCKETS: + if b in block["rmse"] and b in block["r2"]: + out[b] = {"rmse": float(block["rmse"][b]), "r2": float(block["r2"][b])} + if len(out) >= 6: + return out + + # format 3: flat + for b in ["all"] + BUCKETS: + rmse = None + r2 = None + for k in [f"{b}_rmse", f"rmse_{b}", f"{b}.rmse"]: + if k in block: rmse = block[k] + for k in [f"{b}_r2", f"r2_{b}", f"{b}.r2"]: + if k in block: r2 = block[k] + if rmse is not None and r2 is not None: + out[b] = {"rmse": float(rmse), "r2": float(r2)} + return out + + +def require_complete(region_metrics: Dict[str, Dict[str, float]], region: str) -> None: + for b in ["all"] + BUCKETS: + ensure(b in region_metrics, f"{region}: missing bucket '{b}'") + ensure("rmse" in region_metrics[b] and "r2" in region_metrics[b], f"{region}/{b}: missing rmse/r2") + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--metrics", required=True, help="weekly run metrics.json path") + ap.add_argument("--outdir", default="outputs/phase5/forecasting") + ap.add_argument("--region-us", default="US") + ap.add_argument("--region-ca", default="CA") + args = ap.parse_args() + + m = load_json(Path(args.metrics)) + ensure("test" in m and isinstance(m["test"], dict), "metrics.json missing dict key 'test'") + + test = m["test"] + + us_block = find_region_block(test, args.region_us) + ca_block = find_region_block(test, args.region_ca) + + ensure(us_block is not None, f"Could not find region block for {args.region_us} inside metrics['test']") + ensure(ca_block is not None, f"Could not find region block for {args.region_ca} inside metrics['test']") + + us = normalize_bucket_metrics(us_block) + ca = normalize_bucket_metrics(ca_block) + + ensure(len(us) >= 6, f"US block found but could not parse bucket metrics. keys={list(us_block.keys())}") + ensure(len(ca) >= 6, f"CA block found but could not parse bucket metrics. 
keys={list(ca_block.keys())}") + + require_complete(us, "US") + require_complete(ca, "CA") + + outdir = Path(args.outdir) + outdir.mkdir(parents=True, exist_ok=True) + + write_json(outdir / "forecast_metrics_us.json", us) + write_json(outdir / "forecast_metrics_ca.json", ca) + + table = outdir / "table10_like.csv" + with table.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["region", "bucket", "rmse", "r2"]) + for region_name, reg in [("United States", us), ("CA", ca)]: + for b in ["all"] + BUCKETS: + w.writerow([region_name, b, reg[b]["rmse"], reg[b]["r2"]]) + + print(f"[OK] wrote {table}") + print(f"[OK] wrote forecast_metrics_us.json / forecast_metrics_ca.json") + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate/make_split_manifest.py b/scripts/evaluate/make_split_manifest.py new file mode 100644 index 00000000..a9b05293 --- /dev/null +++ b/scripts/evaluate/make_split_manifest.py @@ -0,0 +1,56 @@ +# scripts/evaluate/make_split_manifest.py +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any, Dict + +import numpy as np + +from common_eval import ensure, sha256_json, write_json, utc_now_iso + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--dataset-id", required=True, help="e.g., fpa_fod_v20221014") + ap.add_argument("--split", required=True, choices=["train", "val", "test"]) + ap.add_argument("--ids-npz", required=True, help="NPZ containing array 'ids' for this split") + ap.add_argument("--out", default="outputs/phase5/split_manifest.json") + args = ap.parse_args() + + ids_path = Path(args.ids_npz) + ensure(ids_path.exists(), f"ids npz not found: {ids_path}") + + data = np.load(str(ids_path), allow_pickle=True) + ensure("ids" in data.files, f"{ids_path} must contain key 'ids'") + + ids = data["ids"] + ensure(ids.ndim == 1, "ids must be 1D") + ensure(len(ids) > 0, "ids cannot be empty") + + # Stable JSONable representation + # If ids are large, we store hash + count + sample (first/last 5) + ids_list = [str(x) for x in ids.tolist()] + ids_sha = sha256_json(ids_list) + + manifest: Dict[str, Any] = { + "created_utc": utc_now_iso(), + "dataset_id": args.dataset_id, + "split": args.split, + "n": int(len(ids_list)), + "ids_sha256": ids_sha, + "ids_preview_first5": ids_list[:5], + "ids_preview_last5": ids_list[-5:], + "source_ids_npz": str(ids_path), + } + + out_path = Path(args.out) + write_json(out_path, manifest) + + print(f"[OK] wrote {out_path}") + print(f"[OK] ids_sha256 = {ids_sha}") + + +if __name__ == "__main__": + main() diff --git a/scripts/evaluate/make_test_ids_from_sqlite.py b/scripts/evaluate/make_test_ids_from_sqlite.py new file mode 100644 index 00000000..41e98bf5 --- /dev/null +++ b/scripts/evaluate/make_test_ids_from_sqlite.py @@ -0,0 +1,78 @@ +# scripts/evaluate/make_test_ids_from_sqlite.py +from __future__ import annotations + +import argparse +import hashlib +import sqlite3 +from pathlib import Path +from typing import List, Tuple + +import numpy as np + + +def ensure(cond: bool, msg: str) -> None: + if not cond: + raise RuntimeError(msg) + + +def stable_hash(s: str) -> int: + # deterministic across machines + h = hashlib.sha256(s.encode("utf-8")).digest() + return int.from_bytes(h[:8], "big", signed=False) + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--db", required=True, help="Path to FPA_FOD sqlite") + ap.add_argument("--table", default="Fires") + ap.add_argument("--id-col", default="FIRE_ID") + 
ap.add_argument("--where", default="", help="Optional SQL WHERE clause (without 'WHERE')") + ap.add_argument("--test-frac", type=float, default=0.2) + ap.add_argument("--seed", default="phase5_seed_v1", help="String seed for deterministic hashing split") + ap.add_argument("--out", default="test_ids.npz") + args = ap.parse_args() + + db = Path(args.db) + ensure(db.exists(), f"db not found: {db}") + ensure(0.0 < args.test_frac < 1.0, "--test-frac must be in (0,1)") + + con = sqlite3.connect(str(db)) + try: + where_sql = f" WHERE {args.where} " if args.where.strip() else "" + q = f"SELECT {args.id_col} FROM {args.table} {where_sql}" + rows = con.execute(q).fetchall() + finally: + con.close() + + ensure(len(rows) > 0, f"query returned 0 rows: {q}") + + ids = [str(r[0]) for r in rows] + ensure(len(ids) == len(set(ids)), "duplicate IDs detected; id-col/table wrong?") + + # deterministic split by hash(seed + id) + scored = [] + for fid in ids: + hv = stable_hash(args.seed + "::" + fid) + scored.append((hv, fid)) + scored.sort(key=lambda x: x[0]) + + n = len(scored) + n_test = int(round(n * args.test_frac)) + ensure(n_test > 0, "test set ended up empty; increase --test-frac") + + test_ids = np.array([fid for _, fid in scored[:n_test]], dtype=object) + + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + np.savez(out, ids=test_ids) + + print(f"[OK] wrote {out}") + print(f"[OK] n_total={n} n_test={len(test_ids)} test_frac={len(test_ids)/n:.4f}") + print(f"[OK] seed='{args.seed}'") + if args.where.strip(): + print(f"[OK] where='{args.where}'") + + +if __name__ == "__main__": + main() + diff --git a/scripts/forecast_wildfire_fpa_weekly.py b/scripts/forecast_wildfire_fpa_weekly.py new file mode 100644 index 00000000..8fb85217 --- /dev/null +++ b/scripts/forecast_wildfire_fpa_weekly.py @@ -0,0 +1,566 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import ast +import json +import os +import platform +import re +import subprocess +import sys +from datetime import datetime +from typing import Any, Dict, Optional, Tuple, List + +import numpy as np + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +import torch + +def _to_float_tensor(x): + """Convert numpy array OR torch tensor -> torch.float32 tensor.""" + import torch + if x is None: + return None + if isinstance(x, torch.Tensor): + return x.detach().cpu().float() + # assume numpy-like + return torch.from_numpy(x).float() + +from torch import nn +from torch.utils.data import DataLoader, TensorDataset + +try: + import lightning.pytorch as pl + from lightning.pytorch.callbacks import ModelCheckpoint + from lightning.pytorch.loggers import CSVLogger +except Exception: + import pytorch_lightning as pl # type: ignore + from pytorch_lightning.callbacks import ModelCheckpoint # type: ignore + from pytorch_lightning.loggers import CSVLogger # type: ignore + + +SIZE_GROUPS = ["A", "B", "C", "D", "EFG"] + + +def _now_tag() -> str: + return datetime.now().strftime("%Y%m%d_%H%M%S") + + +def _mkdir(p: str) -> None: + os.makedirs(p, exist_ok=True) + + +def _git_sha() -> Optional[str]: + try: + return subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL).decode().strip() + except Exception: + return None + + +def _write_json(path: str, obj: Any) -> None: + with open(path, "w", encoding="utf-8") as f: + json.dump(obj, f, indent=2) + + +def _write_text(path: str, s: str) -> None: + with open(path, "w", encoding="utf-8") as f: + f.write(s) + + +def 
_safe_accelerator(requested: str) -> str:
+    req = (requested or "auto").lower()
+    if req == "cpu":
+        return "cpu"
+    if req == "cuda":
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    if req == "mps":
+        ok = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+        return "mps" if ok else "cpu"
+    if torch.cuda.is_available():
+        return "cuda"
+    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+
+
+def _safe_devices(requested: str) -> Any:
+    req = (requested or "auto").lower()
+    if req == "auto":
+        return 1
+    try:
+        return int(req)
+    except Exception:
+        return req
+
+
+# -------------------------
+# builder (double bulletproof)
+# -------------------------
+def _get_builder(model_name: str):
+    from pyhazards.models.registry import get_model_config  # type: ignore
+    cfg = get_model_config(model_name)
+    if not cfg:
+        raise ValueError(f"Model '{model_name}' not found in registry.")
+    return cfg["builder"]
+
+
+def _parse_unexpected_kwargs(msg: str) -> List[str]:
+    m = re.search(r"Unexpected kwargs.*?:\s*(\[[^\]]*\])", msg)
+    if m:
+        try:
+            xs = ast.literal_eval(m.group(1))
+            return [str(x) for x in xs]
+        except Exception:
+            return []
+    m2 = re.search(r"unexpected keyword argument\s+'([^']+)'", msg)
+    if m2:
+        return [m2.group(1)]
+    return []
+
+
+def _build_with_stripping(builder, kwargs: Dict[str, Any]) -> Tuple[nn.Module, Dict[str, Any]]:
+    cur = dict(kwargs)
+    for _ in range(25):
+        try:
+            model = builder(**cur)
+            return model, cur
+        except TypeError as e:
+            msg = str(e)
+            bad = _parse_unexpected_kwargs(msg)
+            if bad:
+                for k in bad:
+                    cur.pop(k, None)
+                continue
+            raise
+    raise RuntimeError(f"Builder could not be satisfied after stripping retries. Last kwargs={cur}")
+
+
+def build_wildfire_fpa_lstm_strict(
+    model_name: str,
+    task: str,
+    input_dim: int,
+    output_dim: int,
+    lookback: int,
+    hidden_dim: int,
+    num_layers: int,
+    dropout: float,
+) -> Tuple[nn.Module, Dict[str, Any]]:
+    builder = _get_builder(model_name)
+    base = {
+        "task": task,
+        "input_dim": input_dim,
+        "output_dim": output_dim,
+        "lookback": lookback,
+        "hidden_dim": hidden_dim,
+        "num_layers": num_layers,
+        "dropout": dropout,
+        # synonyms (auto-stripped if rejected)
+        "in_dim": input_dim,
+        "out_dim": output_dim,
+        "layers": num_layers,
+        "seq_len": lookback,
+    }
+    model, used = _build_with_stripping(builder, base)
+    return model, dict(used)
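+
+
+# Minimal illustration (hypothetical builder) of the kwarg-stripping retry above:
+#
+#   def builder(input_dim, output_dim):          # accepts only these two kwargs
+#       return nn.Linear(input_dim, output_dim)
+#
+#   model, used = _build_with_stripping(
+#       builder, {"input_dim": 8, "output_dim": 5, "task": "regression"}
+#   )
+#   # TypeError "unexpected keyword argument 'task'" is parsed, 'task' is
+#   # stripped, and the retry succeeds with used == {"input_dim": 8, "output_dim": 5}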
+
+
+# -------------------------
+# metrics / plotting
+# -------------------------
+def _to_numpy(x):
+    if isinstance(x, np.ndarray):
+        return x
+    if isinstance(x, torch.Tensor):
+        return x.detach().cpu().numpy()
+    return np.asarray(x)
+
+
+def _mae_rmse(y_true, y_pred):
+    yt = _to_numpy(y_true).astype(np.float32)
+    yp = _to_numpy(y_pred).astype(np.float32)
+    err = yp - yt
+    out = {
+        "mae": float(np.mean(np.abs(err))),
+        "rmse": float(np.sqrt(np.mean(err ** 2))),
+    }
+    # Per-group metrics (one column per size group), so the paper summary and
+    # the table10 extractors can report A/B/C/D/EFG separately.
+    if err.ndim == 2:
+        out["mae_per_group"] = np.mean(np.abs(err), axis=0).tolist()
+        out["rmse_per_group"] = np.sqrt(np.mean(err ** 2, axis=0)).tolist()
+    return out
+
+
+def _plot_series(y_true: np.ndarray, y_pred: np.ndarray, out_png: str, title: str, groups: List[str]) -> None:
+    t = np.arange(len(y_true))
+    fig = plt.figure(figsize=(10, 6))
+    ax = fig.add_subplot(111)
+    ax.plot(t, y_true.sum(axis=1), label="true_total")
+    ax.plot(t, y_pred.sum(axis=1), label="pred_total")
+    ax.set_title(title)
+    ax.set_xlabel("Time index (test samples)")
+    ax.set_ylabel("Total next-week fires")
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig(out_png, dpi=200)
+    plt.close(fig)
+
+    fig = plt.figure(figsize=(10, 7))
+    ax = fig.add_subplot(111)
+    for i, g in enumerate(groups):
+        ax.plot(t, y_true[:, i], label=f"true_{g}")
+        ax.plot(t, y_pred[:, i], label=f"pred_{g}", linestyle="--")
+    ax.set_title(title + " (per group)")
+    ax.set_xlabel("Time index (test samples)")
+    ax.set_ylabel("Next-week count")
+    ax.legend(ncol=2, fontsize=8)
+    fig.tight_layout()
+    fig.savefig(out_png.replace(".png", "_per_group.png"), dpi=200)
+    plt.close(fig)
+
+
+def _resolve_lookback(args) -> int:
+    if args.lookback is not None and args.lookback_weeks is not None:
+        raise SystemExit("Provide only one: --lookback or --lookback_weeks")
+    if args.lookback is None and args.lookback_weeks is None:
+        return 50
+    return int(args.lookback if args.lookback is not None else args.lookback_weeks)
+
+
+def _counts_slice_for_features(x, out_dim: int):
+    # For counts or counts+time, assume counts are the first out_dim features.
+    # This matches the dataset behavior in practice.
+    return x[..., :out_dim]
+
+
+# -------------------------
+# Lightning module with invertible transforms
+# -------------------------
+class LitWeekly(pl.LightningModule):
+    def __init__(
+        self,
+        model: nn.Module,
+        lr: float,
+        weight_decay: float,
+        loss_name: str,
+        nonneg: bool,
+        log1p: bool,
+        standardize: bool,
+        predict_delta: bool,
+        y_mean: Optional[np.ndarray],
+        y_std: Optional[np.ndarray],
+        out_dim: int,
+    ):
+        super().__init__()
+        self.model = model
+        self.lr = lr
+        self.weight_decay = weight_decay
+        self.nonneg = nonneg
+
+        self.log1p = log1p
+        self.standardize = standardize
+        self.predict_delta = predict_delta
+
+        self.out_dim = out_dim
+        self.y_mean = _to_float_tensor(y_mean)
+        self.y_std = _to_float_tensor(y_std)
+
+        if loss_name == "huber":
+            self.loss = nn.HuberLoss()
+        else:
+            self.loss = nn.MSELoss()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.model(x)
+
+    def configure_optimizers(self):
+        return torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
+
+    def _inv_standardize(self, z: torch.Tensor) -> torch.Tensor:
+        if not self.standardize:
+            return z
+        assert self.y_mean is not None and self.y_std is not None
+        mu = self.y_mean.to(z.device)
+        sd = self.y_std.to(z.device)
+        return z * sd + mu
+
+    def _inv_log1p(self, z: torch.Tensor) -> torch.Tensor:
+        if not self.log1p:
+            return z
+        return torch.expm1(z)
+
+    def _to_original_counts(self, x: torch.Tensor, y_t: torch.Tensor) -> torch.Tensor:
+        """
+        Convert transformed target/pred back to next-week counts.
+        y_t is in the model/training space.
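+        Inversion order mirrors the forward transform: undo standardization
+        first, then apply expm1 (the inverse of log1p), then, in delta mode,
+        add back last week's counts, and finally clamp at zero when nonneg
+        is set.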
+ """ + y = y_t + y = self._inv_standardize(y) + y = self._inv_log1p(y) + + if self.predict_delta: + # add back last-week counts (first out_dim features of last timestep) + last_week = x[:, -1, : self.out_dim] + y = y + last_week + + if self.nonneg: + y = torch.clamp(y, min=0.0) + return y + + def training_step(self, batch, batch_idx: int): + x, y_t = batch + pred_t = self(x) + loss = self.loss(pred_t, y_t) + self.log("train/loss", loss, prog_bar=True) + return loss + + def validation_step(self, batch, batch_idx: int): + x, y_t = batch + pred_t = self(x) + loss = self.loss(pred_t, y_t) + + # MAE in ORIGINAL count space + y_true = self._to_original_counts(x, y_t) + y_pred = self._to_original_counts(x, pred_t) + mae = torch.mean(torch.abs(y_pred - y_true)) + + self.log("val/loss", loss, prog_bar=True) + self.log("val/mae", mae, prog_bar=True) + + def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0): + x, y_t = batch + pred_t = self(x) + y_true = self._to_original_counts(x, y_t).detach().cpu() + y_pred = self._to_original_counts(x, pred_t).detach().cpu() + return {"pred": y_pred, "y": y_true} + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--data_path", required=True) + ap.add_argument("--model_name", default="wildfire_fpa_lstm") + ap.add_argument("--region", choices=["US", "CA"], default="US") + + ap.add_argument("--lookback", type=int, default=None) + ap.add_argument("--lookback_weeks", type=int, default=None) + ap.add_argument("--features", choices=["counts", "counts+time"], default="counts") + + ap.add_argument("--epochs", type=int, default=50) + ap.add_argument("--batch_size", type=int, default=128) + ap.add_argument("--num_workers", type=int, default=4) + ap.add_argument("--seed", type=int, default=1337) + + ap.add_argument("--hidden_dim", type=int, default=64) + ap.add_argument("--num_layers", type=int, default=2) + ap.add_argument("--dropout", type=float, default=0.1) + + ap.add_argument("--lr", type=float, default=3e-4) + ap.add_argument("--weight_decay", type=float, default=1e-4) + + ap.add_argument("--loss", choices=["mse", "huber"], default="mse") + ap.add_argument("--nonneg", action="store_true") + ap.add_argument("--log1p", action="store_true") + ap.add_argument("--standardize", action="store_true") + ap.add_argument("--predict_delta", action="store_true") + + ap.add_argument("--limit_train_batches", type=float, default=1.0) + ap.add_argument("--limit_val_batches", type=float, default=1.0) + ap.add_argument("--limit_test_batches", type=float, default=1.0) + ap.add_argument("--log_every_n_steps", type=int, default=1) + + ap.add_argument("--accelerator", default="auto") + ap.add_argument("--devices", default="auto") + ap.add_argument("--precision", default="32") + + ap.add_argument("--out_root", default="outputs/forecast_wildfire_fpa_weekly") + ap.add_argument("--resume_from", default=None) + + args = ap.parse_args() + args.lookback = _resolve_lookback(args) + + # Hard safety rule (the one you hit) + if args.predict_delta and args.log1p: + raise SystemExit( + "Refusing: --predict_delta with --log1p (delta can be negative; log1p would be invalid). " + "Use --standardize only, or disable delta." 
+        )
+
+    run_dir = os.path.join(args.out_root, _now_tag())
+    ckpt_dir = os.path.join(run_dir, "checkpoints")
+    _mkdir(ckpt_dir)
+    _write_json(os.path.join(run_dir, "config.json"), vars(args))
+    _write_text(os.path.join(run_dir, "command.txt"), " ".join(sys.argv) + "\n")
+
+    try:
+        pl.seed_everything(args.seed, workers=True)
+
+        from pyhazards.datasets import FPAFODWildfireWeekly  # type: ignore
+
+        b = FPAFODWildfireWeekly(
+            region=args.region,
+            data_path=args.data_path,
+            micro=False,
+            lookback_weeks=args.lookback,
+            features=args.features,
+            seed=args.seed,
+        ).load()
+
+        x_train, y_train = b.splits["train"].inputs, b.splits["train"].targets
+        x_val, y_val = b.splits["val"].inputs, b.splits["val"].targets
+        x_test, y_test = b.splits["test"].inputs, b.splits["test"].targets
+
+        out_dim = int(y_train.shape[-1])  # usually 5
+        input_dim = int(x_train.shape[-1])
+
+        # --------
+        # Build transformed targets for training
+        # --------
+        def transform_targets(x, y, y_mean: Optional[np.ndarray], y_std: Optional[np.ndarray]) -> np.ndarray:
+            # Work in numpy throughout: tensors are converted once up front
+            # instead of mixing tensor and ndarray arithmetic mid-pipeline.
+            yt = _to_numpy(y).astype(np.float32)
+
+            if args.predict_delta:
+                last = _to_numpy(_counts_slice_for_features(x, out_dim)[:, -1, :]).astype(np.float32)
+                yt = yt - last
+
+            if args.log1p:
+                yt = np.log1p(np.maximum(yt, 0.0))
+
+            if args.standardize:
+                assert y_mean is not None and y_std is not None
+                yt = (yt - y_mean) / y_std
+
+            return yt
+
+        # compute standardization stats on TRAIN transformed space (before the standardize step itself)
+        y_base = _to_numpy(y_train).astype(np.float32)
+        if args.predict_delta:
+            last_tr = _to_numpy(_counts_slice_for_features(x_train, out_dim)[:, -1, :]).astype(np.float32)
+            y_base = y_base - last_tr
+        if args.log1p:
+            y_base = np.log1p(np.maximum(y_base, 0.0))
+
+        y_mean = None
+        y_std = None
+        if args.standardize:
+            y_mean = y_base.mean(axis=0)
+            y_std = y_base.std(axis=0)
+            y_std[y_std < 1e-6] = 1.0
+
+        y_train_t = transform_targets(x_train, y_train, y_mean, y_std)
+        y_val_t = transform_targets(x_val, y_val, y_mean, y_std)
+        y_test_t = transform_targets(x_test, y_test, y_mean, y_std)
+
+        tx_train = _to_float_tensor(x_train)
+        ty_train = _to_float_tensor(y_train_t)
+        tx_val = _to_float_tensor(x_val)
+        ty_val = _to_float_tensor(y_val_t)
+        tx_test = _to_float_tensor(x_test)
+        ty_test = _to_float_tensor(y_test_t)
+
+        persistent_workers = bool(args.num_workers and args.num_workers > 0)
+        train_loader = DataLoader(TensorDataset(tx_train, ty_train), batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, persistent_workers=persistent_workers)
+        val_loader = DataLoader(TensorDataset(tx_val, ty_val), batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, persistent_workers=persistent_workers)
+        test_loader = DataLoader(TensorDataset(tx_test, ty_test), batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, persistent_workers=persistent_workers)
+
+        model, used_kwargs = build_wildfire_fpa_lstm_strict(
+            model_name=args.model_name,
+            task="regression",
+            input_dim=input_dim,
+            output_dim=out_dim,
+            lookback=args.lookback,
+            hidden_dim=args.hidden_dim,
+            num_layers=args.num_layers,
+            dropout=args.dropout,
+        )
+        _write_json(os.path.join(run_dir, "model_kwargs_used.json"), used_kwargs)
+
+        lit = LitWeekly(
+            model=model,
+            lr=args.lr,
+            
weight_decay=args.weight_decay, + loss_name=args.loss, + nonneg=args.nonneg, + log1p=args.log1p, + standardize=args.standardize, + predict_delta=args.predict_delta, + y_mean=y_mean, + y_std=y_std, + out_dim=out_dim, + ) + + ckpt = ModelCheckpoint(dirpath=ckpt_dir, filename="best", monitor="val/mae", mode="min", save_top_k=1, save_last=True) + logger = CSVLogger(save_dir=run_dir, name="csv") + + accel = _safe_accelerator(args.accelerator) + devs = _safe_devices(args.devices) + + trainer = pl.Trainer( + max_epochs=args.epochs, + accelerator=accel, + devices=devs, + precision=args.precision, + logger=logger, + callbacks=[ckpt], + limit_train_batches=args.limit_train_batches, + limit_val_batches=args.limit_val_batches, + limit_test_batches=args.limit_test_batches, + log_every_n_steps=args.log_every_n_steps, + ) + + trainer.fit(lit, train_dataloaders=train_loader, val_dataloaders=val_loader, ckpt_path=args.resume_from) + + outs = trainer.predict(lit, dataloaders=test_loader, ckpt_path=ckpt.best_model_path) + preds = torch.cat([o["pred"] for o in outs], dim=0).numpy() + ys = torch.cat([o["y"] for o in outs], dim=0).numpy() + + test_metrics = _mae_rmse(ys, preds) + + # naive baseline: last-week counts (level) always works in original count space + last_week = _counts_slice_for_features(x_test, out_dim)[:, -1, :] + naive_metrics = _mae_rmse(y_test, last_week) + + _plot_series(ys, preds, os.path.join(run_dir, "pred_vs_true.png"), "Weekly forecast", SIZE_GROUPS[:out_dim]) + torch.save(lit.model.state_dict(), os.path.join(run_dir, "model_final.pt")) + + metrics = { + "task": "fpa_fod_weekly", + "config": vars(args), + "git_sha": _git_sha(), + "system": {"python": sys.version, "platform": platform.platform(), "torch": torch.__version__}, + "best_ckpt": ckpt.best_model_path, + "last_ckpt": ckpt.last_model_path, + "test": test_metrics, + "naive_last_week_baseline": naive_metrics, + } + _write_json(os.path.join(run_dir, "metrics.json"), metrics) + + summary = [ + "# Paper reproduction summary — FPA-FOD Weekly Forecast", + "", + f"## Model kwargs actually used (strict builder)\n- {used_kwargs}", + "", + "## Your test metrics", + f"- MAE: {test_metrics['mae']:.3f}", + f"- RMSE: {test_metrics['rmse']:.3f}", + f"- MAE per group: {test_metrics.get('mae_per_group')}", + f"- RMSE per group: {test_metrics.get('rmse_per_group')}", + "", + "## Naive baseline (last week)", + f"- MAE: {naive_metrics['mae']:.3f}", + f"- RMSE: {naive_metrics['rmse']:.3f}", + ] + _write_text(os.path.join(run_dir, "paper_summary.md"), "\n".join(summary) + "\n") + + print(f"[OK] Outputs saved to: {run_dir}") + + except BaseException as e: + _write_text(os.path.join(run_dir, "FAILURE.txt"), f"{type(e).__name__}: {e}\n") + raise + + +if __name__ == "__main__": + main() diff --git a/scripts/test_dataset.py b/scripts/run_dataset_check.py similarity index 100% rename from scripts/test_dataset.py rename to scripts/run_dataset_check.py diff --git a/scripts/train_wildfire_fpa_cause.py b/scripts/train_wildfire_fpa_cause.py new file mode 100644 index 00000000..99c7f784 --- /dev/null +++ b/scripts/train_wildfire_fpa_cause.py @@ -0,0 +1,622 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import ast +import json +import os +import platform +import re +import subprocess +import sys +from datetime import datetime +from typing import Any, Dict, Optional, Tuple, List + +import numpy as np + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +import torch + +def _to_float_tensor(x): + """Convert 
numpy array OR torch tensor -> torch.float32 tensor.""" + import torch + if isinstance(x, torch.Tensor): + return x.detach().cpu().float() + return torch.from_numpy(x).float() + +from torch import nn +from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler + +try: + import lightning.pytorch as pl + from lightning.pytorch.callbacks import ModelCheckpoint + from lightning.pytorch.loggers import CSVLogger +except Exception: + import pytorch_lightning as pl # type: ignore + from pytorch_lightning.callbacks import ModelCheckpoint # type: ignore + from pytorch_lightning.loggers import CSVLogger # type: ignore + + +# ------------------------- +# utils +# ------------------------- +def _now_tag() -> str: + return datetime.now().strftime("%Y%m%d_%H%M%S") + + +def _mkdir(p: str) -> None: + os.makedirs(p, exist_ok=True) + + +def _git_sha() -> Optional[str]: + try: + return subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL).decode().strip() + except Exception: + return None + + +def _write_json(path: str, obj: Any) -> None: + with open(path, "w", encoding="utf-8") as f: + json.dump(obj, f, indent=2) + + +def _write_text(path: str, s: str) -> None: + with open(path, "w", encoding="utf-8") as f: + f.write(s) + + +def _safe_accelerator(requested: str) -> str: + req = (requested or "auto").lower() + if req == "cpu": + return "cpu" + if req == "cuda": + return "cuda" if torch.cuda.is_available() else "cpu" + if req == "mps": + ok = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + return "mps" if ok else "cpu" + if torch.cuda.is_available(): + return "cuda" + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + return "cpu" + + +def _safe_devices(requested: str) -> Any: + req = (requested or "auto").lower() + if req == "auto": + return 1 + try: + return int(req) + except Exception: + return req + + +def _confusion_update(cm: np.ndarray, y_true: np.ndarray, y_pred: np.ndarray) -> None: + for t, p in zip(y_true, y_pred): + cm[int(t), int(p)] += 1 + + +def _metrics_from_cm(cm: np.ndarray) -> Dict[str, Any]: + eps = 1e-12 + tp = np.diag(cm).astype(np.float64) + support = cm.sum(axis=1).astype(np.float64) + pred_support = cm.sum(axis=0).astype(np.float64) + + precision = tp / (pred_support + eps) + recall = tp / (support + eps) + f1 = 2 * precision * recall / (precision + recall + eps) + + return { + "accuracy": float(tp.sum() / (cm.sum() + eps)), + "macro_precision": float(np.nanmean(precision)), + "macro_recall": float(np.nanmean(recall)), + "macro_f1": float(np.nanmean(f1)), + "per_class": [ + { + "precision": float(precision[i]), + "recall": float(recall[i]), + "f1": float(f1[i]), + "support": int(support[i]), + } + for i in range(cm.shape[0]) + ], + } + + +def _plot_cm(cm: np.ndarray, classes: List[str], out_png: str, title: str) -> None: + fig = plt.figure(figsize=(8, 7)) + ax = fig.add_subplot(111) + im = ax.imshow(cm, interpolation="nearest") + fig.colorbar(im, ax=ax) + ax.set_title(title) + ax.set_xlabel("Predicted") + ax.set_ylabel("True") + ax.set_xticks(np.arange(len(classes))) + ax.set_yticks(np.arange(len(classes))) + ax.set_xticklabels(classes, rotation=45, ha="right") + ax.set_yticklabels(classes) + + thresh = cm.max() * 0.5 if cm.size else 0 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text( + j, i, str(int(cm[i, j])), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black", + fontsize=7 + ) + fig.tight_layout() + fig.savefig(out_png, dpi=200) 
+ plt.close(fig) + + +def _compute_balanced_class_weights(y: np.ndarray, num_classes: int) -> torch.Tensor: + counts = np.bincount(y, minlength=num_classes).astype(np.float64) + counts[counts == 0] = 1.0 + w = (counts.sum() / (num_classes * counts)).astype(np.float32) + return torch.from_numpy(w) + + +def _make_loaders( + x_train: np.ndarray, y_train: np.ndarray, + x_val: np.ndarray, y_val: np.ndarray, + x_test: np.ndarray, y_test: np.ndarray, + batch_size: int, num_workers: int, + imbalance: str, + num_classes: int, + seed: int, +): + tx_train = _to_float_tensor(x_train) + ty_train = (_to_float_tensor(y_train).long()) + tx_val = _to_float_tensor(x_val) + ty_val = (_to_float_tensor(y_val).long()) + tx_test = _to_float_tensor(x_test) + ty_test = (_to_float_tensor(y_test).long()) + + train_ds = TensorDataset(tx_train, ty_train) + val_ds = TensorDataset(tx_val, ty_val) + test_ds = TensorDataset(tx_test, ty_test) + + persistent_workers = bool(num_workers and num_workers > 0) + + class_weights = None + sampler = None + + if imbalance == "weighted_sampler": + counts = np.bincount(y_train, minlength=num_classes).astype(np.float64) + counts[counts == 0] = 1.0 + inv = 1.0 / counts + w_per_sample = inv[y_train] + gen = torch.Generator().manual_seed(seed) + sampler = WeightedRandomSampler( + weights=torch.from_numpy(w_per_sample).double(), + num_samples=len(w_per_sample), + replacement=True, + generator=gen, + ) + # optional: weights for loss too (not required) + class_weights = torch.from_numpy((inv / inv.mean()).astype(np.float32)) + elif imbalance == "class_weight_balanced": + class_weights = _compute_balanced_class_weights(y_train, num_classes) + + train_loader = DataLoader( + train_ds, + batch_size=batch_size, + shuffle=(sampler is None), + sampler=sampler, + num_workers=num_workers, + persistent_workers=persistent_workers, + ) + val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, persistent_workers=persistent_workers) + test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, persistent_workers=persistent_workers) + return train_loader, val_loader, test_loader, class_weights + + +def _majority_baseline_metrics(y_train: np.ndarray, y_test: np.ndarray, num_classes: int) -> Dict[str, Any]: + counts = np.bincount(y_train, minlength=num_classes) + maj = int(np.argmax(counts)) + cm = np.zeros((num_classes, num_classes), dtype=np.int64) + y_pred = np.full_like(y_test, fill_value=maj) + _confusion_update(cm, y_test, y_pred) + return {"majority_class": maj, "train_counts": counts.tolist(), "metrics": _metrics_from_cm(cm)} + + +# ------------------------- +# strict builder (double bulletproof) +# ------------------------- +def _get_builder(model_name: str): + from pyhazards.models.registry import get_model_config # type: ignore + cfg = get_model_config(model_name) + if not cfg: + raise ValueError(f"Model '{model_name}' not found in registry.") + return cfg["builder"] + + +def _parse_unexpected_kwargs(msg: str) -> List[str]: + # pattern: "Unexpected kwargs for wildfire_fpa_mlp: ['num_layers']" + m = re.search(r"Unexpected kwargs.*?:\s*(\[[^\]]*\])", msg) + if m: + try: + xs = ast.literal_eval(m.group(1)) + return [str(x) for x in xs] + except Exception: + return [] + # pattern: "got an unexpected keyword argument 'task'" + m2 = re.search(r"unexpected keyword argument\s+'([^']+)'", msg) + if m2: + return [m2.group(1)] + return [] + + +def _build_with_stripping(builder, kwargs: Dict[str, Any]) -> Tuple[nn.Module, Dict[str, 
def _build_with_stripping(builder, kwargs: Dict[str, Any]) -> Tuple[nn.Module, Dict[str, Any]]:
    """
    Keep retrying by stripping kwargs that the builder explicitly rejects.
    This keeps the script robust to builder API changes such as num_layers vs depth.
    """
    cur = dict(kwargs)
    for _ in range(20):
        try:
            model = builder(**cur)
            return model, cur
        except TypeError as e:
            msg = str(e)
            bad = _parse_unexpected_kwargs(msg)
            if bad:
                for k in bad:
                    cur.pop(k, None)
                continue
            raise
    raise RuntimeError(f"Builder could not be satisfied after stripping retries. Last kwargs={cur}")


def build_wildfire_fpa_mlp_strict(
    model_name: str,
    task: str,
    in_dim: int,
    out_dim: int,
    hidden_dim: int,
    dropout: float,
    depth: int,
) -> Tuple[nn.Module, Dict[str, Any]]:
    builder = _get_builder(model_name)

    # Start from the full candidate kwarg set (in_dim/out_dim/hidden_dim/dropout/depth, plus task),
    # then strip anything the builder rejects (task, or depth naming variations).
    base = {
        "task": task,
        "in_dim": in_dim,
        "out_dim": out_dim,
        "hidden_dim": hidden_dim,
        "dropout": dropout,
        "depth": depth,
        # Also try a couple of synonyms in case a future builder renames depth;
        # if these are rejected, they get stripped automatically.
        "num_layers": depth,
        "layers": depth,
    }

    model, used = _build_with_stripping(builder, base)

    # record only keys that survived
    used_clean = dict(used)
    return model, used_clean


# -------------------------
# losses
# -------------------------
class FocalLoss(nn.Module):
    def __init__(self, gamma: float = 2.0, weight: Optional[torch.Tensor] = None, reduction: str = "mean"):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        logp = torch.log_softmax(logits, dim=1)  # [B,C]
        p = torch.exp(logp)
        tgt = target.long()
        logp_t = logp.gather(1, tgt.unsqueeze(1)).squeeze(1)  # [B]
        p_t = p.gather(1, tgt.unsqueeze(1)).squeeze(1)  # [B]
        loss = -(1.0 - p_t) ** self.gamma * logp_t
        if self.weight is not None:
            w = self.weight.to(loss.device).gather(0, tgt)
            loss = loss * w
        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "none":
            return loss
        return loss.mean()


# -------------------------
# lightning module
# -------------------------
class LitClassifier(pl.LightningModule):
    def __init__(
        self,
        model: nn.Module,
        num_classes: int,
        lr: float,
        weight_decay: float,
        loss_name: str,
        focal_gamma: float,
        class_weights: Optional[torch.Tensor],
        label_smoothing: float,
    ):
        super().__init__()
        self.model = model
        self.num_classes = num_classes
        self.lr = lr
        self.weight_decay = weight_decay

        self.loss_name = loss_name
        self.focal_gamma = focal_gamma
        self.class_weights = class_weights
        self.label_smoothing = label_smoothing

        self._val_cm: Optional[np.ndarray] = None
        self._test_cm: Optional[np.ndarray] = None

        if loss_name == "focal":
            self.criterion = FocalLoss(gamma=focal_gamma, weight=class_weights)
        else:
            # CE
            kwargs = {}
            if class_weights is not None:
                kwargs["weight"] = class_weights
            # label_smoothing supported in modern torch; if not, ignore
            try:
                kwargs["label_smoothing"] = float(label_smoothing)
                self.criterion = nn.CrossEntropyLoss(**kwargs)
            except TypeError:
                kwargs.pop("label_smoothing", None)
                self.criterion = nn.CrossEntropyLoss(**kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
    def training_step(self, batch, batch_idx: int):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        self.log("train/loss", loss, prog_bar=True)
        return loss

    def on_validation_epoch_start(self):
        self._val_cm = np.zeros((self.num_classes, self.num_classes), dtype=np.int64)

    def validation_step(self, batch, batch_idx: int):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y).float().mean()
        self.log("val/loss", loss, prog_bar=True)
        self.log("val/acc", acc, prog_bar=True)

        assert self._val_cm is not None
        _confusion_update(self._val_cm, y.detach().cpu().numpy(), preds.detach().cpu().numpy())

    def on_validation_epoch_end(self):
        if self._val_cm is None:
            return
        m = _metrics_from_cm(self._val_cm)
        self.log("val/macro_f1", float(m["macro_f1"]), prog_bar=True)

    def on_test_epoch_start(self):
        self._test_cm = np.zeros((self.num_classes, self.num_classes), dtype=np.int64)

    def test_step(self, batch, batch_idx: int):
        x, y = batch
        logits = self(x)
        preds = torch.argmax(logits, dim=1)
        assert self._test_cm is not None
        _confusion_update(self._test_cm, y.detach().cpu().numpy(), preds.detach().cpu().numpy())


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--data_path", required=True)
    ap.add_argument("--model_name", default="wildfire_fpa_mlp")
    ap.add_argument("--epochs", type=int, default=50)
    ap.add_argument("--batch_size", type=int, default=256)
    ap.add_argument("--num_workers", type=int, default=4)
    ap.add_argument("--seed", type=int, default=1337)

    ap.add_argument("--region", choices=["US", "CA"], default="US")
    # choices must match CauseMode in pyhazards/datasets/wildfire_fpa_fod.py
    ap.add_argument("--cause_mode", choices=["paper5", "keep_all"], default="paper5")
    ap.add_argument("--normalize", action="store_true")

    ap.add_argument("--hidden_dim", type=int, default=64)
    ap.add_argument("--depth", type=int, default=2)
    ap.add_argument("--num_layers", type=int, default=None, help="Alias for --depth")
    ap.add_argument("--dropout", type=float, default=0.1)

    ap.add_argument("--lr", type=float, default=3e-4)
    ap.add_argument("--weight_decay", type=float, default=1e-4)

    ap.add_argument("--imbalance", choices=["none", "class_weight_balanced", "weighted_sampler"], default="class_weight_balanced")

    ap.add_argument("--loss", choices=["ce", "focal"], default="ce")
    ap.add_argument("--focal_gamma", type=float, default=2.0)
    ap.add_argument("--label_smoothing", type=float, default=0.0)

    ap.add_argument("--limit_train_batches", type=float, default=1.0)
    ap.add_argument("--limit_val_batches", type=float, default=1.0)
    ap.add_argument("--limit_test_batches", type=float, default=1.0)
    ap.add_argument("--log_every_n_steps", type=int, default=50)

    ap.add_argument("--accelerator", default="auto")
    ap.add_argument("--devices", default="auto")
    ap.add_argument("--precision", default="32")

    ap.add_argument("--out_root", default="outputs/train_wildfire_fpa_cause")
    ap.add_argument("--resume_from", default=None)

    args = ap.parse_args()
    if args.num_layers is not None:
        args.depth = int(args.num_layers)

    run_dir = os.path.join(args.out_root, _now_tag())
    ckpt_dir = os.path.join(run_dir, "checkpoints")
    _mkdir(ckpt_dir)
    _write_json(os.path.join(run_dir, "config.json"), vars(args))
    _write_text(os.path.join(run_dir, "command.txt"), " ".join(sys.argv) + "\n")

    try:
        pl.seed_everything(args.seed, workers=True)

        from pyhazards.datasets import FPAFODWildfireTabular  # type: ignore
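        # Contract note (per the dataset docs): each split is a DataSplit whose
        # .inputs is a float tensor of shape [N, input_dim] and .targets an
        # integer class-id tensor of shape [N]. For an offline dry run, micro=True
        # below swaps in the deterministic synthetic data instead of the sqlite.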
        b = FPAFODWildfireTabular(
            task="cause",
            region=args.region,
            cause_mode=args.cause_mode,
            data_path=args.data_path,
            micro=False,
            normalize=args.normalize,
            seed=args.seed,
        ).load()

        x_train, y_train = b.splits["train"].inputs, b.splits["train"].targets
        x_val, y_val = b.splits["val"].inputs, b.splits["val"].targets
        x_test, y_test = b.splits["test"].inputs, b.splits["test"].targets

        cls = b.label_spec.extra.get("classes")
        if cls is None:
            # fallback: numeric class ids if names not provided
            n = b.label_spec.num_targets
            if n is None:
                raise AttributeError("LabelSpec missing extra['classes'] and num_targets")
            cls = list(range(int(n)))
        classes = list(cls)
        num_classes = int(b.label_spec.num_targets)
        in_dim = int(x_train.shape[1])

        train_loader, val_loader, test_loader, class_weights = _make_loaders(
            x_train, y_train, x_val, y_val, x_test, y_test,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            imbalance=args.imbalance,
            num_classes=num_classes,
            seed=args.seed,
        )

        model, used_kwargs = build_wildfire_fpa_mlp_strict(
            model_name=args.model_name,
            task="classification",
            in_dim=in_dim,
            out_dim=num_classes,
            hidden_dim=args.hidden_dim,
            dropout=args.dropout,
            depth=args.depth,
        )
        _write_json(os.path.join(run_dir, "model_kwargs_used.json"), used_kwargs)

        lit = LitClassifier(
            model=model,
            num_classes=num_classes,
            lr=args.lr,
            weight_decay=args.weight_decay,
            loss_name=args.loss,
            focal_gamma=args.focal_gamma,
            class_weights=class_weights,
            label_smoothing=args.label_smoothing,
        )

        ckpt = ModelCheckpoint(
            dirpath=ckpt_dir,
            filename="best",
            monitor="val/macro_f1",
            mode="max",
            save_top_k=1,
            save_last=True,
        )
        logger = CSVLogger(save_dir=run_dir, name="csv")

        accel = _safe_accelerator(args.accelerator)
        devs = _safe_devices(args.devices)

        trainer = pl.Trainer(
            max_epochs=args.epochs,
            accelerator=accel,
            devices=devs,
            precision=args.precision,
            logger=logger,
            callbacks=[ckpt],
            limit_train_batches=args.limit_train_batches,
            limit_val_batches=args.limit_val_batches,
            limit_test_batches=args.limit_test_batches,
            log_every_n_steps=args.log_every_n_steps,
        )

        trainer.fit(lit, train_dataloaders=train_loader, val_dataloaders=val_loader, ckpt_path=args.resume_from)
        # fall back to current weights if no checkpoint was saved
        trainer.test(lit, dataloaders=test_loader, ckpt_path=ckpt.best_model_path or None)

        cm = lit._test_cm if lit._test_cm is not None else np.zeros((num_classes, num_classes), dtype=np.int64)
        m = _metrics_from_cm(cm)

        _write_json(os.path.join(run_dir, "confusion_matrix.json"), {"cm": cm.tolist(), "classes": classes})
        _plot_cm(cm, classes, os.path.join(run_dir, "confusion_matrix.png"), "Cause confusion matrix")

        torch.save(lit.model.state_dict(), os.path.join(run_dir, "model_final.pt"))

        y_counts = {classes[i]: int((y_train == i).sum()) for i in range(num_classes)}
        baseline = _majority_baseline_metrics(y_train, y_test, num_classes)

        metrics = {
            "task": "fpa_fod_cause",
            "config": vars(args),
            "git_sha": _git_sha(),
            "system": {"python": sys.version, "platform": platform.platform(), "torch": torch.__version__},
            "best_ckpt": ckpt.best_model_path,
            "last_ckpt": ckpt.last_model_path,
            "train_class_counts": y_counts,
            "test": {"metrics": m},
            "baseline_majority_test": baseline,
        }
        _write_json(os.path.join(run_dir, "metrics.json"), metrics)

        summary = [
            "# Paper reproduction summary — FPA-FOD Cause",
            "",
            "## Dataset",
            f"- train: {x_train.shape} val: {x_val.shape} test: {x_test.shape}",
            f"- classes: {classes}",
            f"- train class counts: {y_counts}",
            "",
            "## Model kwargs actually used (strict builder)",
            f"- {used_kwargs}",
            "",
            "## Test metrics (this run)",
            f"- accuracy: {m['accuracy']:.4f}",
            f"- macro_f1: {m['macro_f1']:.4f}",
            f"- macro_precision: {m['macro_precision']:.4f}",
            f"- macro_recall: {m['macro_recall']:.4f}",
            "",
            "## Majority baseline (test)",
            f"- accuracy: {baseline['metrics']['accuracy']:.4f}",
            f"- macro_f1: {baseline['metrics']['macro_f1']:.4f}",
            "",
            "## Artifacts",
            "- checkpoints/best.ckpt, checkpoints/last.ckpt",
            "- confusion_matrix.png",
            "- metrics.json",
        ]
        _write_text(os.path.join(run_dir, "paper_summary.md"), "\n".join(summary) + "\n")

        print(f"[OK] Outputs saved to: {run_dir}")

    except BaseException as e:
        _write_text(os.path.join(run_dir, "FAILURE.txt"), f"{type(e).__name__}: {e}\n")
        raise


if __name__ == "__main__":
    main()
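For a quick end-to-end check that does not require the raw FPA FOD sqlite, the same dataset can be exercised programmatically with the micro variant. A minimal sketch (it assumes the constructor defaults cover the arguments not passed here):

from pyhazards.datasets import FPAFODWildfireTabular

b = FPAFODWildfireTabular(task="cause", micro=True, seed=1337).load()
x, y = b.splits["train"].inputs, b.splits["train"].targets
print(tuple(x.shape), tuple(y.shape))  # (N, input_dim) and (N,)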
"", + "## Dataset", + f"- train: {x_train.shape} val: {x_val.shape} test: {x_test.shape}", + f"- classes: {classes}", + f"- train class counts: {y_counts}", + "", + "## Model kwargs actually used (strict builder)", + f"- {used_kwargs}", + "", + "## Your test metrics", + f"- accuracy: {m['accuracy']:.4f}", + f"- macro_f1: {m['macro_f1']:.4f}", + f"- macro_precision: {m['macro_precision']:.4f}", + f"- macro_recall: {m['macro_recall']:.4f}", + "", + "## Majority baseline (test)", + f"- accuracy: {baseline['metrics']['accuracy']:.4f}", + f"- macro_f1: {baseline['metrics']['macro_f1']:.4f}", + "", + "## Artifacts", + "- checkpoints/best.ckpt, checkpoints/last.ckpt", + "- confusion_matrix.png", + "- metrics.json", + ] + _write_text(os.path.join(run_dir, "paper_summary.md"), "\n".join(summary) + "\n") + + print(f"[OK] Outputs saved to: {run_dir}") + + except BaseException as e: + _write_text(os.path.join(run_dir, "FAILURE.txt"), f"{type(e).__name__}: {e}\n") + raise + + +if __name__ == "__main__": + main() diff --git a/scripts/train_wildfire_fpa_size.py b/scripts/train_wildfire_fpa_size.py new file mode 100644 index 00000000..f6ee33e3 --- /dev/null +++ b/scripts/train_wildfire_fpa_size.py @@ -0,0 +1,581 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import ast +import json +import os +import platform +import re +import subprocess +import sys +from datetime import datetime +from typing import Any, Dict, Optional, Tuple, List + +import numpy as np + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +import torch + +def _to_float_tensor(x): + """Convert numpy array OR torch tensor -> torch.float32 tensor.""" + import torch + if isinstance(x, torch.Tensor): + return x.detach().cpu().float() + return torch.from_numpy(x).float() + +from torch import nn +from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler + +try: + import lightning.pytorch as pl + from lightning.pytorch.callbacks import ModelCheckpoint + from lightning.pytorch.loggers import CSVLogger +except Exception: + import pytorch_lightning as pl # type: ignore + from pytorch_lightning.callbacks import ModelCheckpoint # type: ignore + from pytorch_lightning.loggers import CSVLogger # type: ignore + + +def _now_tag() -> str: + return datetime.now().strftime("%Y%m%d_%H%M%S") + + +def _mkdir(p: str) -> None: + os.makedirs(p, exist_ok=True) + + +def _git_sha() -> Optional[str]: + try: + return subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL).decode().strip() + except Exception: + return None + + +def _write_json(path: str, obj: Any) -> None: + with open(path, "w", encoding="utf-8") as f: + json.dump(obj, f, indent=2) + + +def _write_text(path: str, s: str) -> None: + with open(path, "w", encoding="utf-8") as f: + f.write(s) + + +def _safe_accelerator(requested: str) -> str: + req = (requested or "auto").lower() + if req == "cpu": + return "cpu" + if req == "cuda": + return "cuda" if torch.cuda.is_available() else "cpu" + if req == "mps": + ok = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + return "mps" if ok else "cpu" + if torch.cuda.is_available(): + return "cuda" + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + return "cpu" + + +def _safe_devices(requested: str) -> Any: + req = (requested or "auto").lower() + if req == "auto": + return 1 + try: + return int(req) + except Exception: + return req + + +def _confusion_update(cm: np.ndarray, y_true: np.ndarray, 
y_pred: np.ndarray) -> None: + for t, p in zip(y_true, y_pred): + cm[int(t), int(p)] += 1 + + +def _metrics_from_cm(cm: np.ndarray) -> Dict[str, Any]: + eps = 1e-12 + tp = np.diag(cm).astype(np.float64) + support = cm.sum(axis=1).astype(np.float64) + pred_support = cm.sum(axis=0).astype(np.float64) + + precision = tp / (pred_support + eps) + recall = tp / (support + eps) + f1 = 2 * precision * recall / (precision + recall + eps) + + return { + "accuracy": float(tp.sum() / (cm.sum() + eps)), + "macro_precision": float(np.nanmean(precision)), + "macro_recall": float(np.nanmean(recall)), + "macro_f1": float(np.nanmean(f1)), + "per_class": [ + { + "precision": float(precision[i]), + "recall": float(recall[i]), + "f1": float(f1[i]), + "support": int(support[i]), + } + for i in range(cm.shape[0]) + ], + } + + +def _plot_cm(cm: np.ndarray, classes: List[str], out_png: str, title: str) -> None: + fig = plt.figure(figsize=(8, 7)) + ax = fig.add_subplot(111) + im = ax.imshow(cm, interpolation="nearest") + fig.colorbar(im, ax=ax) + ax.set_title(title) + ax.set_xlabel("Predicted") + ax.set_ylabel("True") + ax.set_xticks(np.arange(len(classes))) + ax.set_yticks(np.arange(len(classes))) + ax.set_xticklabels(classes, rotation=45, ha="right") + ax.set_yticklabels(classes) + + thresh = cm.max() * 0.5 if cm.size else 0 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text( + j, i, str(int(cm[i, j])), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black", + fontsize=7 + ) + fig.tight_layout() + fig.savefig(out_png, dpi=200) + plt.close(fig) + + +def _compute_balanced_class_weights(y: np.ndarray, num_classes: int) -> torch.Tensor: + counts = np.bincount(y, minlength=num_classes).astype(np.float64) + counts[counts == 0] = 1.0 + w = (counts.sum() / (num_classes * counts)).astype(np.float32) + return torch.from_numpy(w) + + +def _make_loaders( + x_train: np.ndarray, y_train: np.ndarray, + x_val: np.ndarray, y_val: np.ndarray, + x_test: np.ndarray, y_test: np.ndarray, + batch_size: int, num_workers: int, + imbalance: str, + num_classes: int, + seed: int, +): + tx_train = _to_float_tensor(x_train) + ty_train = (_to_float_tensor(y_train).long()) + tx_val = _to_float_tensor(x_val) + ty_val = (_to_float_tensor(y_val).long()) + tx_test = _to_float_tensor(x_test) + ty_test = (_to_float_tensor(y_test).long()) + + train_ds = TensorDataset(tx_train, ty_train) + val_ds = TensorDataset(tx_val, ty_val) + test_ds = TensorDataset(tx_test, ty_test) + + persistent_workers = bool(num_workers and num_workers > 0) + + class_weights = None + sampler = None + + if imbalance == "weighted_sampler": + counts = np.bincount(y_train, minlength=num_classes).astype(np.float64) + counts[counts == 0] = 1.0 + inv = 1.0 / counts + w_per_sample = inv[y_train] + gen = torch.Generator().manual_seed(seed) + sampler = WeightedRandomSampler( + weights=torch.from_numpy(w_per_sample).double(), + num_samples=len(w_per_sample), + replacement=True, + generator=gen, + ) + class_weights = torch.from_numpy((inv / inv.mean()).astype(np.float32)) + elif imbalance == "class_weight_balanced": + class_weights = _compute_balanced_class_weights(y_train, num_classes) + + train_loader = DataLoader( + train_ds, + batch_size=batch_size, + shuffle=(sampler is None), + sampler=sampler, + num_workers=num_workers, + persistent_workers=persistent_workers, + ) + val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, persistent_workers=persistent_workers) + test_loader = DataLoader(test_ds, 
                             batch_size=batch_size, shuffle=False, num_workers=num_workers, persistent_workers=persistent_workers)
    return train_loader, val_loader, test_loader, class_weights


def _majority_baseline_metrics(y_train: np.ndarray, y_test: np.ndarray, num_classes: int) -> Dict[str, Any]:
    counts = np.bincount(y_train, minlength=num_classes)
    maj = int(np.argmax(counts))
    cm = np.zeros((num_classes, num_classes), dtype=np.int64)
    y_pred = np.full_like(y_test, fill_value=maj)
    _confusion_update(cm, y_test, y_pred)
    return {"majority_class": maj, "train_counts": counts.tolist(), "metrics": _metrics_from_cm(cm)}


# -------------------------
# strict builder (defensive kwarg handling)
# -------------------------
def _get_builder(model_name: str):
    from pyhazards.models.registry import get_model_config  # type: ignore
    cfg = get_model_config(model_name)
    if not cfg:
        raise ValueError(f"Model '{model_name}' not found in registry.")
    return cfg["builder"]


def _parse_unexpected_kwargs(msg: str) -> List[str]:
    m = re.search(r"Unexpected kwargs.*?:\s*(\[[^\]]*\])", msg)
    if m:
        try:
            xs = ast.literal_eval(m.group(1))
            return [str(x) for x in xs]
        except Exception:
            return []
    m2 = re.search(r"unexpected keyword argument\s+'([^']+)'", msg)
    if m2:
        return [m2.group(1)]
    return []


def _build_with_stripping(builder, kwargs: Dict[str, Any]) -> Tuple[nn.Module, Dict[str, Any]]:
    cur = dict(kwargs)
    for _ in range(20):
        try:
            model = builder(**cur)
            return model, cur
        except TypeError as e:
            msg = str(e)
            bad = _parse_unexpected_kwargs(msg)
            if bad:
                for k in bad:
                    cur.pop(k, None)
                continue
            raise
    raise RuntimeError(f"Builder could not be satisfied after stripping retries. Last kwargs={cur}")


def build_wildfire_fpa_mlp_strict(
    model_name: str,
    task: str,
    in_dim: int,
    out_dim: int,
    hidden_dim: int,
    dropout: float,
    depth: int,
) -> Tuple[nn.Module, Dict[str, Any]]:
    builder = _get_builder(model_name)
    base = {
        "task": task,
        "in_dim": in_dim,
        "out_dim": out_dim,
        "hidden_dim": hidden_dim,
        "dropout": dropout,
        "depth": depth,
        "num_layers": depth,
        "layers": depth,
    }
    model, used = _build_with_stripping(builder, base)
    return model, dict(used)


# -------------------------
# losses
# -------------------------
class FocalLoss(nn.Module):
    def __init__(self, gamma: float = 2.0, weight: Optional[torch.Tensor] = None, reduction: str = "mean"):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        logp = torch.log_softmax(logits, dim=1)
        p = torch.exp(logp)
        tgt = target.long()
        logp_t = logp.gather(1, tgt.unsqueeze(1)).squeeze(1)
        p_t = p.gather(1, tgt.unsqueeze(1)).squeeze(1)
        loss = -(1.0 - p_t) ** self.gamma * logp_t
        if self.weight is not None:
            w = self.weight.to(loss.device).gather(0, tgt)
            loss = loss * w
        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "none":
            return loss
        return loss.mean()


# -------------------------
# lightning module
# -------------------------
class LitClassifier(pl.LightningModule):
    def __init__(
        self,
        model: nn.Module,
        num_classes: int,
        lr: float,
        weight_decay: float,
        loss_name: str,
        focal_gamma: float,
        class_weights: Optional[torch.Tensor],
        label_smoothing: float,
    ):
        super().__init__()
        self.model = model
        self.num_classes = num_classes
        self.lr = lr
        self.weight_decay = weight_decay

        self._val_cm:
Optional[np.ndarray] = None + self._test_cm: Optional[np.ndarray] = None + + if loss_name == "focal": + self.criterion = FocalLoss(gamma=focal_gamma, weight=class_weights) + else: + kwargs = {} + if class_weights is not None: + kwargs["weight"] = class_weights + try: + kwargs["label_smoothing"] = float(label_smoothing) + self.criterion = nn.CrossEntropyLoss(**kwargs) + except TypeError: + kwargs.pop("label_smoothing", None) + self.criterion = nn.CrossEntropyLoss(**kwargs) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.model(x) + + def configure_optimizers(self): + return torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.weight_decay) + + def training_step(self, batch, batch_idx: int): + x, y = batch + logits = self(x) + loss = self.criterion(logits, y) + self.log("train/loss", loss, prog_bar=True) + return loss + + def on_validation_epoch_start(self): + self._val_cm = np.zeros((self.num_classes, self.num_classes), dtype=np.int64) + + def validation_step(self, batch, batch_idx: int): + x, y = batch + logits = self(x) + loss = self.criterion(logits, y) + preds = torch.argmax(logits, dim=1) + acc = (preds == y).float().mean() + self.log("val/loss", loss, prog_bar=True) + self.log("val/acc", acc, prog_bar=True) + + assert self._val_cm is not None + _confusion_update(self._val_cm, y.detach().cpu().numpy(), preds.detach().cpu().numpy()) + + def on_validation_epoch_end(self): + if self._val_cm is None: + return + m = _metrics_from_cm(self._val_cm) + self.log("val/macro_f1", float(m["macro_f1"]), prog_bar=True) + + def on_test_epoch_start(self): + self._test_cm = np.zeros((self.num_classes, self.num_classes), dtype=np.int64) + + def test_step(self, batch, batch_idx: int): + x, y = batch + logits = self(x) + preds = torch.argmax(logits, dim=1) + assert self._test_cm is not None + _confusion_update(self._test_cm, y.detach().cpu().numpy(), preds.detach().cpu().numpy()) + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--data_path", required=True) + ap.add_argument("--model_name", default="wildfire_fpa_mlp") + ap.add_argument("--epochs", type=int, default=50) + ap.add_argument("--batch_size", type=int, default=256) + ap.add_argument("--num_workers", type=int, default=4) + ap.add_argument("--seed", type=int, default=1337) + + ap.add_argument("--region", choices=["US", "CA"], default="US") + ap.add_argument("--normalize", action="store_true") + + ap.add_argument("--hidden_dim", type=int, default=64) + ap.add_argument("--depth", type=int, default=2) + ap.add_argument("--num_layers", type=int, default=None, help="Alias for --depth") + ap.add_argument("--dropout", type=float, default=0.1) + + ap.add_argument("--lr", type=float, default=3e-4) + ap.add_argument("--weight_decay", type=float, default=1e-4) + + ap.add_argument("--imbalance", choices=["none", "class_weight_balanced", "weighted_sampler"], default="class_weight_balanced") + + ap.add_argument("--loss", choices=["ce", "focal"], default="ce") + ap.add_argument("--focal_gamma", type=float, default=2.0) + ap.add_argument("--label_smoothing", type=float, default=0.0) + + ap.add_argument("--limit_train_batches", type=float, default=1.0) + ap.add_argument("--limit_val_batches", type=float, default=1.0) + ap.add_argument("--limit_test_batches", type=float, default=1.0) + ap.add_argument("--log_every_n_steps", type=int, default=50) + + ap.add_argument("--accelerator", default="auto") + ap.add_argument("--devices", default="auto") + ap.add_argument("--precision", default="32") + + 
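    # Note: the requested accelerator/devices are resolved by _safe_accelerator /
    # _safe_devices above: unavailable backends (cuda/mps) fall back to cpu, and
    # "auto" maps to a single device, so machines without GPUs still run.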
ap.add_argument("--out_root", default="outputs/train_wildfire_fpa_size") + ap.add_argument("--resume_from", default=None) + + args = ap.parse_args() + if args.num_layers is not None: + args.depth = int(args.num_layers) + + run_dir = os.path.join(args.out_root, _now_tag()) + ckpt_dir = os.path.join(run_dir, "checkpoints") + _mkdir(ckpt_dir) + _write_json(os.path.join(run_dir, "config.json"), vars(args)) + _write_text(os.path.join(run_dir, "command.txt"), " ".join(sys.argv) + "\n") + + try: + pl.seed_everything(args.seed, workers=True) + + from pyhazards.datasets import FPAFODWildfireTabular # type: ignore + + b = FPAFODWildfireTabular( + task="size", + region=args.region, + data_path=args.data_path, + micro=False, + normalize=args.normalize, + seed=args.seed, + ).load() + + x_train, y_train = b.splits["train"].inputs, b.splits["train"].targets + x_val, y_val = b.splits["val"].inputs, b.splits["val"].targets + x_test, y_test = b.splits["test"].inputs, b.splits["test"].targets + + cls = b.label_spec.extra.get("classes") + if cls is None: + # fallback: numeric class ids if names not provided + n = b.label_spec.num_targets + if n is None: raise AttributeError("LabelSpec missing extra['classes'] and num_targets") + cls = list(range(int(n))) + classes = list(cls) + num_classes = int(b.label_spec.num_targets) + in_dim = int(x_train.shape[1]) + + train_loader, val_loader, test_loader, class_weights = _make_loaders( + x_train, y_train, x_val, y_val, x_test, y_test, + batch_size=args.batch_size, + num_workers=args.num_workers, + imbalance=args.imbalance, + num_classes=num_classes, + seed=args.seed, + ) + + model, used_kwargs = build_wildfire_fpa_mlp_strict( + model_name=args.model_name, + task="classification", + in_dim=in_dim, + out_dim=num_classes, + hidden_dim=args.hidden_dim, + dropout=args.dropout, + depth=args.depth, + ) + _write_json(os.path.join(run_dir, "model_kwargs_used.json"), used_kwargs) + + lit = LitClassifier( + model=model, + num_classes=num_classes, + lr=args.lr, + weight_decay=args.weight_decay, + loss_name=args.loss, + focal_gamma=args.focal_gamma, + class_weights=class_weights, + label_smoothing=args.label_smoothing, + ) + + ckpt = ModelCheckpoint( + dirpath=ckpt_dir, + filename="best", + monitor="val/macro_f1", + mode="max", + save_top_k=1, + save_last=True, + ) + logger = CSVLogger(save_dir=run_dir, name="csv") + + accel = _safe_accelerator(args.accelerator) + devs = _safe_devices(args.devices) + + trainer = pl.Trainer( + max_epochs=args.epochs, + accelerator=_safe_accelerator(args.accelerator), + devices=devs, + precision=args.precision, + logger=logger, + callbacks=[ckpt], + limit_train_batches=args.limit_train_batches, + limit_val_batches=args.limit_val_batches, + limit_test_batches=args.limit_test_batches, + log_every_n_steps=args.log_every_n_steps, + ) + + trainer.fit(lit, train_dataloaders=train_loader, val_dataloaders=val_loader, ckpt_path=args.resume_from) + trainer.test(lit, dataloaders=test_loader, ckpt_path=ckpt.best_model_path) + + cm = lit._test_cm if lit._test_cm is not None else np.zeros((num_classes, num_classes), dtype=np.int64) + m = _metrics_from_cm(cm) + + _write_json(os.path.join(run_dir, "confusion_matrix.json"), {"cm": cm.tolist(), "classes": classes}) + _plot_cm(cm, classes, os.path.join(run_dir, "confusion_matrix.png"), "Size confusion matrix") + + torch.save(lit.model.state_dict(), os.path.join(run_dir, "model_final.pt")) + + baseline = _majority_baseline_metrics(y_train, y_test, num_classes) + + metrics = { + "task": "fpa_fod_size", + "config": 
diff --git a/tests/test_wildfire_fpa_datasets.py b/tests/test_wildfire_fpa_datasets.py
new file mode 100644
index 00000000..6bdc7174
--- /dev/null
+++ b/tests/test_wildfire_fpa_datasets.py
@@ -0,0 +1,95 @@
import numpy as np
import torch

from pyhazards.datasets import load_dataset


def _to_bundle(ds):
    if hasattr(ds, "load") and callable(getattr(ds, "load")):
        return ds.load()
    if hasattr(ds, "_load") and callable(getattr(ds, "_load")):
        return ds._load()
    raise TypeError(
        f"{ds.__class__.__name__} does not expose load() or _load(). "
        "Cannot obtain a DataBundle."
    )


def _as_tensor(a):
    if isinstance(a, torch.Tensor):
        return a
    if isinstance(a, np.ndarray):
        return torch.from_numpy(a)
    # allow python lists
    return torch.tensor(a)


def _split_xy(split):
    """
    Extract x,y from:
      - dict: {"x": ..., "y": ...}
      - object with attributes .x/.y etc.
      - tuple/list (x,y)
    """
    if isinstance(split, dict):
        # support common key variants
        for xk in ("x", "X", "inputs", "features", "data"):
            for yk in ("y", "Y", "targets", "labels"):
                if xk in split and yk in split:
                    return split[xk], split[yk]
        raise KeyError(f"Split dict missing x/y keys.
Keys={sorted(split.keys())}") + + for x_name in ("x", "X", "inputs", "features", "data"): + for y_name in ("y", "Y", "targets", "labels"): + if hasattr(split, x_name) and hasattr(split, y_name): + return getattr(split, x_name), getattr(split, y_name) + + if isinstance(split, (tuple, list)) and len(split) == 2: + return split[0], split[1] + + raise TypeError(f"Unsupported split type for extracting (x,y): {type(split)}") + + +def test_dataset_tabular_shapes(): + ds = load_dataset("wildfire_fpa_fod_tabular", micro=True, seed=1337) + bundle = _to_bundle(ds) + + assert hasattr(bundle, "splits"), "Expected DataBundle.splits" + assert isinstance(bundle.splits, dict) + assert "train" in bundle.splits + + x, y = _split_xy(bundle.splits["train"]) + x = _as_tensor(x).float() + y = _as_tensor(y).long() + + # Contract: tabular classification + assert x.ndim == 2 # [N, D] + assert x.dtype == torch.float32 + assert y.ndim == 1 # [N] + assert y.dtype in (torch.int64, torch.long) + + assert x.shape[0] == y.shape[0] + assert x.shape[1] > 0 + + +def test_dataset_weekly_shapes(): + ds = load_dataset("wildfire_fpa_fod_weekly", micro=True, seed=1337, region="US") + bundle = _to_bundle(ds) + + assert hasattr(bundle, "splits"), "Expected DataBundle.splits" + assert isinstance(bundle.splits, dict) + assert "train" in bundle.splits + + x, y = _split_xy(bundle.splits["train"]) + x = _as_tensor(x).float() + y = _as_tensor(y).float() + + # Contract: weekly sequences + assert x.ndim == 3 # [N, T, D] + assert x.dtype == torch.float32 + assert y.dtype == torch.float32 + assert y.ndim in (1, 2) # [N] or [N, O] + + assert x.shape[0] == y.shape[0] + assert x.shape[1] > 0 # T + assert x.shape[2] > 0 # D diff --git a/tests/test_wildfire_fpa_models.py b/tests/test_wildfire_fpa_models.py new file mode 100644 index 00000000..7e1efd34 --- /dev/null +++ b/tests/test_wildfire_fpa_models.py @@ -0,0 +1,33 @@ +import torch + +from pyhazards.models import build_model + + +def test_build_model_wildfire_fpa_mlp_forward(): + B, D, C = 8, 16, 5 + model = build_model( + name="wildfire_fpa_mlp", + task="classification", + in_dim=D, + out_dim=C, + ) + x = torch.randn(B, D) + y = model(x) + assert y.shape == (B, C) + + +def test_build_model_wildfire_fpa_lstm_forward(): + B, T, D, O = 4, 12, 10, 1 + model = build_model( + name="wildfire_fpa_lstm", + task="forecasting", + input_dim=D, + output_dim=O, + hidden_dim=32, + num_layers=1, + dropout=0.0, + lookback=T, + ) + x = torch.randn(B, T, D) + y = model(x) + assert y.shape == (B, O) diff --git a/tests/test_wildfire_fpa_trainer_smoke.py b/tests/test_wildfire_fpa_trainer_smoke.py new file mode 100644 index 00000000..dfa4ed19 --- /dev/null +++ b/tests/test_wildfire_fpa_trainer_smoke.py @@ -0,0 +1,57 @@ +import torch + +from pyhazards.datasets import load_dataset +from pyhazards.datasets.base import DataSplit +from pyhazards.engine import Trainer +from pyhazards.models import build_model + + +def test_trainer_fit_smoke(): + # Load dataset via registry; micro=True must not require any real files + ds = load_dataset("wildfire_fpa_fod_tabular", micro=True, seed=1337) + + # Support both patterns: registry may return Dataset instance or DataBundle directly + bundle = ds.load() if hasattr(ds, "load") else ds + + train = bundle.get_split("train") + x_train = train.inputs + y_train = train.targets + + # Enforce torch tensors for Trainer determinism + if not torch.is_tensor(x_train): + x_train = torch.as_tensor(x_train, dtype=torch.float32) + else: + x_train = x_train.to(dtype=torch.float32) + + if not 
torch.is_tensor(y_train): + y_train = torch.as_tensor(y_train, dtype=torch.long) + else: + y_train = y_train.to(dtype=torch.long) + + # Put back into bundle as a proper DataSplit (NOT a dict) + bundle.splits = dict(bundle.splits) + bundle.splits["train"] = DataSplit(inputs=x_train, targets=y_train, metadata=getattr(train, "metadata", {})) + + in_dim = int(x_train.shape[1]) + out_dim = int(y_train.max().item() + 1) + + model = build_model( + name="wildfire_fpa_mlp", + task="classification", + in_dim=in_dim, + out_dim=out_dim, + ) + + trainer = Trainer(model=model, mixed_precision=False) + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + loss_fn = torch.nn.CrossEntropyLoss() + + # Smoke: 1 epoch, no crash + trainer.fit( + bundle, + optimizer=optimizer, + loss_fn=loss_fn, + max_epochs=1, + batch_size=32, + num_workers=0, + )
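The weekly dataset documents the same micro contract, so an analogous forecasting smoke test is a natural companion. A sketch, assuming the LSTM builder kwargs from tests/test_wildfire_fpa_models.py and that Trainer.fit accepts a regression loss_fn the same way it accepts CrossEntropyLoss above:

def test_trainer_fit_smoke_weekly():
    ds = load_dataset("wildfire_fpa_fod_weekly", micro=True, seed=1337)
    bundle = ds.load() if hasattr(ds, "load") else ds

    train = bundle.get_split("train")
    x, y = train.inputs, train.targets  # [N, T, D] float, [N, O] (or [N]) float

    model = build_model(
        name="wildfire_fpa_lstm",
        task="forecasting",
        input_dim=int(x.shape[2]),
        output_dim=int(y.shape[1]) if y.ndim == 2 else 1,
        hidden_dim=32,
        num_layers=1,
        dropout=0.0,
        lookback=int(x.shape[1]),
    )

    trainer = Trainer(model=model, mixed_precision=False)
    trainer.fit(
        bundle,
        optimizer=torch.optim.Adam(model.parameters(), lr=1e-3),
        loss_fn=torch.nn.MSELoss(),
        max_epochs=1,
        batch_size=8,
        num_workers=0,
    )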