diff --git a/examples/defog/_sbm_isolated.py b/examples/defog/_sbm_isolated.py
new file mode 100644
index 000000000..c91d6d8d9
--- /dev/null
+++ b/examples/defog/_sbm_isolated.py
@@ -0,0 +1,67 @@
+import sys
+import json
+import numpy as np
+
+try:
+    import graph_tool.all as gt
+    from scipy.stats import chi2
+except ImportError:
+    print("ERROR: graph-tool and scipy required", file=sys.stderr)
+    sys.exit(1)
+
+def main():
+    data = json.loads(sys.stdin.read())
+    edges = data['edges']
+    p_intra = data['p_intra']
+    p_inter = data['p_inter']
+    strict = data['strict']
+    refinement_steps = data['refinement_steps']
+
+    gt_g = gt.Graph()
+    if edges:
+        gt_g.add_edge_list(edges)
+        
+    try:
+        state = gt.minimize_blockmodel_dl(gt_g)
+    except ValueError:
+        print("False")
+        return
+
+    # Refine using merge-split MCMC
+    for _ in range(refinement_steps):
+        state.multiflip_mcmc_sweep(beta=np.inf, niter=10)
+
+    b = gt.contiguous_map(state.get_blocks())
+    state = state.copy(b=b)
+    e = state.get_matrix()
+    n_blocks = state.get_nonempty_B()
+    node_counts = state.get_nr().get_array()[:n_blocks]
+    edge_counts = e.todense()[:n_blocks, :n_blocks]
+
+    if strict:
+        if (node_counts > 40).sum() > 0 or (node_counts < 20).sum() > 0 or n_blocks > 5 or n_blocks < 2:
+            print("False")
+            return
+
+    max_intra_edges = node_counts * (node_counts - 1)
+    est_p_intra = np.diagonal(edge_counts) / (max_intra_edges + 1e-6)
+
+    max_inter_edges = node_counts.reshape((-1, 1)) @ node_counts.reshape((1, -1))
+    np.fill_diagonal(edge_counts, 0)
+    est_p_inter = edge_counts / (max_inter_edges + 1e-6)
+
+    W_p_intra = (est_p_intra - p_intra) ** 2 / (est_p_intra * (1 - est_p_intra) + 1e-6)
+    W_p_inter = (est_p_inter - p_inter) ** 2 / (est_p_inter * (1 - est_p_inter) + 1e-6)
+
+    W = W_p_inter.copy()
+    np.fill_diagonal(W, W_p_intra)
+    p = 1 - chi2.cdf(abs(W), 1)
+    p = p.mean()
+    
+    if p > 0.9:
+        print("True")
+    else:
+        print("False")
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/defog/dataset_utils.py b/examples/defog/dataset_utils.py
new file mode 100644
index 000000000..5ca1c8f05
--- /dev/null
+++ b/examples/defog/dataset_utils.py
@@ -0,0 +1,201 @@
+import numpy as np
+import tensorlayerx as tlx
+from gammagl.data import Graph
+from datasets import (
+    PlanarGraphDataset, TreeGraphDataset, SBMGraphDataset, Comm20GraphDataset,
+    QM9Gen, GuacaMolDataset, ZINC250kGen, MOSESDataset, TLSGraphDataset
+)
+
+def create_synthetic_dataset(num_graphs=100, min_nodes=10, max_nodes=20,
+                              num_node_types=2, num_edge_types=2, p_edge=0.3):
+    r"""Create a synthetic graph dataset for testing."""
+    graphs = []
+    for _ in range(num_graphs):
+        n = np.random.randint(min_nodes, max_nodes + 1)
+        node_labels = np.random.randint(0, num_node_types, size=n)
+        x = np.eye(num_node_types, dtype=np.float32)[node_labels]
+        adj = (np.random.rand(n, n) < p_edge).astype(np.float32)
+        adj = np.triu(adj, k=1)
+        adj = adj + adj.T
+        np.fill_diagonal(adj, 0)
+        src, dst = np.nonzero(adj)
+        edge_index = np.stack([src, dst], axis=0).astype(np.int64)
+
+        if num_edge_types > 1:
+            edge_labels = np.random.randint(1, num_edge_types, size=len(src))
+            edge_attr = np.eye(num_edge_types, dtype=np.float32)[edge_labels]
+        else:
+            edge_attr = np.ones((len(src), 1), dtype=np.float32)
+
+        g = Graph(
+            x=tlx.convert_to_tensor(x),
+            edge_index=tlx.convert_to_tensor(edge_index),
+            edge_attr=tlx.convert_to_tensor(edge_attr),
+            y=tlx.convert_to_tensor(np.zeros(1, dtype=np.float32)),
+        )
+        graphs.append(g)
+    return graphs
+
+
+def compute_dataset_infos(dataset, num_node_types, num_edge_types):
+    r"""Compute dataset statistics dynamically."""
+    total_graphs = len(dataset)
+    node_counts = []
+    node_type_counts = np.zeros(num_node_types, dtype=np.float32)
+    edge_type_counts = np.zeros(num_edge_types, dtype=np.float32)
+
+    for idx in range(total_graphs):
+        g = dataset[idx]
+        x_val = g.x
+        x_np = x_val if isinstance(x_val, np.ndarray) else tlx.convert_to_numpy(x_val)
+        n = x_np.shape[0]
+        node_counts.append(n)
+
+        node_labels = np.argmax(x_np, axis=-1)
+        for label in node_labels:
+            node_type_counts[label] += 1
+
+        if getattr(g, 'edge_attr', None) is not None:
+            ea_val = g.edge_attr
+            ea_np = ea_val if isinstance(ea_val, np.ndarray) else tlx.convert_to_numpy(ea_val)
+            edge_type_sums = ea_np.sum(axis=0)
+            if edge_type_sums.shape[0] > 1:
+                edge_type_counts[1:] += edge_type_sums[1:]
+
+        total_pairs = n * (n - 1)
+        n_edges = g.edge_index.shape[1] if getattr(g, 'edge_index', None) is not None else 0
+        n_no_edge = total_pairs - n_edges
+        edge_type_counts[0] += n_no_edge
+
+        if (idx + 1) <= 3 or (idx + 1) % 50000 == 0 or (idx + 1) == total_graphs:
+            print(f"  Processing graph {idx + 1}/{total_graphs}...")
+
+    max_n = max(node_counts)
+    node_dist = np.zeros(max_n + 1, dtype=np.float32)
+    for nc in node_counts:
+        node_dist[nc] += 1
+    node_dist = node_dist / node_dist.sum()
+
+    return {
+        'output_dims': {
+            'X': num_node_types,
+            'E': num_edge_types,
+            'y': 0,
+        },
+        'node_types': node_type_counts,
+        'edge_types': edge_type_counts,
+        'max_n_nodes': max_n,
+        'node_dist': node_dist,
+    }
+
+
+class SpectreNodeTransform:
+    def __init__(self, num_node_types=2):
+        self.num_node_types = num_node_types
+
+    def __call__(self, data):
+        n = data.x.shape[0]
+        x_np = np.zeros((n, self.num_node_types), dtype=np.float32)
+        x_np[:, 0] = 1.0
+        data.x = tlx.convert_to_tensor(x_np, dtype=tlx.float32)
+        y_val = data.y if hasattr(data, 'y') and data.y is not None else np.zeros((1, 0), dtype=np.float32)
+        if isinstance(y_val, np.ndarray) and y_val.size == 0:
+            y_val = np.zeros((1, 0), dtype=np.float32)
+        data.y = tlx.convert_to_tensor(y_val, dtype=tlx.float32) if isinstance(y_val, np.ndarray) else y_val
+        return data
+
+
+class GenericNodeTransform:
+    def __call__(self, data):
+        y_val = data.y if hasattr(data, 'y') and data.y is not None else np.zeros((1, 0), dtype=np.float32)
+        data.y = tlx.convert_to_tensor(y_val, dtype=tlx.float32) if isinstance(y_val, np.ndarray) else y_val
+        return data
+
+
+def load_real_dataset(name, root=None, conditional=False, target='mu', remove_h=None):
+    if name == 'planar':
+        ds_cls = PlanarGraphDataset
+        num_node_types, num_edge_types = 2, 2
+        convert_spectre = True
+    elif name == 'tree':
+        ds_cls = TreeGraphDataset
+        num_node_types, num_edge_types = 2, 2
+        convert_spectre = True
+    elif name == 'sbm':
+        ds_cls = SBMGraphDataset
+        num_node_types, num_edge_types = 2, 2
+        convert_spectre = True
+    elif name == 'comm20':
+        ds_cls = Comm20GraphDataset
+        num_node_types, num_edge_types = 2, 2
+        convert_spectre = True
+    elif name == 'qm9':
+        ds_cls = QM9Gen
+        qm9_remove_h = True if remove_h is None else bool(remove_h)
+        num_node_types, num_edge_types = (4, 5) if qm9_remove_h else (5, 5)
+        convert_spectre = False
+    elif name == 'guacamol':
+        ds_cls = GuacaMolDataset
+        num_node_types, num_edge_types = 12, 5
+        convert_spectre = False
+    elif name == 'zinc250k':
+        ds_cls = ZINC250kGen
+        num_node_types, num_edge_types = 9, 4
+        convert_spectre = False
+    elif name == 'moses':
+        ds_cls = MOSESDataset
+        num_node_types, num_edge_types = 8, 5
+        convert_spectre = False
+    elif name == 'tls':
+        ds_cls = TLSGraphDataset
+        num_node_types, num_edge_types = 9, 2
+        convert_spectre = False
+    else:
+        raise ValueError(f"Unknown dataset: {name}")
+
+    kwargs = {'root': root} if root else {}
+    if name == 'qm9':
+        kwargs['remove_h'] = True if remove_h is None else remove_h
+        kwargs['aromatic'] = True
+    if name in ('qm9', 'tls') and conditional:
+        kwargs['conditional'] = True
+        kwargs['target'] = target
+
+    # Inject transform
+    transform = SpectreNodeTransform(num_node_types) if convert_spectre else GenericNodeTransform()
+    kwargs['transform'] = transform
+
+    train_ds = ds_cls(split='train', **kwargs)
+    val_ds = ds_cls(split='val', **kwargs)
+    test_ds = ds_cls(split='test', **kwargs)
+
+    print(f"Dataset {name}: train={len(train_ds)}, val={len(val_ds)}, test={len(test_ds)}")
+
+    if name in ('qm9', 'guacamol', 'zinc250k', 'moses'):
+        dataset_infos = {'output_dims': {'X': num_node_types, 'E': num_edge_types, 'y': 0}}
+        stats = ds_cls.STATS_REMOVE_H if name == 'qm9' and kwargs.get('remove_h') else getattr(ds_cls, 'STATS_WITH_H', getattr(ds_cls, 'STATS', None))
+        
+        if stats:
+            atom_decoder = stats.get('atom_names', getattr(ds_cls, 'ATOM_DECODER', []))
+            dataset_infos['node_types'] = stats['node_types'].astype(np.float32).copy()
+            dataset_infos['edge_types'] = stats['edge_types'].astype(np.float32).copy()
+            if 'n_nodes' in stats:
+                dataset_infos['node_dist'] = stats['n_nodes'].astype(np.float32).copy()
+                dataset_infos['max_n_nodes'] = int(stats.get('max_n_nodes', len(stats['n_nodes']) - 1))
+            dataset_infos['remove_h'] = kwargs.get('remove_h', True)
+            dataset_infos['valencies'] = list(stats['valencies'])
+            if 'valency_distribution' in stats:
+                dataset_infos['valency_distribution'] = stats['valency_distribution'].astype(np.float32).copy()
+            dataset_infos['atom_weights'] = dict(stats['atom_weights'])
+            dataset_infos['max_weight'] = float(stats['max_weight'])
+            dataset_infos['atom_decoder'] = list(atom_decoder)
+    else:
+        dataset_infos = compute_dataset_infos(train_ds, num_node_types, num_edge_types)
+
+    test_labels = None
+    if conditional:
+        test_labels_list = [tlx.convert_to_numpy(g.y).flatten() for g in test_ds]
+        if test_labels_list and len(test_labels_list[0]) > 0:
+            test_labels = tlx.convert_to_tensor(np.stack(test_labels_list, axis=0).astype(np.float32))
+
+    return train_ds, val_ds, test_ds, dataset_infos, num_node_types, num_edge_types, test_labels
diff --git a/examples/defog/datasets/__init__.py b/examples/defog/datasets/__init__.py
new file mode 100644
index 000000000..1393fd4d9
--- /dev/null
+++ b/examples/defog/datasets/__init__.py
@@ -0,0 +1,35 @@
+"""DeFoG-specific graph generation datasets.
+
+These datasets are designed for discrete flow matching graph generation
+and contain heavy preprocessing (dense adjacency matrices, atom/bond
+distribution statistics, eigenvalue caching, etc.) that is specific to
+the DeFoG training pipeline.
+
+They are intentionally kept in ``examples/defog/datasets/`` rather than
+``gammagl/datasets/`` to avoid polluting the core GammaGL package with
+optional heavy dependencies (RDKit, graph-tool, etc.).
+"""
+
+from .spectre_dataset import (
+    PlanarGraphDataset,
+    TreeGraphDataset,
+    SBMGraphDataset,
+    Comm20GraphDataset,
+)
+from .qm9_dataset import QM9Gen
+from .moses_dataset import MOSESDataset
+from .guacamol_dataset import GuacaMolDataset
+from .zinc250k_dataset import ZINC250kGen
+from .tls_dataset import TLSGraphDataset
+
+__all__ = [
+    'PlanarGraphDataset',
+    'TreeGraphDataset',
+    'SBMGraphDataset',
+    'Comm20GraphDataset',
+    'QM9Gen',
+    'MOSESDataset',
+    'GuacaMolDataset',
+    'ZINC250kGen',
+    'TLSGraphDataset',
+]
diff --git a/examples/defog/datasets/guacamol_dataset.py b/examples/defog/datasets/guacamol_dataset.py
new file mode 100644
index 000000000..e5b273755
--- /dev/null
+++ b/examples/defog/datasets/guacamol_dataset.py
@@ -0,0 +1,288 @@
+import os
+import os.path as osp
+import numpy as np
+import tensorlayerx as tlx
+from typing import Callable, List, Optional
+
+from gammagl.data import (
+    Graph,
+    InMemoryDataset,
+    download_url,
+)
+
+
+class GuacaMolDataset(InMemoryDataset):
+    r"""The GuacaMol dataset for molecular graph generation.
+
+    From the `"GuacaMol: Benchmarking Models for de Novo Molecular Design"
+    <https://arxiv.org/abs/1811.09621>`_ benchmark.
+    Contains ~1.6M molecules from ChEMBL.
+
+    Requires **RDKit** (``pip install rdkit``).
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    URLS = {
+        'train': 'https://figshare.com/ndownloader/files/13612760',
+        'test': 'https://figshare.com/ndownloader/files/13612757',
+        'valid': 'https://figshare.com/ndownloader/files/13612766',
+    }
+
+    ATOM_DECODER = ['C', 'N', 'O', 'F', 'B', 'Br', 'Cl', 'I', 'P', 'S',
+                    'Se', 'Si']
+    ATOM_ENCODER = {atom: i for i, atom in enumerate(ATOM_DECODER)}
+    NUM_ATOM_TYPES = 12
+    NUM_EDGE_TYPES = 5  # no-bond=0, single=1, double=2, triple=3, aromatic=4
+
+    STATS = {
+        'valencies': [4, 3, 2, 1, 3, 1, 1, 1, 3, 2, 2, 4],
+        'atom_weights': {0: 12, 1: 14, 2: 16, 3: 19, 4: 10.8, 5: 79.9,
+                         6: 35.4, 7: 126.9, 8: 31, 9: 32, 10: 79, 11: 28.1},
+        'max_weight': 800.0,
+        'num_node_types': 12,
+        'num_edge_types': 5,
+        'node_types': np.array([
+            7.409e-01, 1.069e-01, 1.122e-01, 1.421e-02, 6.058e-05,
+            1.717e-03, 8.411e-03, 2.290e-04, 5.695e-04, 1.467e-02,
+            4.153e-05, 5.342e-05,
+        ]),
+        'edge_types': np.array([
+            9.253e-01, 3.624e-02, 4.849e-03, 1.651e-04, 3.349e-02,
+        ]),
+    }
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        assert split in ('train', 'val', 'test'), f"Unknown split: {split}"
+        self.name = 'guacamol_gen'
+        self.split = split
+
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+
+        split_idx = {'train': 0, 'val': 1, 'test': 2}[split]
+        self.data, self.slices = self.load_data(self.processed_paths[split_idx])
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return ['train.smiles', 'test.smiles', 'valid.smiles']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return [
+            tlx.BACKEND + '_train.pt',
+            tlx.BACKEND + '_val.pt',
+            tlx.BACKEND + '_test.pt',
+        ]
+
+    def download(self):
+        # Try Figshare direct download first
+        for split, filename in [('train', 'train.smiles'),
+                                ('test', 'test.smiles'),
+                                ('valid', 'valid.smiles')]:
+            download_url(self.URLS[split], self.raw_dir, filename=filename)
+
+        # Check if files are valid (non-empty)
+        all_valid = True
+        for filename in self.raw_file_names:
+            path = osp.join(self.raw_dir, filename)
+            if not osp.exists(path) or osp.getsize(path) == 0:
+                all_valid = False
+                break
+
+        if not all_valid:
+            # Figshare may be blocked by WAF; fallback to guacamol package data generation
+            print("Figshare download returned empty files (likely WAF blocked).")
+            print("Attempting to generate GuacaMol data via the official guacamol package...")
+            try:
+                self._generate_via_guacamol_package()
+            except Exception as e:
+                raise RuntimeError(
+                    f"Failed to download GuacaMol data from Figshare and could not "
+                    f"generate via guacamol package: {e}. "
+                    f"Please install guacamol (pip install guacamol) or manually "
+                    f"download the .smiles files from Figshare and place them in "
+                    f"{self.raw_dir}"
+                ) from e
+
+    def _generate_via_guacamol_package(self):
+        """Generate train/valid/test .smiles via the official guacamol package.
+
+        This is a fallback when Figshare direct downloads are blocked by WAF.
+        It downloads ChEMBL 24.1 from EBI FTP and runs the canonical filtering
+        pipeline.  The resulting files are copied into ``self.raw_dir``.
+        """
+        import tempfile
+        import shutil
+        import importlib.util
+
+        spec = importlib.util.find_spec('guacamol')
+        if spec is None:
+            raise RuntimeError("guacamol package not installed")
+
+        get_data_path = osp.join(osp.dirname(spec.origin), 'data', 'get_data.py')
+        if not osp.exists(get_data_path):
+            raise RuntimeError(f"guacamol get_data.py not found at {get_data_path}")
+
+        tmpdir = tempfile.mkdtemp(prefix='guacamol_datagen_')
+        try:
+            import subprocess
+            import sys
+            cmd = [
+                sys.executable, get_data_path,
+                '--destination', tmpdir,
+                '--n_jobs', '8',
+            ]
+            print(f"Running: {' '.join(cmd)}")
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode != 0:
+                raise RuntimeError(
+                    f"guacamol data generation failed:\n{result.stderr}"
+                )
+            # Copy generated files with expected names
+            mapping = {
+                'chembl24_canon_train.smiles': 'train.smiles',
+                'chembl24_canon_dev-valid.smiles': 'valid.smiles',
+                'chembl24_canon_test.smiles': 'test.smiles',
+            }
+            for src_name, dst_name in mapping.items():
+                src = osp.join(tmpdir, src_name)
+                dst = osp.join(self.raw_dir, dst_name)
+                if osp.exists(src):
+                    shutil.copy2(src, dst)
+                    print(f"Copied {src_name} -> {dst_name}")
+                else:
+                    raise RuntimeError(f"Expected output {src} not found")
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+    def process(self):
+        try:
+            from rdkit import Chem, RDLogger
+            from rdkit.Chem.rdchem import BondType as BT
+        except ImportError:
+            raise ImportError(
+                "GuacaMolDataset requires RDKit. Install via: pip install rdkit"
+            )
+
+        RDLogger.DisableLog('rdApp.*')
+
+        atom_encoder = self.ATOM_ENCODER
+        bond_types = {BT.SINGLE: 1, BT.DOUBLE: 2, BT.TRIPLE: 3,
+                      BT.AROMATIC: 4}
+        num_bond_types = self.NUM_EDGE_TYPES
+        num_atom_types = self.NUM_ATOM_TYPES
+
+        split_files = {
+            'train': 'train.smiles',
+            'val': 'valid.smiles',
+            'test': 'test.smiles',
+        }
+
+        for split_name, split_idx in [('train', 0), ('val', 1), ('test', 2)]:
+            smiles_path = osp.join(self.raw_dir, split_files[split_name])
+            with open(smiles_path, 'r') as f:
+                smiles_list = [line.strip() for line in f if line.strip()]
+
+            data_list = []
+            for smile in smiles_list:
+                mol = Chem.MolFromSmiles(smile)
+                if mol is None:
+                    continue
+
+                n_atoms = mol.GetNumAtoms()
+                if n_atoms == 0:
+                    continue
+
+                # Node features: one-hot atom type
+                type_idx = []
+                valid = True
+                for atom in mol.GetAtoms():
+                    symbol = atom.GetSymbol()
+                    if symbol not in atom_encoder:
+                        valid = False
+                        break
+                    type_idx.append(atom_encoder[symbol])
+                if not valid:
+                    continue
+
+                x = np.zeros((n_atoms, num_atom_types), dtype=np.float32)
+                for j, t in enumerate(type_idx):
+                    x[j, t] = 1.0
+
+                # Edge index and attributes
+                rows, cols, edge_feats = [], [], []
+                for bond in mol.GetBonds():
+                    start = bond.GetBeginAtomIdx()
+                    end = bond.GetEndAtomIdx()
+                    bt = bond.GetBondType()
+                    if bt not in bond_types:
+                        continue
+                    bond_idx = bond_types[bt]
+                    for s, d in [(start, end), (end, start)]:
+                        rows.append(s)
+                        cols.append(d)
+                        feat = np.zeros(num_bond_types, dtype=np.float32)
+                        feat[bond_idx] = 1.0
+                        edge_feats.append(feat)
+
+                if len(rows) > 0:
+                    edge_index = np.stack([rows, cols], axis=0).astype(np.int64)
+                    edge_attr = np.stack(edge_feats, axis=0)
+                else:
+                    continue
+
+                y = np.zeros((1, 0), dtype=np.float32)
+
+                data = Graph(
+                    x=x,
+                    edge_index=edge_index,
+                    edge_attr=edge_attr,
+                    y=y,
+                    n_nodes=np.array([n_atoms], dtype=np.int64),
+                    to_tensor=True,
+                )
+
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+
+                data_list.append(data)
+
+            if len(data_list) == 0:
+                raise RuntimeError(
+                    f"GuacaMolDataset: no valid graphs generated for split "
+                    f"'{split_name}'. The raw SMILES file may be empty or "
+                    f"corrupt. Please check {self.raw_dir}"
+                )
+
+            collated_data, slices = self.collate(data_list)
+            self.save_data(
+                (collated_data, slices),
+                self.processed_paths[split_idx],
+            )
+
+    def get_stats(self):
+        """Return pre-computed dataset statistics for DeFoG training."""
+        return self.STATS.copy()
+
+    def __repr__(self) -> str:
+        return f'GuacaMolDataset({len(self)})'
diff --git a/examples/defog/datasets/moses_dataset.py b/examples/defog/datasets/moses_dataset.py
new file mode 100644
index 000000000..f456b42f2
--- /dev/null
+++ b/examples/defog/datasets/moses_dataset.py
@@ -0,0 +1,221 @@
+import os
+import os.path as osp
+import numpy as np
+import tensorlayerx as tlx
+from typing import Callable, List, Optional
+
+from gammagl.data import (
+    Graph,
+    InMemoryDataset,
+    download_url,
+)
+
+
+class MOSESDataset(InMemoryDataset):
+    r"""The MOSES dataset for molecular graph generation.
+
+    From the `"Molecular Sets (MOSES): A Benchmarking Platform for Molecular
+    Generation Models" <https://arxiv.org/abs/1811.12823>`_ benchmark.
+    Contains ~1.9M drug-like molecules filtered from ZINC Clean Leads.
+
+    Requires **RDKit** (``pip install rdkit``).
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    train_url = ('https://media.githubusercontent.com/media/molecularsets/'
+                 'moses/master/data/train.csv')
+    val_url = ('https://media.githubusercontent.com/media/molecularsets/'
+               'moses/master/data/test_scaffolds.csv')
+    test_url = ('https://media.githubusercontent.com/media/molecularsets/'
+                'moses/master/data/test.csv')
+
+    ATOM_DECODER = ['C', 'N', 'S', 'O', 'F', 'Cl', 'Br', 'H']
+    ATOM_ENCODER = {atom: i for i, atom in enumerate(ATOM_DECODER)}
+    NUM_ATOM_TYPES = 8
+    NUM_EDGE_TYPES = 5  # no-bond=0, single=1, double=2, triple=3, aromatic=4
+
+    STATS = {
+        'valencies': [4, 3, 4, 2, 1, 1, 1, 1],
+        'atom_weights': {0: 12, 1: 14, 2: 32, 3: 16, 4: 19, 5: 35.4,
+                         6: 79.9, 7: 1},
+        'max_weight': 350.0,
+        'num_node_types': 8,
+        'num_edge_types': 5,
+        'n_nodes': np.array([
+            0., 0., 0., 0., 0., 0., 0., 0.,
+            3.098e-06, 1.859e-05, 5.008e-05, 5.679e-05,
+            1.244e-04, 4.486e-04, 2.253e-03, 3.232e-03,
+            6.710e-03, 2.290e-02, 5.411e-02, 1.100e-01,
+            1.223e-01, 1.281e-01, 1.446e-01, 1.506e-01,
+            1.437e-01, 9.266e-02, 1.820e-02, 2.065e-06,
+        ]),
+        'node_types': np.array([0.7223, 0.1366, 0.1637, 0.1035,
+                                0.1422, 0.0054, 0.0015, 0.0]),
+        'edge_types': np.array([0.8974, 0.0473, 0.0627, 0.0004, 0.0486]),
+    }
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        assert split in ('train', 'val', 'test'), f"Unknown split: {split}"
+        self.name = 'moses_gen'
+        self.split = split
+
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+
+        split_idx = {'train': 0, 'val': 1, 'test': 2}[split]
+        self.data, self.slices = self.load_data(self.processed_paths[split_idx])
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return ['train_moses.csv', 'val_moses.csv', 'test_moses.csv']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return [
+            tlx.BACKEND + '_train.pt',
+            tlx.BACKEND + '_val.pt',
+            tlx.BACKEND + '_test.pt',
+        ]
+
+    def download(self):
+        train_path = download_url(self.train_url, self.raw_dir)
+        os.rename(train_path, osp.join(self.raw_dir, 'train_moses.csv'))
+
+        val_path = download_url(self.val_url, self.raw_dir)
+        os.rename(val_path, osp.join(self.raw_dir, 'val_moses.csv'))
+
+        test_path = download_url(self.test_url, self.raw_dir)
+        os.rename(test_path, osp.join(self.raw_dir, 'test_moses.csv'))
+
+    def process(self):
+        try:
+            from rdkit import Chem, RDLogger
+            from rdkit.Chem.rdchem import BondType as BT
+        except ImportError:
+            raise ImportError(
+                "MOSESDataset requires RDKit. Install via: pip install rdkit")
+
+        RDLogger.DisableLog('rdApp.*')
+        import pandas as pd
+
+        atom_encoder = self.ATOM_ENCODER
+        bond_types = {BT.SINGLE: 1, BT.DOUBLE: 2, BT.TRIPLE: 3,
+                      BT.AROMATIC: 4}
+        num_bond_types = self.NUM_EDGE_TYPES
+        num_atom_types = self.NUM_ATOM_TYPES
+
+        split_files = {
+            'train': 'train_moses.csv',
+            'val': 'val_moses.csv',
+            'test': 'test_moses.csv',
+        }
+
+        for split_name, split_idx in [('train', 0), ('val', 1), ('test', 2)]:
+            csv_path = osp.join(self.raw_dir, split_files[split_name])
+            smiles_list = pd.read_csv(csv_path)['SMILES'].values
+
+            data_list = []
+            for smile in smiles_list:
+                mol = Chem.MolFromSmiles(smile)
+                if mol is None:
+                    continue
+
+                n_atoms = mol.GetNumAtoms()
+                if n_atoms == 0:
+                    continue
+
+                # Node features: one-hot atom type
+                type_idx = []
+                valid = True
+                for atom in mol.GetAtoms():
+                    symbol = atom.GetSymbol()
+                    if symbol not in atom_encoder:
+                        valid = False
+                        break
+                    type_idx.append(atom_encoder[symbol])
+                if not valid:
+                    continue
+
+                x = np.zeros((n_atoms, num_atom_types), dtype=np.float32)
+                for j, t in enumerate(type_idx):
+                    x[j, t] = 1.0
+
+                # Edge index and attributes
+                rows, cols, edge_feats = [], [], []
+                for bond in mol.GetBonds():
+                    start = bond.GetBeginAtomIdx()
+                    end = bond.GetEndAtomIdx()
+                    bt = bond.GetBondType()
+                    if bt not in bond_types:
+                        continue
+                    bond_idx = bond_types[bt]
+                    for s, d in [(start, end), (end, start)]:
+                        rows.append(s)
+                        cols.append(d)
+                        feat = np.zeros(num_bond_types, dtype=np.float32)
+                        feat[bond_idx] = 1.0
+                        edge_feats.append(feat)
+
+                if len(rows) > 0:
+                    edge_index = np.stack([rows, cols], axis=0).astype(np.int64)
+                    edge_attr = np.stack(edge_feats, axis=0)
+                else:
+                    continue  # skip molecules with no bonds
+
+                y = np.zeros((1, 0), dtype=np.float32)
+
+                data = Graph(
+                    x=x,
+                    edge_index=edge_index,
+                    edge_attr=edge_attr,
+                    y=y,
+                    n_nodes=np.array([n_atoms], dtype=np.int64),
+                    to_tensor=True,
+                )
+
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+
+                data_list.append(data)
+
+            if len(data_list) == 0:
+                data_list = [Graph(
+                    x=np.zeros((1, num_atom_types), dtype=np.float32),
+                    edge_index=np.zeros((2, 0), dtype=np.int64),
+                    edge_attr=np.zeros((0, num_bond_types), dtype=np.float32),
+                    y=np.zeros((1, 0), dtype=np.float32),
+                    to_tensor=True,
+                )]
+
+            collated_data, slices = self.collate(data_list)
+            self.save_data(
+                (collated_data, slices),
+                self.processed_paths[split_idx],
+            )
+
+    def get_stats(self):
+        """Return pre-computed dataset statistics for DeFoG training."""
+        return self.STATS.copy()
+
+    def __repr__(self) -> str:
+        return f'MOSESDataset({len(self)})'
diff --git a/examples/defog/datasets/qm9_dataset.py b/examples/defog/datasets/qm9_dataset.py
new file mode 100644
index 000000000..b6f564529
--- /dev/null
+++ b/examples/defog/datasets/qm9_dataset.py
@@ -0,0 +1,342 @@
+import os
+import os.path as osp
+import numpy as np
+import tensorlayerx as tlx
+from typing import Callable, List, Optional
+
+from gammagl.data import (
+    Graph,
+    InMemoryDataset,
+    download_url,
+    extract_zip,
+)
+
+
+class QM9Gen(InMemoryDataset):
+    r"""The QM9 dataset for graph generation, following the DeFoG setup.
+
+    From the `"MoleculeNet" <https://arxiv.org/abs/1703.00564>`_ benchmark,
+    containing ~134k small organic molecules with up to 9 heavy atoms.
+    This version is tailored for **graph generation** (not property prediction):
+    node features are one-hot atom types, edge features are one-hot bond types,
+    and graph labels are empty.
+
+    Requires **RDKit** (``pip install rdkit``).
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    remove_h : bool
+        If ``True``, remove hydrogen atoms. Default ``True`` (paper default).
+    aromatic : bool
+        If ``True``, include aromatic bond type. Default ``True``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    raw_url = ('https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/'
+               'molnet_publish/qm9.zip')
+    raw_url2 = 'https://ndownloader.figshare.com/files/3195404'
+
+    # Hard-coded statistics from the DeFoG paper (remove_h=True, aromatic=True)
+    STATS_REMOVE_H = {
+        'n_nodes': np.array([0, 2.2930e-05, 3.8217e-05, 6.8791e-05, 2.3695e-04,
+                             9.7072e-04, 0.0046472, 0.023985, 0.13666, 0.83337]),
+        'node_types': np.array([0.7230, 0.1151, 0.1593, 0.0026]),
+        'edge_types': np.array([0.7261, 0.2384, 0.0274, 0.0081, 0.0]),
+        'valency_distribution': np.pad(
+            np.array([2.6071e-06, 0.1630, 0.3520, 0.3200, 0.16313, 0.00073],
+                     dtype=np.float32),
+            (0, 3 * 9 - 2 - 6),
+        ),
+        'atom_names': {0: 'C', 1: 'N', 2: 'O', 3: 'F'},
+        'valencies': [4, 3, 2, 1],
+        'atom_weights': {0: 12, 1: 14, 2: 16, 3: 19},
+        'max_weight': 150.0,
+        'max_n_nodes': 9,
+        'num_node_types': 4,
+        'num_edge_types': 5,
+    }
+
+    STATS_WITH_H = {
+        'n_nodes': np.array([
+            0, 0, 0, 1.59e-05, 3.14e-05, 4.41e-05,
+            1.13e-04, 2.17e-04, 5.21e-04, 1.38e-03,
+            3.43e-03, 8.37e-03, 2.05e-02, 4.62e-02,
+            8.80e-02, 1.37e-01, 1.75e-01, 1.78e-01,
+            1.51e-01, 1.08e-01, 5.69e-02, 2.24e-02,
+            3.76e-03, 1.46e-04, 3.14e-05, 0, 0, 0, 0, 0
+        ]),
+        'node_types': np.array([0.5122, 0.3526, 0.0562, 0.0777, 0.0013]),
+        'edge_types': np.array([0.88162, 0.11062, 5.9875e-03,
+                                1.7758e-03, 0.0]),
+        'valency_distribution': np.pad(
+            np.array([0.0, 0.5136, 0.0840, 0.0554, 0.3456, 0.0012],
+                     dtype=np.float32),
+            (0, 3 * 29 - 2 - 6),
+        ),
+        'atom_names': {0: 'H', 1: 'C', 2: 'N', 3: 'O', 4: 'F'},
+        'valencies': [1, 4, 3, 2, 1],
+        'atom_weights': {0: 1, 1: 12, 2: 14, 3: 16, 4: 19},
+        'max_weight': 390.0,
+        'max_n_nodes': 29,
+        'num_node_types': 5,
+        'num_edge_types': 5,
+    }
+
+    # Property column indices in gdb9.sdf.csv (0-indexed, after 'mol_id')
+
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 remove_h: bool = True, aromatic: bool = True,
+                 conditional: bool = False, target: str = 'mu',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        assert split in ('train', 'val', 'test'), f"Unknown split: {split}"
+        assert target in ('mu', 'homo', 'both'), f"Unknown target: {target}"
+        self.name = 'qm9_gen'
+        self.split = split
+        self.remove_h = remove_h
+        self.aromatic = aromatic
+        self.conditional = conditional
+        self.target = target
+
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+
+        split_idx = {'train': 0, 'val': 1, 'test': 2}[split]
+        self.data, self.slices = self.load_data(self.processed_paths[split_idx])
+
+    @property
+    def processed_dir(self) -> str:
+        h_tag = 'no_h' if self.remove_h else 'with_h'
+        if self.conditional:
+            return osp.join(self.root_dir, f'processed_{h_tag}_cond_{self.target}')
+        return osp.join(self.root_dir, f'processed_{h_tag}')
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return ['gdb9.sdf', 'gdb9.sdf.csv', 'uncharacterized.txt']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return [
+            tlx.BACKEND + '_train.pt',
+            tlx.BACKEND + '_val.pt',
+            tlx.BACKEND + '_test.pt',
+        ]
+
+    def download(self):
+        # Download and extract QM9 zip
+        file_path = download_url(self.raw_url, self.raw_dir)
+        extract_zip(file_path, self.raw_dir)
+        if os.path.exists(file_path):
+            os.unlink(file_path)
+        # Download uncharacterized molecules list
+        download_url(self.raw_url2, self.raw_dir,
+                     filename='uncharacterized.txt')
+
+    def process(self):
+        try:
+            from rdkit import Chem
+            from rdkit.Chem.rdchem import BondType as BT
+        except ImportError:
+            raise ImportError(
+                "QM9Gen requires RDKit. Install it via: "
+                "pip install rdkit  or  conda install -c conda-forge rdkit"
+            )
+
+        import pandas as pd
+
+        # Read skip list
+        skip_path = osp.join(self.raw_dir, 'uncharacterized.txt')
+        with open(skip_path, 'r') as f:
+            skip = [int(x.split()[0]) - 1 for x in f.read().split('\n')[9:-2]]
+        skip = set(skip)
+
+        # Atom and bond type mappings
+        full_atom_types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
+        if self.remove_h:
+            num_atom_types = 4
+        else:
+            num_atom_types = 5
+
+        if self.aromatic:
+            bond_types = {BT.SINGLE: 1, BT.DOUBLE: 2, BT.TRIPLE: 3,
+                          BT.AROMATIC: 4}
+            num_bond_types = 5  # 0=no-bond, 1-4=bond types
+        else:
+            bond_types = {BT.SINGLE: 1, BT.DOUBLE: 2, BT.TRIPLE: 3}
+            num_bond_types = 4
+
+        # Read target CSV for conditional labels
+        target_path = osp.join(self.raw_dir, 'gdb9.sdf.csv')
+        target_df = pd.read_csv(target_path)
+
+        # Read SDF
+        sdf_path = osp.join(self.raw_dir, 'gdb9.sdf')
+        suppl = Chem.SDMolSupplier(sdf_path, removeHs=False, sanitize=False)
+
+        # Process molecules
+        data_list = []
+
+        for i, mol in enumerate(suppl):
+            if i in skip or mol is None:
+                continue
+
+            atoms = mol.GetAtoms()
+            n_atoms = len(atoms)
+            if n_atoms == 0:
+                continue
+
+            # Node features: keep original atom ordering, then optionally
+            # drop hydrogens by heavy-atom subgraph filtering to match DeFoG.
+            full_type_idx = []
+            keep_mask = []
+            for atom in atoms:
+                symbol = atom.GetSymbol()
+                if symbol not in full_atom_types:
+                    break
+                full_type_idx.append(full_atom_types[symbol])
+                keep_mask.append(symbol != 'H')
+            else:
+                if self.remove_h:
+                    kept_types = [t - 1 for t, keep in zip(full_type_idx, keep_mask) if keep]
+                else:
+                    kept_types = full_type_idx
+
+                n_kept_atoms = len(kept_types)
+                if n_kept_atoms == 0:
+                    continue
+
+                x = np.zeros((n_kept_atoms, num_atom_types), dtype=np.float32)
+                for j, t in enumerate(kept_types):
+                    x[j, t] = 1.0
+
+                if self.remove_h:
+                    new_index = {}
+                    next_idx = 0
+                    for old_idx, keep in enumerate(keep_mask):
+                        if keep:
+                            new_index[old_idx] = next_idx
+                            next_idx += 1
+                else:
+                    new_index = {idx: idx for idx in range(n_atoms)}
+
+                rows, cols, edge_feats = [], [], []
+                for bond in mol.GetBonds():
+                    start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+                    if self.remove_h and (not keep_mask[start] or not keep_mask[end]):
+                        continue
+                    bt = bond.GetBondType()
+                    if bt not in bond_types:
+                        continue
+                    bond_idx = bond_types[bt]
+                    mapped_start = new_index[start]
+                    mapped_end = new_index[end]
+                    for s, d in [(mapped_start, mapped_end), (mapped_end, mapped_start)]:
+                        rows.append(s)
+                        cols.append(d)
+                        feat = np.zeros(num_bond_types, dtype=np.float32)
+                        feat[bond_idx] = 1.0
+                        edge_feats.append(feat)
+
+                if len(rows) > 0:
+                    edge_index = np.stack([rows, cols], axis=0).astype(np.int64)
+                    edge_attr = np.stack(edge_feats, axis=0)
+                    # Sort edges by (source, destination) to match DeFoG
+                    perm = (edge_index[0] * n_kept_atoms + edge_index[1]).argsort()
+                    edge_index = edge_index[:, perm]
+                    edge_attr = edge_attr[perm]
+                else:
+                    edge_index = np.zeros((2, 0), dtype=np.int64)
+                    edge_attr = np.zeros((0, num_bond_types), dtype=np.float32)
+
+                n_atoms = n_kept_atoms
+
+                # Conditional labels or empty y
+                if self.conditional:
+                    if self.target == 'mu':
+                        y_val = np.array([[target_df.iloc[i]['mu']]],
+                                         dtype=np.float32)
+                    elif self.target == 'homo':
+                        y_val = np.array([[target_df.iloc[i]['homo']]],
+                                         dtype=np.float32)
+                    else:  # both
+                        y_val = np.array([[target_df.iloc[i]['mu'],
+                                           target_df.iloc[i]['homo']]],
+                                         dtype=np.float32)
+                else:
+                    y_val = np.zeros((1, 0), dtype=np.float32)
+
+                data = Graph(
+                    x=x,
+                    edge_index=edge_index,
+                    edge_attr=edge_attr,
+                    y=y_val,
+                    n_nodes=np.array([n_atoms], dtype=np.int64),
+                    to_tensor=True,
+                )
+
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+
+                data_list.append(data)
+                continue  # skip the else clause of the for loop
+            # If an atom was not recognized, skip this molecule
+            
+        # Perform dynamic random split (seed=42) after filtering dirty data
+        n_clean_samples = len(data_list)
+        n_train = min(100000, n_clean_samples)
+        n_test = min(int(0.1 * n_clean_samples), n_clean_samples - n_train)
+        n_val = max(0, n_clean_samples - n_train - n_test)
+
+        rng = np.random.RandomState(42)
+        indices = rng.permutation(n_clean_samples)
+        
+        train_data = [data_list[i] for i in indices[:n_train]]
+        val_data = [data_list[i] for i in indices[n_train:n_train + n_val]]
+        test_data = [data_list[i] for i in indices[n_train + n_val:]]
+        
+        split_data = {'train': train_data, 'val': val_data, 'test': test_data}
+
+        # Save each split
+        for split_name, split_idx in [('train', 0), ('val', 1), ('test', 2)]:
+            split_list = split_data[split_name]
+            if len(split_list) == 0:
+                split_list = [Graph(
+                    x=np.zeros((1, num_atom_types), dtype=np.float32),
+                    edge_index=np.zeros((2, 0), dtype=np.int64),
+                    edge_attr=np.zeros((0, num_bond_types), dtype=np.float32),
+                    y=np.zeros((1, 0), dtype=np.float32),
+                    to_tensor=True,
+                )]
+            collated_data, slices = self.collate(split_list)
+            self.save_data(
+                (collated_data, slices),
+                self.processed_paths[split_idx],
+            )
+
+    def get_stats(self):
+        """Return pre-computed dataset statistics for DeFoG training."""
+        if self.remove_h:
+            return self.STATS_REMOVE_H.copy()
+        else:
+            return self.STATS_WITH_H.copy()
+
+    def __repr__(self) -> str:
+        h_tag = 'no_h' if self.remove_h else 'with_h'
+        return f'QM9Gen({len(self)}, {h_tag})'
diff --git a/examples/defog/datasets/spectre_dataset.py b/examples/defog/datasets/spectre_dataset.py
new file mode 100644
index 000000000..861ab9fd2
--- /dev/null
+++ b/examples/defog/datasets/spectre_dataset.py
@@ -0,0 +1,368 @@
+import os
+import os.path as osp
+import pickle
+import numpy as np
+import tensorlayerx as tlx
+try:
+    import networkx as nx
+except ImportError:
+    nx = None
+from typing import Callable, List, Optional
+
+from gammagl.data import (
+    Graph,
+    InMemoryDataset,
+    download_url,
+)
+
+
+def _adj_to_edge_index_and_attr(adj, num_edge_types=2):
+    r"""Convert a dense adjacency matrix to edge_index and edge_attr.
+
+    Parameters
+    ----------
+    adj : numpy.ndarray
+        Dense adjacency matrix of shape ``(n, n)``.
+    num_edge_types : int
+        Number of edge types. Default is 2 (no-edge=0, edge=1).
+
+    Returns
+    -------
+    edge_index : numpy.ndarray
+        Edge indices of shape ``(2, E)``.
+    edge_attr : numpy.ndarray
+        Edge attributes of shape ``(E, num_edge_types)``.
+    """
+    np.fill_diagonal(adj, 0)
+    src, dst = np.nonzero(adj)
+    edge_index = np.stack([src, dst], axis=0).astype(np.int64)
+    # One-hot: index 0 = no-edge (not stored), index 1 = edge present
+    edge_attr = np.zeros((len(src), num_edge_types), dtype=np.float32)
+    edge_attr[:, 1] = 1.0
+    return edge_index, edge_attr
+
+
+class SpectreGraphDataset(InMemoryDataset):
+    r"""Base class for synthetic graph datasets from the SPECTRE benchmark.
+
+    Downloads pre-generated graphs stored as NetworkX pickle files and
+    converts them to GammaGL :class:`Graph` objects.  Three splits
+    (train / val / test) are provided directly in the downloaded file.
+
+    Subclassed by :class:`PlanarGraphDataset`, :class:`TreeGraphDataset`,
+    and :class:`SBMGraphDataset`.
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    name : str
+        Dataset name, one of ``'planar'``, ``'tree'``, ``'sbm'``.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    urls = {
+        'planar': 'https://raw.githubusercontent.com/AndreasBergmeister/'
+                  'graph-generation/main/data/planar.pkl',
+        'tree': 'https://raw.githubusercontent.com/AndreasBergmeister/'
+                'graph-generation/main/data/tree.pkl',
+        'sbm': 'https://raw.githubusercontent.com/AndreasBergmeister/'
+               'graph-generation/main/data/sbm.pkl',
+    }
+
+    def __init__(self, root: Optional[str] = None, name: str = 'planar',
+                 split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        assert name in self.urls, f"Unknown dataset name: {name}"
+        assert split in ('train', 'val', 'test'), f"Unknown split: {split}"
+        self.name = name
+        self.split = split
+
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+
+        split_idx = {'train': 0, 'val': 1, 'test': 2}[split]
+        self.data, self.slices = self.load_data(self.processed_paths[split_idx])
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return [f'{self.name}.pkl']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return [
+            tlx.BACKEND + '_train.pt',
+            tlx.BACKEND + '_val.pt',
+            tlx.BACKEND + '_test.pt',
+        ]
+
+    def download(self):
+        download_url(self.urls[self.name], self.raw_dir)
+
+    def process(self):
+        pkl_path = osp.join(self.raw_dir, f'{self.name}.pkl')
+        with open(pkl_path, 'rb') as f:
+            raw_data = pickle.load(f)
+
+        for split_name, split_idx in [('train', 0), ('val', 1), ('test', 2)]:
+            graphs_nx = raw_data[split_name]
+            data_list = []
+
+            for g in graphs_nx:
+                adj = nx.to_numpy_array(g).astype(np.float32)
+                n = adj.shape[0]
+                edge_index, edge_attr = _adj_to_edge_index_and_attr(adj)
+
+                x = np.ones((n, 1), dtype=np.float32)
+                y = np.zeros((1, 0), dtype=np.float32)
+
+                data = Graph(
+                    x=x,
+                    edge_index=edge_index,
+                    edge_attr=edge_attr,
+                    y=y,
+                    n_nodes=np.array([n], dtype=np.int64),
+                    to_tensor=True,
+                )
+
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+
+                data_list.append(data)
+
+            collated_data, slices = self.collate(data_list)
+            self.save_data(
+                (collated_data, slices),
+                self.processed_paths[split_idx],
+            )
+
+    def __repr__(self) -> str:
+        return f'{self.name.capitalize()}GraphDataset({len(self)})'
+
+
+class PlanarGraphDataset(SpectreGraphDataset):
+    r"""The Planar graph dataset from the SPECTRE benchmark.
+
+    Contains connected planar graphs with 64 nodes each. Used for
+    evaluating graph generation models.
+
+    From `"Spectre: Spectral conditioning helps to overcome the
+    expressivity limits of one-shot graph generators"
+    <https://proceedings.mlr.press/v162/martinkus22a.html>`_ (ICML 2022).
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        super().__init__(root, name='planar', split=split,
+                         transform=transform, pre_transform=pre_transform,
+                         pre_filter=pre_filter, force_reload=force_reload)
+
+
+class TreeGraphDataset(SpectreGraphDataset):
+    r"""The Tree graph dataset from the HSpectre benchmark.
+
+    Contains tree graphs (connected acyclic graphs) with 64 nodes each.
+
+    From `"Efficient and scalable graph generation through iterative
+    local expansion" <https://arxiv.org/abs/2312.11529>`_ (ICLR 2023).
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        super().__init__(root, name='tree', split=split,
+                         transform=transform, pre_transform=pre_transform,
+                         pre_filter=pre_filter, force_reload=force_reload)
+
+
+class SBMGraphDataset(SpectreGraphDataset):
+    r"""The Stochastic Block Model (SBM) graph dataset from the SPECTRE benchmark.
+
+    Contains synthetic clustering graphs where nodes within the same
+    cluster have a higher probability of being connected.  Node count
+    ranges from 44 to 187.
+
+    From `"Spectre: Spectral conditioning helps to overcome the
+    expressivity limits of one-shot graph generators"
+    <https://proceedings.mlr.press/v162/martinkus22a.html>`_ (ICML 2022).
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        super().__init__(root, name='sbm', split=split,
+                         transform=transform, pre_transform=pre_transform,
+                         pre_filter=pre_filter, force_reload=force_reload)
+
+
+class Comm20GraphDataset(InMemoryDataset):
+    r"""The Community-20 graph dataset from the SPECTRE benchmark.
+
+    Contains 200 small community-structured graphs.  The raw data is a
+    PyTorch ``.pt`` file storing dense adjacency matrices, which are
+    split into train / val / test with a fixed random seed.
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    URL = ('https://raw.githubusercontent.com/KarolisMart/SPECTRE/'
+           'main/data/community_12_21_100.pt')
+    NUM_GRAPHS = 100
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        assert split in ('train', 'val', 'test'), f"Unknown split: {split}"
+        self.name = 'comm20'
+        self.split = split
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+        split_idx = {'train': 0, 'val': 1, 'test': 2}[split]
+        self.data, self.slices = self.load_data(self.processed_paths[split_idx])
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return ['community_12_21_100.pt']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return [
+            tlx.BACKEND + '_train.pt',
+            tlx.BACKEND + '_val.pt',
+            tlx.BACKEND + '_test.pt',
+        ]
+
+    def download(self):
+        download_url(self.URL, self.raw_dir)
+
+    def process(self):
+        import torch
+        pt_path = osp.join(self.raw_dir, 'community_12_21_100.pt')
+        payload = torch.load(pt_path, map_location='cpu', weights_only=False)
+        adjs = payload[0]  # list of dense adjacency tensors
+
+        rng = np.random.default_rng(1234)
+        indices = rng.permutation(self.NUM_GRAPHS)
+        test_len = int(round(self.NUM_GRAPHS * 0.2))
+        train_len = int(round((self.NUM_GRAPHS - test_len) * 0.8))
+        val_len = self.NUM_GRAPHS - train_len - test_len
+
+        split_indices = {
+            'train': indices[:train_len],
+            'val': indices[train_len:train_len + val_len],
+            'test': indices[train_len + val_len:],
+        }
+
+        for split_name, split_idx in [('train', 0), ('val', 1), ('test', 2)]:
+            data_list = []
+            for i in split_indices[split_name]:
+                adj = adjs[i].numpy().astype(np.float32)
+                n = adj.shape[0]
+                edge_index, edge_attr = _adj_to_edge_index_and_attr(adj)
+
+                x = np.ones((n, 1), dtype=np.float32)
+                y = np.zeros((1, 0), dtype=np.float32)
+
+                data = Graph(
+                    x=x,
+                    edge_index=edge_index,
+                    edge_attr=edge_attr,
+                    y=y,
+                    n_nodes=np.array([n], dtype=np.int64),
+                    to_tensor=True,
+                )
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+                data_list.append(data)
+
+            collated_data, slices = self.collate(data_list)
+            self.save_data(
+                (collated_data, slices),
+                self.processed_paths[split_idx],
+            )
+
+    def __repr__(self) -> str:
+        return f'Comm20GraphDataset({len(self)})'
diff --git a/examples/defog/datasets/tls_dataset.py b/examples/defog/datasets/tls_dataset.py
new file mode 100644
index 000000000..57f83b37d
--- /dev/null
+++ b/examples/defog/datasets/tls_dataset.py
@@ -0,0 +1,373 @@
+"""
+TLS (Tertiary Lymphoid Structures) graph dataset for conditional generation.
+
+Ported from DeFoG src/datasets/tls_dataset.py for GammaGL.
+
+Contains cell graphs from histopathology images, where each node represents
+a cell with a phenotype label. Graph-level labels indicate TLS content
+(high vs low) for conditional generation.
+
+Reference: https://arxiv.org/pdf/2310.06661.pdf
+"""
+import os
+import os.path as osp
+import copy
+import pickle
+import numpy as np
+import tensorlayerx as tlx
+try:
+    import networkx as nx
+    _BaseGraph = nx.Graph
+except ImportError:
+    nx = None
+    _BaseGraph = object
+from typing import Callable, List, Optional
+
+from gammagl.data import (
+    Graph,
+    InMemoryDataset,
+    download_url,
+)
+
+
+# ============================================================
+# Cell phenotype encoding
+# ============================================================
+
+PHENOTYPE_DECODER = [
+    "B",
+    "T",
+    "Epithelial",
+    "Fibroblast",
+    "Myofibroblast",
+    "CD38+ Lymphocyte",
+    "Macrophages/Granulocytes",
+    "Marker",
+    "Endothelial",
+]
+
+PHENOTYPE_ENCODER = {v: k for k, v in enumerate(PHENOTYPE_DECODER)}
+
+# For conditional generation: 9 node types, 2 edge types
+TLS_NUM_NODE_TYPES = 9
+TLS_NUM_EDGE_TYPES = 2
+
+
+# ============================================================
+# CellGraph: networkx-based cell graph with TLS features
+# ============================================================
+
+class CellGraph(_BaseGraph):
+    """A networkx Graph augmented with TLS feature computation.
+
+    Each node must have a 'phenotype' attribute.
+    """
+
+    def __init__(self, graph):
+        super().__init__()
+        self.add_nodes_from(graph.nodes(data=True))
+        self.add_edges_from(graph.edges(data=True))
+        self.tls_features = self.compute_tls_features()
+
+    def has_low_TLS(self):
+        return self.tls_features["k_1"] < 0.05
+
+    def has_high_TLS(self):
+        return 0.05 < self.tls_features["k_2"]
+
+    def to_label(self):
+        """Return binary label: 0=low TLS, 1=high TLS, -1=ambiguous."""
+        if self.has_low_TLS():
+            return 0
+        elif self.has_high_TLS():
+            return 1
+        else:
+            return -1
+
+    def to_gamma_graph(self):
+        """Convert to GammaGL Graph object."""
+        n = self.number_of_nodes()
+        phenotypes = [self.nodes[node].get("phenotype", "Marker") for node in self.nodes()]
+        encoded = [PHENOTYPE_ENCODER.get(p, 7) for p in phenotypes]  # default to Marker
+
+        x = np.eye(TLS_NUM_NODE_TYPES, dtype=np.float32)[encoded]
+
+        adj = nx.to_numpy_array(self).astype(np.float32)
+        np.fill_diagonal(adj, 0)
+        src, dst = np.nonzero(adj)
+        edge_index = np.stack([src, dst], axis=0).astype(np.int64)
+        # All edges are type 1 (edge present)
+        edge_attr = np.zeros((len(src), TLS_NUM_EDGE_TYPES), dtype=np.float32)
+        edge_attr[:, 1] = 1.0
+
+        data = Graph(
+            x=x,
+            edge_index=edge_index,
+            edge_attr=edge_attr,
+            y=np.zeros((1, 0), dtype=np.float32),  # placeholder, filled by transform
+            n_nodes=np.array([n], dtype=np.int64),
+            to_tensor=True,
+        )
+        return data
+
+    @classmethod
+    def from_dense_graph(cls, dense_graph):
+        """Construct from (node_features, adjacency_matrix) tuple.
+
+        Parameters
+        ----------
+        dense_graph : tuple of (node_features, adjacency_matrix)
+            node_features: array of int (n,), adjacency_matrix: array of int (n,n)
+
+        Returns
+        -------
+        CellGraph
+        """
+        node_features = dense_graph[0]
+        adj = dense_graph[1]
+        if not isinstance(node_features, np.ndarray):
+            node_features = np.asarray(node_features).astype(int)
+        if not isinstance(adj, np.ndarray):
+            adj = np.asarray(adj).astype(int)
+
+        # Filter out padding nodes (type == -1)
+        n_nodes = int(np.sum(node_features != -1))
+        node_features = node_features[:n_nodes]
+        adj = adj[:n_nodes, :n_nodes]
+
+        nx_graph = nx.from_numpy_array(adj)
+        for _, _, data in nx_graph.edges(data=True):
+            del data["weight"]
+
+        for node_idx in nx_graph.nodes():
+            encoded_phenotype = int(node_features[node_idx])
+            phenotype = PHENOTYPE_DECODER[encoded_phenotype] if encoded_phenotype < len(PHENOTYPE_DECODER) else "Marker"
+            nx_graph.nodes[node_idx]["phenotype"] = phenotype
+
+        return cls(nx_graph)
+
+    # ============================================================
+    # TLS feature computation
+    # ============================================================
+
+    @property
+    def map_phenotype_to_color(self):
+        return {
+            "B": "c",
+            "T": "b",
+            "Epithelial": "k",
+            "Fibroblast": "#C4A484",
+            "Myofibroblast": "g",
+            "CD38+ Lymphocyte": "#FEE12B",
+            "Macrophages/Granulocytes": "C3",
+            "Marker": "0.9",
+            "Endothelial": "0.75",
+        }
+
+    def classify_TLS_edge(self, edge):
+        allowed_cell_types = ["B", "T"]
+        start_node, end_node = edge
+        start_phenotype = self.nodes[start_node]["phenotype"]
+        end_phenotype = self.nodes[end_node]["phenotype"]
+
+        if start_phenotype not in allowed_cell_types or end_phenotype not in allowed_cell_types:
+            edge_type = "ignore"
+        elif start_phenotype == end_phenotype:
+            edge_type = "alpha"
+        else:
+            b_cell = start_node if start_phenotype == "B" else end_node
+            num_of_b_neighbors = len([
+                node for node in self.neighbors(b_cell)
+                if self.nodes[node]["phenotype"] == "B"
+            ])
+            edge_type = f"gamma_{num_of_b_neighbors}"
+        return edge_type
+
+    def compute_tls_features(self, a_max=5, min_num_gamma_edges=0):
+        """Compute TLS feature metric from arxiv.org/pdf/2310.06661.pdf."""
+        if self.is_directed():
+            raise ValueError("Graph should be undirected.")
+
+        graph_phenotypes = nx.get_node_attributes(self, "phenotype")
+        nodes_to_remove = [
+            node for node, phenotype in graph_phenotypes.items()
+            if phenotype != "B" and phenotype != "T"
+        ]
+        bt_subgraph = copy.deepcopy(self)
+        bt_subgraph.remove_nodes_from(nodes_to_remove)
+        total_num_edges = bt_subgraph.number_of_edges()
+
+        num_edge_types_idxs = self._get_edges_idxs_by_tlo_type(a_max)
+
+        denominator = total_num_edges - num_edge_types_idxs["alpha"]
+        if denominator < min_num_gamma_edges:
+            return {f"k_{a}": None for a in range(a_max + 1)}
+
+        tlo_dict = {}
+        if denominator == 0:
+            tlo_dict.update({f"k_{a}": 0.0 for a in range(a_max + 1)})
+        else:
+            k = 1.0
+            for a in range(a_max + 1):
+                k -= num_edge_types_idxs[f"gamma_{a}"] / denominator
+                tlo_dict.update({f"k_{a}": max(0.0, round(k, 4))})
+        return tlo_dict
+
+    def _get_edges_idxs_by_tlo_type(self, a_max):
+        num_edges_by_type = {f"gamma_{a}": 0 for a in range(a_max + 1)}
+        num_edges_by_type["alpha"] = 0
+        for edge in self.edges:
+            edge_type = self.classify_TLS_edge(edge)
+            if edge_type in num_edges_by_type:
+                num_edges_by_type[edge_type] += 1
+        return num_edges_by_type
+
+
+# ============================================================
+# Dataset transforms
+# ============================================================
+
+class SelectK2Transform:
+    """Transform: set y to binary label based on k_2 > 0.05."""
+
+    def __call__(self, data):
+        # y is set during processing; this is a placeholder for external use
+        return data
+
+
+# ============================================================
+# Dataset
+# ============================================================
+
+class TLSGraphDataset(InMemoryDataset):
+    r"""TLS (Tertiary Lymphoid Structures) graph dataset.
+
+    Contains cell graphs from histopathology images. Each node has a phenotype
+    label (B, T, Epithelial, etc.) and the graph-level label indicates TLS content.
+
+    Data is downloaded from the ConStruct repository.
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    conditional : bool
+        Whether to include conditional labels (binary TLS class).
+    target : str
+        Target property: ``'k2'`` (default).
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    urls = {
+        'low_tls': {
+            'train': 'https://github.com/manuelmlmadeira/ConStruct/raw/main/data/low_tls_200/raw/train.pkl',
+            'val': 'https://github.com/manuelmlmadeira/ConStruct/raw/main/data/low_tls_200/raw/val.pkl',
+            'test': 'https://github.com/manuelmlmadeira/ConStruct/raw/main/data/low_tls_200/raw/test.pkl',
+        },
+        'high_tls': {
+            'train': 'https://github.com/manuelmlmadeira/ConStruct/raw/main/data/high_tls_200/raw/train.pkl',
+            'val': 'https://github.com/manuelmlmadeira/ConStruct/raw/main/data/high_tls_200/raw/val.pkl',
+            'test': 'https://github.com/manuelmlmadeira/ConStruct/raw/main/data/high_tls_200/raw/test.pkl',
+        },
+    }
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 conditional: bool = False, target: str = 'k2',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        assert split in ('train', 'val', 'test'), f"Unknown split: {split}"
+        self.name = 'tls'
+        self.split = split
+        self.conditional = conditional
+        self.target = target
+
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+
+        split_idx = {'train': 0, 'val': 1, 'test': 2}[split]
+        self.data, self.slices = self.load_data(self.processed_paths[split_idx])
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return ['train.pkl', 'val.pkl', 'test.pkl']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        tag = '_cond' if self.conditional else ''
+        return [
+            tlx.BACKEND + f'_train{tag}.pt',
+            tlx.BACKEND + f'_val{tag}.pt',
+            tlx.BACKEND + f'_test{tag}.pt',
+        ]
+
+    def download(self):
+        for split_key in ['train', 'val', 'test']:
+            low_path = download_url(
+                self.urls['low_tls'][split_key],
+                osp.join(self.raw_dir, 'low_tls'))
+            low_data = pickle.load(open(low_path, 'rb'))
+
+            high_path = download_url(
+                self.urls['high_tls'][split_key],
+                osp.join(self.raw_dir, 'high_tls'))
+            high_data = pickle.load(open(high_path, 'rb'))
+
+            all_data = low_data + high_data
+            with open(osp.join(self.raw_dir, f'{split_key}.pkl'), 'wb') as f:
+                pickle.dump(all_data, f)
+            print(f"  TLS {split_key}: {len(all_data)} graphs")
+
+    def process(self):
+        for split_name, split_idx in [('train', 0), ('val', 1), ('test', 2)]:
+            pkl_path = osp.join(self.raw_dir, f'{split_name}.pkl')
+            with open(pkl_path, 'rb') as f:
+                raw_dataset = pickle.load(f)
+
+            data_list = []
+            for idx, graph in enumerate(raw_dataset):
+                cell_graph = CellGraph(graph)
+
+                if not (cell_graph.has_low_TLS() or cell_graph.has_high_TLS()):
+                    print(f"  Warning: ambiguous cell graph {idx}, skipping")
+                    continue
+
+                data = cell_graph.to_gamma_graph()
+
+                # Set conditional label
+                if self.conditional and self.target == 'k2':
+                    label = float(cell_graph.tls_features.get("k_2", 0) > 0.05)
+                    data.y = np.array([[label]], dtype=np.float32)
+
+                # Store TLS features as additional attribute
+                data.tls_features = np.array(
+                    [cell_graph.tls_features.get(f"k_{a}", 0) or 0 for a in range(6)],
+                    dtype=np.float32
+                )
+
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+
+                data_list.append(data)
+
+            print(f"  Processed TLS {split_name}: {len(data_list)} graphs")
+            collated_data, slices = self.collate(data_list)
+            self.save_data(
+                (collated_data, slices),
+                self.processed_paths[split_idx],
+            )
+
+    def __repr__(self) -> str:
+        return f'TLSGraphDataset(split={self.split}, len={len(self)})'
diff --git a/examples/defog/datasets/zinc250k_dataset.py b/examples/defog/datasets/zinc250k_dataset.py
new file mode 100644
index 000000000..fb6d7c3c4
--- /dev/null
+++ b/examples/defog/datasets/zinc250k_dataset.py
@@ -0,0 +1,250 @@
+import os
+import os.path as osp
+import json
+import numpy as np
+import tensorlayerx as tlx
+from typing import Callable, List, Optional
+
+from gammagl.data import (
+    Graph,
+    InMemoryDataset,
+    download_url,
+)
+
+
+class ZINC250kGen(InMemoryDataset):
+    r"""The ZINC250k dataset for molecular graph generation.
+
+    Contains ~250k drug-like molecules from the ZINC database, processed
+    with kekulization (no aromatic bonds).
+
+    From `"Grammar Variational Autoencoder"
+    <https://arxiv.org/abs/1703.01925>`_ .
+
+    Requires **RDKit** (``pip install rdkit``).
+
+    Parameters
+    ----------
+    root : str, optional
+        Root directory where the dataset should be saved.
+    split : str
+        Which split to load: ``'train'``, ``'val'``, or ``'test'``.
+    transform : callable, optional
+        A transform applied to each :class:`Graph` on access.
+    pre_transform : callable, optional
+        A transform applied during processing before saving.
+    pre_filter : callable, optional
+        A filter applied during processing.
+    force_reload : bool
+        Whether to re-process the dataset.
+    """
+
+    CSV_URL = ('https://raw.githubusercontent.com/harryjo97/GruM/'
+               'master/GruM_2D/data/zinc250k.csv')
+    IDX_URL = ('https://raw.githubusercontent.com/harryjo97/GruM/'
+               'master/GruM_2D/data/valid_idx_zinc250k.json')
+
+    ATOM_DECODER = ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I']
+    ATOM_ENCODER = {atom: i for i, atom in enumerate(ATOM_DECODER)}
+    NUM_ATOM_TYPES = 9
+    NUM_EDGE_TYPES = 4  # no-bond=0, single=1, double=2, triple=3 (kekulized)
+
+    STATS = {
+        'valencies': [4, 3, 2, 1, 5, 6, 1, 1, 1],
+        'atom_weights': {0: 12, 1: 14, 2: 16, 3: 19, 4: 30, 5: 32,
+                         6: 35.5, 7: 78, 8: 127},
+        'max_weight': 500.0,
+        'max_n_nodes': 38,
+        'num_node_types': 9,
+        'num_edge_types': 4,
+        'n_nodes': np.array([
+            0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
+            0.0000e+00, 1.3359e-05, 2.2265e-05, 5.7889e-05, 2.9835e-04,
+            7.9263e-04, 2.9123e-03, 4.6890e-03, 7.1515e-03, 1.1275e-02,
+            1.7117e-02, 2.5360e-02, 3.5014e-02, 4.6707e-02, 5.8178e-02,
+            7.0829e-02, 8.1472e-02, 7.4922e-02, 8.4384e-02, 9.3099e-02,
+            9.1451e-02, 7.7175e-02, 6.3397e-02, 4.0331e-02, 3.1131e-02,
+            2.4394e-02, 1.9237e-02, 1.5029e-02, 1.0362e-02, 6.9155e-03,
+            4.1190e-03, 1.5942e-03, 5.6108e-04, 8.9060e-06,
+        ], dtype=np.float32),
+        'node_types': np.array([
+            7.3678e-01, 1.2211e-01, 9.9746e-02, 1.3745e-02, 2.4428e-05,
+            1.7806e-02, 7.4231e-03, 2.2057e-03, 1.5522e-04,
+        ], dtype=np.float32),
+        'edge_types': np.array([
+            9.0658e-01, 6.9411e-02, 2.3771e-02, 2.3480e-04,
+        ], dtype=np.float32),
+        'valency_distribution': np.concatenate([
+            np.array([
+                0.0000e+00, 1.1364e-01, 3.0431e-01, 3.5063e-01,
+                2.2655e-01, 2.2697e-05, 4.8356e-03,
+            ], dtype=np.float32),
+            np.zeros(105, dtype=np.float32),
+        ]),
+    }
+
+    def __init__(self, root: Optional[str] = None, split: str = 'train',
+                 transform: Optional[Callable] = None,
+                 pre_transform: Optional[Callable] = None,
+                 pre_filter: Optional[Callable] = None,
+                 force_reload: bool = False):
+        assert split in ('train', 'val', 'test'), f"Unknown split: {split}"
+        self.name = 'zinc250k_gen'
+        self.split = split
+
+        super().__init__(root, transform, pre_transform, pre_filter,
+                         force_reload=force_reload)
+
+        split_idx = {'train': 0, 'val': 1, 'test': 2}[split]
+        self.data, self.slices = self.load_data(self.processed_paths[split_idx])
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return ['zinc250k.csv', 'valid_idx_zinc250k.json']
+
+    @property
+    def processed_file_names(self) -> List[str]:
+        return [
+            tlx.BACKEND + '_train.pt',
+            tlx.BACKEND + '_val.pt',
+            tlx.BACKEND + '_test.pt',
+        ]
+
+    def download(self):
+        download_url(self.CSV_URL, self.raw_dir)
+        download_url(self.IDX_URL, self.raw_dir)
+
+    def process(self):
+        try:
+            from rdkit import Chem, RDLogger
+            from rdkit.Chem.rdchem import BondType as BT
+        except ImportError:
+            raise ImportError(
+                "ZINC250kGen requires RDKit. Install via: pip install rdkit")
+
+        RDLogger.DisableLog('rdApp.*')
+        import pandas as pd
+
+        atom_encoder = self.ATOM_ENCODER
+        bond_types = {BT.SINGLE: 1, BT.DOUBLE: 2, BT.TRIPLE: 3}
+        num_bond_types = self.NUM_EDGE_TYPES
+        num_atom_types = self.NUM_ATOM_TYPES
+
+        # Read CSV
+        csv_path = osp.join(self.raw_dir, 'zinc250k.csv')
+        df = pd.read_csv(csv_path)
+        smiles_list = df['smiles'].values
+
+        # Read validation indices
+        idx_path = osp.join(self.raw_dir, 'valid_idx_zinc250k.json')
+        with open(idx_path, 'r') as f:
+            val_indices = set(json.load(f))
+
+        # val and test share the same indices (original DeFoG behavior)
+        train_indices = [i for i in range(len(smiles_list))
+                         if i not in val_indices]
+        val_indices_list = sorted(val_indices)
+
+        split_indices = {
+            'train': train_indices,
+            'val': val_indices_list,
+            'test': val_indices_list,
+        }
+
+        for split_name, split_idx in [('train', 0), ('val', 1), ('test', 2)]:
+            indices = split_indices[split_name]
+            data_list = []
+
+            for i in indices:
+                smile = smiles_list[i]
+                mol = Chem.MolFromSmiles(smile)
+                if mol is None:
+                    continue
+
+                # Kekulize to remove aromatic bonds
+                try:
+                    Chem.Kekulize(mol, clearAromaticFlags=True)
+                except Exception:
+                    continue
+
+                n_atoms = mol.GetNumAtoms()
+                if n_atoms == 0:
+                    continue
+
+                # Node features: one-hot atom type
+                type_idx = []
+                valid = True
+                for atom in mol.GetAtoms():
+                    symbol = atom.GetSymbol()
+                    if symbol not in atom_encoder:
+                        valid = False
+                        break
+                    type_idx.append(atom_encoder[symbol])
+                if not valid:
+                    continue
+
+                x = np.zeros((n_atoms, num_atom_types), dtype=np.float32)
+                for j, t in enumerate(type_idx):
+                    x[j, t] = 1.0
+
+                # Edge index and attributes (kekulized: no aromatic)
+                rows, cols, edge_feats = [], [], []
+                for bond in mol.GetBonds():
+                    start = bond.GetBeginAtomIdx()
+                    end = bond.GetEndAtomIdx()
+                    bt = bond.GetBondType()
+                    if bt not in bond_types:
+                        continue
+                    bond_idx = bond_types[bt]
+                    for s, d in [(start, end), (end, start)]:
+                        rows.append(s)
+                        cols.append(d)
+                        feat = np.zeros(num_bond_types, dtype=np.float32)
+                        feat[bond_idx] = 1.0
+                        edge_feats.append(feat)
+
+                if len(rows) > 0:
+                    edge_index = np.stack([rows, cols], axis=0).astype(np.int64)
+                    edge_attr = np.stack(edge_feats, axis=0)
+                else:
+                    continue
+
+                y = np.zeros((1, 0), dtype=np.float32)
+
+                data = Graph(
+                    x=x,
+                    edge_index=edge_index,
+                    edge_attr=edge_attr,
+                    y=y,
+                    n_nodes=np.array([n_atoms], dtype=np.int64),
+                    to_tensor=True,
+                )
+
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
+
+                data_list.append(data)
+
+            if len(data_list) == 0:
+                data_list = [Graph(
+                    x=np.zeros((1, num_atom_types), dtype=np.float32),
+                    edge_index=np.zeros((2, 0), dtype=np.int64),
+                    edge_attr=np.zeros((0, num_bond_types), dtype=np.float32),
+                    y=np.zeros((1, 0), dtype=np.float32),
+                    to_tensor=True,
+                )]
+
+            collated_data, slices = self.collate(data_list)
+            self.save_data(
+                (collated_data, slices),
+                self.processed_paths[split_idx],
+            )
+
+    def get_stats(self):
+        """Return pre-computed dataset statistics for DeFoG training."""
+        return self.STATS.copy()
+
+    def __repr__(self) -> str:
+        return f'ZINC250kGen({len(self)})'
diff --git a/examples/defog/defog_trainer.py b/examples/defog/defog_trainer.py
new file mode 100644
index 000000000..efa2c383e
--- /dev/null
+++ b/examples/defog/defog_trainer.py
@@ -0,0 +1,1107 @@
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+
+import sys
+import argparse
+import random
+import warnings
+import numpy as np
+import tensorlayerx as tlx
+assert tlx.BACKEND == 'torch', "DeFoG currently only supports PyTorch backend due to framework limitations."
+from tensorlayerx.model import TrainOneStep, WithLoss
+
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+REPO_ROOT = os.path.abspath(os.path.join(CURRENT_DIR, '..', '..'))
+sys.path.insert(0, REPO_ROOT)
+sys.path.insert(0, CURRENT_DIR)
+if hasattr(sys.stdout, 'reconfigure'):
+    sys.stdout.reconfigure(line_buffering=True)
+if hasattr(sys.stderr, 'reconfigure'):
+    sys.stderr.reconfigure(line_buffering=True)
+
+from gammagl.models.defog import DeFoGModel
+
+from gammagl.loader import DataLoader
+
+from dataset_utils import create_synthetic_dataset, load_real_dataset, compute_dataset_infos
+from defog_utils import PlaceHolder, to_dense, EMA
+
+from flow_matching import NoiseDistribution, apply_noise, RateMatrixDesigner, TimeDistorter
+
+from extra_features import ExtraFeatures, compute_extra_data, DummyExtraFeatures, ExtraMolecularFeatures
+
+from train_metrics import TrainLossDiscrete
+from sampler import sample_batch
+from evaluator import evaluate_generated_graphs, compute_selection_score
+
+
+
+# ============================================================
+# Original DeFoG-aligned presets / checkpoint helpers
+# ============================================================
+
+_BASE_EXPERIMENT_PRESET = {
+    'transition': 'marginal',
+    'extra_features': 'rrwp',
+    'rrwp_steps': 12,
+    'n_layers': 5,
+    'hidden_mlp_X': 256,
+    'hidden_mlp_E': 128,
+    'hidden_mlp_y': 128,
+    'dx': 256,
+    'de': 64,
+    'dy': 64,
+    'n_head': 8,
+    'dim_ffX': 256,
+    'dim_ffE': 128,
+    'dim_ffy': 128,
+    'n_epochs': 1000,
+    'batch_size': 512,
+    'lr': 2e-4,
+    'weight_decay': 1e-12,
+    'ema_decay': 0.0,
+    'train_distortion': 'identity',
+    'sample_steps': 1000,
+    'sample_distortion': 'identity',
+    'eta': 0.0,
+    'omega': 0.0,
+    'rdb': 'general',
+    'rdb_crit': 'max_marginal',
+    'num_sample_fold': 1,
+    'sample_every_val': 4,
+    'check_val_every_n_epochs': 5,
+    'val_num_samples': 512,
+}
+
+_DATASET_PRESETS = {
+    'planar': {
+        'n_layers': 10,
+        'hidden_mlp_X': 128,
+        'hidden_mlp_E': 64,
+        'hidden_mlp_y': 128,
+        'dim_ffE': 64,
+        'dim_ffy': 256,
+        'n_epochs': 100000,
+        'batch_size': 64,
+        'sample_distortion': 'polydec',
+        'omega': 0.05,
+        'eta': 50.0,
+        'sample_every_val': 1,
+        'check_val_every_n_epochs': 2000,
+        'val_num_samples': 40,
+    },
+    'tree': {
+        'n_layers': 10,
+        'hidden_mlp_X': 128,
+        'hidden_mlp_E': 64,
+        'hidden_mlp_y': 128,
+        'dim_ffE': 64,
+        'dim_ffy': 256,
+        'n_epochs': 100000,
+        'batch_size': 64,
+        'train_distortion': 'polydec',
+        'sample_distortion': 'polydec',
+        'sample_every_val': 1,
+        'check_val_every_n_epochs': 2000,
+        'val_num_samples': 40,
+    },
+    'sbm': {
+        'transition': 'absorbfirst',
+        'rrwp_steps': 20,
+        'n_layers': 8,
+        'hidden_mlp_X': 128,
+        'hidden_mlp_E': 64,
+        'hidden_mlp_y': 128,
+        'de': 64,
+        'dy': 64,
+        'dim_ffE': 64,
+        'dim_ffy': 256,
+        'n_epochs': 50000,
+        'batch_size': 32,
+        'sample_every_val': 1,
+        'check_val_every_n_epochs': 2000,
+        'val_num_samples': 40,
+    },
+    'qm9': {
+        'n_layers': 9,
+        'n_epochs': 1000,
+        'batch_size': 1024,
+        'sample_steps': 500,
+        'sample_distortion': 'polydec',
+        'sample_every_val': 1,
+        'check_val_every_n_epochs': 50,
+        'val_num_samples': 512,
+    },
+    'zinc250k': {
+        'rrwp_steps': 20,
+        'n_layers': 12,
+        'hidden_mlp_X': 256,
+        'hidden_mlp_E': 128,
+        'hidden_mlp_y': 256,
+        'de': 64,
+        'dy': 128,
+        'dim_ffX': 256,
+        'dim_ffE': 128,
+        'dim_ffy': 256,
+        'n_epochs': 300,
+        'batch_size': 256,
+        'train_distortion': 'polydec',
+        'sample_distortion': 'polydec',
+        'sample_steps': 1000,
+        'omega': 0.1,
+        'eta': 300.0,
+        'sample_every_val': 2,
+        'check_val_every_n_epochs': 4,
+        'val_num_samples': 256,
+    },
+    'guacamol': {
+        'rrwp_steps': 20,
+        'n_layers': 12,
+        'hidden_mlp_X': 256,
+        'hidden_mlp_E': 128,
+        'hidden_mlp_y': 256,
+        'de': 128,
+        'dy': 128,
+        'dim_ffX': 256,
+        'dim_ffE': 128,
+        'dim_ffy': 256,
+        'n_epochs': 1000,
+        'batch_size': 64,
+        'train_distortion': 'polydec',
+        'sample_distortion': 'polydec',
+        'sample_steps': 1000,
+        'omega': 0.1,
+        'eta': 300.0,
+        'sample_every_val': 2,
+        'check_val_every_n_epochs': 2,
+        'val_num_samples': 500,
+    },
+    'moses': {
+        'rrwp_steps': 20,
+        'n_layers': 12,
+        'hidden_mlp_X': 256,
+        'hidden_mlp_E': 128,
+        'hidden_mlp_y': 256,
+        'de': 128,
+        'dy': 128,
+        'dim_ffX': 256,
+        'dim_ffE': 128,
+        'dim_ffy': 256,
+        'n_epochs': 300,
+        'batch_size': 256,
+        'train_distortion': 'polydec',
+        'sample_distortion': 'polydec',
+        'sample_steps': 1000,
+        'omega': 0.5,
+        'eta': 200.0,
+        'sample_every_val': 4,
+        'check_val_every_n_epochs': 1,
+        'val_num_samples': 256,
+    },
+    'tls': {
+        'n_layers': 10,
+        'rrwp_steps': 20,
+        'hidden_mlp_X': 128,
+        'hidden_mlp_E': 64,
+        'hidden_mlp_y': 128,
+        'dim_ffE': 64,
+        'dim_ffy': 256,
+        'n_epochs': 100000,
+        'batch_size': 64,
+        'sample_distortion': 'polydec',
+        'omega': 0.05,
+        'eta': 0.0,
+        'sample_every_val': 1,
+        'check_val_every_n_epochs': 2000,
+        'val_num_samples': 40,
+    },
+    'comm20': {
+        'n_layers': 8,
+        'n_epochs': 1000000,
+        'batch_size': 256,
+        'sample_every_val': 10,
+        'check_val_every_n_epochs': 1000,
+        'val_num_samples': 20,
+    },
+}
+
+
+def _get_explicit_cli_dests(parser, argv=None):
+    argv = sys.argv[1:] if argv is None else argv
+    explicit = set()
+    for action in parser._actions:
+        for opt in action.option_strings:
+            if opt in argv or any(arg.startswith(opt + '=') for arg in argv):
+                explicit.add(action.dest)
+                break
+    return explicit
+
+
+def apply_dataset_preset(args, parser, argv=None):
+    dataset = getattr(args, 'dataset', None)
+    if dataset in (None, 'synthetic'):
+        return args
+
+    preset = dict(_BASE_EXPERIMENT_PRESET)
+    preset.update(_DATASET_PRESETS.get(dataset, {}))
+    explicit = _get_explicit_cli_dests(parser, argv)
+    applied = []
+
+    for key, value in preset.items():
+        if hasattr(args, key) and key not in explicit:
+            setattr(args, key, value)
+            applied.append(key)
+
+    if applied:
+        preview = ', '.join(applied[:8])
+        suffix = ' ...' if len(applied) > 8 else ''
+        print(f"Applied original DeFoG preset for {dataset}: {preview}{suffix}")
+
+    return args
+
+
+def save_model_snapshot(model, ema, save_dir, prefix, output_dims=None):
+    model_path = os.path.join(save_dir, f'{prefix}_model.npz')
+    ema_path = os.path.join(save_dir, f'{prefix}_ema.pkl')
+    model.save_weights(model_path, format='npz_dict')
+
+    if ema is not None:
+        with open(ema_path, 'wb') as f:
+            f.write(ema.state_dict())
+    elif os.path.exists(ema_path):
+        os.remove(ema_path)
+
+    # Save output_dims for reproducibility when loading for sampling
+    if output_dims is not None:
+        import json as _json
+        config_path = os.path.join(save_dir, 'model_config.json')
+        with open(config_path, 'w') as _f:
+            _json.dump({'output_dims': output_dims}, _f, indent=2)
+
+    return model_path, ema_path
+
+
+def load_model_snapshot_for_sampling(model, save_dir, ema_decay=0.0):
+    prefixes = ['best', 'last']
+    chosen_prefix = None
+    model_path = None
+
+    for prefix in prefixes:
+        candidate = os.path.join(save_dir, f'{prefix}_model.npz')
+        if os.path.exists(candidate):
+            chosen_prefix = prefix
+            model_path = candidate
+            break
+
+    if model_path is None:
+        raise FileNotFoundError(
+            f"No sampling checkpoint found in {save_dir}. Expected best_model.npz or last_model.npz"
+        )
+
+    model.load_weights(model_path, format='npz_dict')
+    print(f"Loaded model from {model_path}")
+
+    ema = None
+    ema_path = os.path.join(save_dir, f'{chosen_prefix}_ema.pkl')
+    if os.path.exists(ema_path):
+        ema = EMA(model, decay=max(float(ema_decay), 0.999))
+        with open(ema_path, 'rb') as f:
+            ema.load_state_dict(f.read())
+        ema.swap_in(model)
+        print(f"  Using EMA weights from {ema_path}")
+
+    return model_path, ema
+
+
+# ============================================================
+# Training Loss Wrapper
+# ============================================================
+
+class DeFoGWithLoss(WithLoss):
+
+    r"""Wraps DeFoG model for GammaGL training with ``TrainOneStep``.
+
+    Encapsulates the full training forward pass: noise application,
+    extra feature computation, model forward, and loss computation.
+
+    Parameters
+    ----------
+    backbone : DeFoGModel
+        The graph transformer denoiser.
+    loss_fn : TrainLossDiscrete
+        The training loss function.
+    noise_dist : NoiseDistribution
+        Noise distribution handler.
+    time_distorter : TimeDistorter
+        Time distortion for sampling training time.
+    extra_features : callable
+        Structural extra feature computer.
+    domain_features : callable
+        Domain-specific extra feature computer.
+    """
+    def __init__(self, backbone, loss_fn, noise_dist, time_distorter,
+                 extra_features, domain_features, conditional=False):
+        super().__init__(backbone=backbone, loss_fn=loss_fn)
+        self.noise_dist = noise_dist
+        self.limit_dist = noise_dist.get_limit_dist()
+        self.time_distorter = time_distorter
+        self.extra_features = extra_features
+        self.domain_features = domain_features
+        self.conditional = conditional
+
+    def forward(self, data, label):
+        """
+        Parameters
+        ----------
+        data : dict
+            Keys: ``'X'``, ``'E'``, ``'y'``, ``'node_mask'``.
+        label : ignored
+
+        Returns
+        -------
+        tensor
+            Scalar loss value.
+        """
+        X = data['X']
+        E = data['E']
+        y = data['y']
+        node_mask = data['node_mask']
+
+        bs = X.shape[0]
+
+        # Classifier-free guidance: 10% dropout of conditional labels
+        if self.conditional and y.shape[-1] > 0:
+            if np.random.rand() < 0.1:
+                y = tlx.ones_like(y) * (-1.0)
+
+        # Add virtual classes for absorbing transition
+        X, E = self.noise_dist.add_virtual_classes(X, E)
+
+        # 1. Apply noise
+        noisy_data = apply_noise(X, E, y, node_mask,
+                                  self.limit_dist, self.time_distorter)
+
+        # 2. Compute extra features
+        extra_data = compute_extra_data(noisy_data, self.extra_features,
+                                         self.domain_features, self.noise_dist)
+
+        # 3. Concatenate inputs
+        X_in = tlx.concat([noisy_data['X_t'], extra_data.X], axis=-1)
+        E_in = tlx.concat([noisy_data['E_t'], extra_data.E], axis=-1)
+
+        # Ensure y tensors are 2D (bs, dy)
+        y_t = noisy_data['y_t']
+        ey = extra_data.y
+        if len(y_t.shape) == 1:
+            y_t = tlx.reshape(y_t, [bs, -1])
+        if len(ey.shape) == 1:
+            ey = tlx.reshape(ey, [bs, -1])
+        if y_t.shape[-1] == 0:
+            y_in = ey
+        elif ey.shape[-1] == 0:
+            y_in = y_t
+        else:
+            y_in = tlx.concat([y_t, ey], axis=-1)
+
+        # 4. Forward through model
+        pred_X, pred_E, pred_y = self.backbone_network(X_in, E_in, y_in, node_mask)
+
+        # 5. Compute loss against clean data (ignoring virtual classes)
+        true_X, true_E = self.noise_dist.ignore_virtual_classes(X, E)
+
+        loss = self._loss_fn(pred_X, pred_E, pred_y, true_X, true_E, y)
+
+        import torch
+        if not torch.isfinite(loss):
+            print(
+                f"Warning: Non-finite loss ({loss}) encountered at step {self.global_step}. Skipping step."
+            )
+            # In pure PyTorch we might skip optimizer.step(), but here we just return the loss
+
+        return loss
+
+# ============================================================
+# Dataset Utilities
+# ============================================================
+
+
+
+
+# ============================================================
+# ============================================================
+# Main
+# ============================================================
+
+def main(args):
+    # ------- Dataset -------
+    if args.dataset == 'synthetic':
+        print(f"Creating synthetic dataset with {args.num_graphs} graphs...")
+        test_labels = None
+        val_ds = None
+        test_ds = None
+        graphs = create_synthetic_dataset(
+            num_graphs=args.num_graphs,
+            min_nodes=args.min_nodes,
+            max_nodes=args.max_nodes,
+            num_node_types=args.num_node_types,
+            num_edge_types=args.num_edge_types,
+            p_edge=args.p_edge,
+        )
+        dataset_infos = compute_dataset_infos(graphs, args.num_node_types, args.num_edge_types)
+    else:
+        graphs, val_ds, test_ds, dataset_infos, nt, et, test_labels = load_real_dataset(
+            args.dataset, root=args.data_root,
+            conditional=getattr(args, 'conditional', False),
+            target=getattr(args, 'target', 'mu'),
+            remove_h=getattr(args, 'remove_h', None))
+        args.num_node_types = nt
+        args.num_edge_types = et
+
+    val_labels = None
+    if getattr(args, 'conditional', False) and val_ds is not None:
+        val_labels_list = []
+        for i in range(len(val_ds)):
+            g = val_ds[i]
+            y_val = g.y if g.y is not None else np.zeros((1, 0), dtype=np.float32)
+            if isinstance(y_val, np.ndarray):
+                y_np = y_val.flatten()
+            else:
+                y_np = tlx.convert_to_numpy(y_val).flatten()
+            val_labels_list.append(y_np)
+        if len(val_labels_list) > 0 and len(val_labels_list[0]) > 0:
+            val_labels = tlx.convert_to_tensor(
+                np.stack(val_labels_list, axis=0).astype(np.float32))
+
+    print(f"Dataset: {len(graphs)} graphs, max_nodes={dataset_infos['max_n_nodes']}")
+    print(f"  Node type distribution: {dataset_infos['node_types']}")
+    print(f"  Edge type distribution: {dataset_infos['edge_types']}")
+
+    # ------- Noise Distribution -------
+    noise_dist = NoiseDistribution(args.transition, dataset_infos)
+    limit_dist = noise_dist.get_limit_dist()
+
+    # ------- Extra Features -------
+    extra_features = ExtraFeatures(
+        extra_features_type=args.extra_features,
+        rrwp_steps=args.rrwp_steps,
+        dataset_info=dataset_infos,
+    )
+    # ------- Domain-specific features -------
+    is_molecular = args.dataset in ('qm9', 'guacamol', 'zinc250k', 'moses')
+    if is_molecular:
+        domain_features = ExtraMolecularFeatures(dataset_infos=dataset_infos)
+    else:
+        domain_features = DummyExtraFeatures()
+
+    # ------- Compute input dimensions by doing a dry run -------
+    # Create a dummy noisy data to infer feature sizes
+    dummy_g = graphs[0]
+    dummy_X = tlx.expand_dims(dummy_g.x, axis=0)  # (1, n, dx)
+    dummy_n = dummy_X.shape[1]
+    dummy_E_np = np.zeros((1, dummy_n, dummy_n, args.num_edge_types), dtype=np.float32)
+    dummy_E_np[0, :, :, 0] = 1.0
+    dummy_E = tlx.convert_to_tensor(dummy_E_np)
+    dummy_mask = tlx.convert_to_tensor(np.ones((1, dummy_n), dtype=bool))
+    # Determine y dimension: conditional datasets have non-empty y
+    n_cond = 0
+    if getattr(args, 'conditional', False) and test_labels is not None:
+        n_cond = test_labels.shape[-1]
+    dummy_y = tlx.zeros([1, n_cond], dtype=tlx.float32)
+    dummy_t = tlx.convert_to_tensor(np.array([[0.5]], dtype=np.float32))
+
+    # Add virtual classes
+    dummy_X_v, dummy_E_v = noise_dist.add_virtual_classes(dummy_X, dummy_E)
+
+    dummy_noisy = {
+        't': dummy_t,
+        'X_t': dummy_X_v,
+        'E_t': dummy_E_v,
+        'y_t': dummy_y,
+        'node_mask': dummy_mask,
+    }
+    extra_dummy = compute_extra_data(dummy_noisy, extra_features,
+                                      domain_features, noise_dist)
+
+    input_X_dim = dummy_X_v.shape[-1] + extra_dummy.X.shape[-1]
+    input_E_dim = dummy_E_v.shape[-1] + extra_dummy.E.shape[-1]
+    input_y_dim = dummy_y.shape[-1] + extra_dummy.y.shape[-1]
+
+    # Check for saved model_config.json (for loading checkpoints trained with different code)
+    model_config_path = os.path.join(args.save_dir, 'model_config.json')
+    if os.path.exists(model_config_path):
+        import json as _json
+        with open(model_config_path, 'r') as _f:
+            _cfg = _json.load(_f)
+        output_dims = _cfg.get('output_dims', noise_dist.get_noise_dims())
+        print(f"[config] Loaded output_dims from {model_config_path}")
+    else:
+        # Use dataset_infos output_dims (matches original DeFoG behavior)
+        output_dims = dataset_infos['output_dims']
+
+    input_dims = {'X': input_X_dim, 'E': input_E_dim, 'y': input_y_dim}
+
+    print(f"Input dims: {input_dims}")
+    print(f"Output dims: {output_dims}")
+
+    # ------- Model -------
+    hidden_mlp_dims = {'X': args.hidden_mlp_X, 'E': args.hidden_mlp_E, 'y': args.hidden_mlp_y}
+    hidden_dims = {
+        'dx': args.dx, 'de': args.de, 'dy': args.dy,
+        'n_head': args.n_head,
+        'dim_ffX': args.dim_ffX, 'dim_ffE': args.dim_ffE, 'dim_ffy': args.dim_ffy,
+    }
+
+    model = DeFoGModel(
+        n_layers=args.n_layers,
+        input_dims=input_dims,
+        hidden_mlp_dims=hidden_mlp_dims,
+        hidden_dims=hidden_dims,
+        output_dims=output_dims,
+        name='DeFoG',
+    )
+    print(f"Model created with {args.n_layers} layers")
+
+    # ------- Loss wrapper -------
+    loss_fn = TrainLossDiscrete(
+        lambda_train=[args.lambda_E, args.lambda_y],
+        kld=getattr(args, 'kld', False),
+    )
+
+    # ------- Time distorter -------
+    time_distorter = TimeDistorter(
+        train_distortion=args.train_distortion,
+        sample_distortion=args.sample_distortion,
+    )
+
+    conditional = getattr(args, 'conditional', False) and n_cond > 0
+    loss_wrapper = DeFoGWithLoss(
+        backbone=model,
+        loss_fn=loss_fn,
+        noise_dist=noise_dist,
+        time_distorter=time_distorter,
+        extra_features=extra_features,
+        domain_features=domain_features,
+        conditional=conditional,
+    )
+
+    # ------- Rate matrix designer -------
+    rate_designer = RateMatrixDesigner(
+        rdb=args.rdb,
+        rdb_crit=args.rdb_crit,
+        eta=args.eta,
+        omega=args.omega,
+        limit_dist=limit_dist,
+    )
+
+    # ------- Optimizer -------
+    import torch as _torch
+
+    # AdamW wrapper: provides the tlx.optimizers interface around torch.optim.AdamW
+    # with NaN/Inf gradient sanitization.
+    class _AdamWWrapper(tlx.optimizers.Adam):
+        def __init__(self, lr, weight_decay, amsgrad, grad_clip=None):
+            self.amsgrad = amsgrad
+            super().__init__(lr=lr, weight_decay=weight_decay, grad_clip=grad_clip)
+        def gradient(self, loss, weights=None, return_grad=True):
+            if weights is None:
+                raise AttributeError("Parameter train_weights must be entered.")
+            if not self.init_optim:
+                self.optimizer_adam = _torch.optim.AdamW(
+                    params=weights, lr=self.lr,
+                    betas=(self.beta_1, self.beta_2), eps=self.eps,
+                    weight_decay=self.weight_decay, amsgrad=self.amsgrad)
+                self.init_optim = True
+            self.optimizer_adam.zero_grad()
+            if not _torch.isfinite(loss):
+                print("[warn:optim] Non-finite loss; skipping step", flush=True)
+                return [_torch.zeros_like(w) for w in weights] if return_grad else None
+            loss.backward()
+            for w in weights:
+                if w.grad is not None and not _torch.isfinite(w.grad).all():
+                    w.grad.data = _torch.nan_to_num(w.grad.data, nan=0.0, posinf=0.0, neginf=0.0)
+            if self.grad_clip is not None:
+                gn = self.grad_clip(weights)
+                if isinstance(gn, _torch.Tensor) and not _torch.isfinite(gn):
+                    for w in weights:
+                        if w.grad is not None:
+                            w.grad.zero_()
+            return [w.grad for w in weights] if return_grad else None
+        def apply_gradients(self, grads_and_vars=None, closure=None):
+            if not self.init_optim:
+                raise AttributeError("Call gradient() first.")
+            return self.optimizer_adam.step(closure) if closure else self.optimizer_adam.step()
+
+    optimizer = _AdamWWrapper(
+        lr=args.lr,
+        weight_decay=args.weight_decay,
+        amsgrad=True,
+    )
+
+    if args.grad_clip_norm is not None:
+        optimizer.grad_clip = lambda weights: _torch.nn.utils.clip_grad_norm_(
+            [w for w in weights], max_norm=args.grad_clip_norm)
+    print("[debug] Creating TrainOneStep...")
+    train_one_step = TrainOneStep(loss_wrapper, optimizer, model.trainable_weights)
+    print("[debug] TrainOneStep created")
+
+    # DataLoader with seeded shuffle
+    print(f"[debug] Creating DataLoader with batch_size={args.batch_size}...")
+
+    class SeededRandomSampler:
+        """Reproducible random sampler: each epoch shuffles with seed + epoch."""
+        def __init__(self, data_source, seed=42):
+            self.data_source = data_source
+            self.seed = seed
+            self.epoch = 0
+        def __iter__(self):
+            rng = np.random.default_rng(self.seed + self.epoch)
+            indices = np.arange(len(self.data_source))
+            rng.shuffle(indices)
+            self.epoch += 1
+            for idx in indices:
+                yield int(idx)
+        def __len__(self):
+            return len(self.data_source)
+
+    from tensorlayerx.dataflow import BatchSampler
+    from gammagl.loader.dataloader import Collater
+    sampler = SeededRandomSampler(graphs, seed=args.seed)
+    batch_sampler = BatchSampler(sampler, args.batch_size, drop_last=False)
+    
+    try:
+        import torch
+        from torch.utils.data import DataLoader as TorchDataLoader
+        loader = TorchDataLoader(
+            graphs,
+            batch_sampler=batch_sampler,
+            collate_fn=Collater(follow_batch=None, exclude_keys=None),
+            num_workers=8,
+            pin_memory=True,
+            persistent_workers=True,
+            multiprocessing_context='spawn'
+        )
+        print("[debug] DataLoader created (seeded shuffle) with num_workers=8 (PyTorch)")
+    except Exception as e:
+        print(f"[warn] PyTorch DataLoader failed: {e}. Falling back to default.")
+        loader = DataLoader(graphs, batch_sampler=batch_sampler, collate_fn=Collater(follow_batch=None, exclude_keys=None))
+        print("[debug] DataLoader created (seeded shuffle)")
+
+    # EMA (Exponential Moving Average)
+    ema = None
+    if args.ema_decay > 0:
+        ema = EMA(model, decay=args.ema_decay)
+        print(f"EMA enabled with decay={args.ema_decay}")
+
+    if args.sample:
+        print("\nSkipping training because --sample was specified.")
+    else:
+        # Resume from checkpoint if specified
+        start_epoch = getattr(args, 'start_epoch', 0) or 0
+        if args.resume_from:
+            ckpt_path = os.path.join(args.resume_from, 'last_model.npz')
+            ema_path = os.path.join(args.resume_from, 'last_ema.pkl')
+            if not os.path.exists(ckpt_path):
+                ckpt_path = os.path.join(args.resume_from, 'best_model.npz')
+                ema_path = os.path.join(args.resume_from, 'best_ema.pkl')
+            if os.path.exists(ckpt_path):
+                model.load_weights(ckpt_path, format='npz_dict')
+                print(f"Resumed model weights from {ckpt_path}, starting at epoch {start_epoch}")
+                if ema is not None and os.path.exists(ema_path):
+                    with open(ema_path, 'rb') as f:
+                        ema.load_state_dict(f.read())
+                    print(f"Resumed EMA weights from {ema_path}")
+            else:
+                print(f"WARNING: --resume_from={args.resume_from} but no checkpoint found, training from scratch")
+
+        print(f"\nStarting training for {args.n_epochs} epochs (from epoch {start_epoch})...")
+        saved_checkpoints = []
+        max_saved_checkpoints = 5
+        val_counter = 0
+
+        for epoch in range(start_epoch, args.n_epochs):
+            model.set_train()
+            loss_fn.reset()
+            total_loss = 0.0
+            n_batches = 0
+
+            for batch_idx, batch in enumerate(loader):
+                batch.tensor()
+                # Convert sparse batch to dense
+                dense, node_mask = to_dense(batch.x, batch.edge_index,
+                                             batch.edge_attr, batch.batch)
+                bs = dense.X.shape[0]
+
+                # Extract y (conditional labels or empty)
+                if conditional and hasattr(batch, 'y') and batch.y is not None:
+                    y_np = tlx.convert_to_numpy(batch.y)
+                    if y_np.ndim == 1:
+                        y_np = y_np.reshape(bs, -1)
+                    elif y_np.ndim > 2:
+                        y_np = y_np.reshape(bs, -1)
+                    # Filter out dummy y values (shape[1]==0)
+                    if y_np.shape[-1] == 0:
+                        y = tlx.zeros([bs, 0], dtype=tlx.float32)
+                    else:
+                        y = tlx.convert_to_tensor(y_np.astype(np.float32))
+                else:
+                    y = tlx.zeros([bs, 0], dtype=tlx.float32)
+
+                data_dict = {
+                    'X': dense.X,
+                    'E': dense.E,
+                    'y': y,
+                    'node_mask': node_mask,
+                }
+
+                if y.shape[-1] > 0 and y.dtype != tlx.float32:
+                    y = tlx.cast(y, tlx.float32)
+                    data_dict['y'] = y
+
+                loss = train_one_step(data_dict, None)
+                loss_val = float(loss) if isinstance(loss, (int, float)) else \
+                    float(loss.item() if hasattr(loss, 'item') else np.asarray(loss).item())
+                total_loss += loss_val
+                n_batches += 1
+
+                if batch_idx % 20 == 0:
+                    print(f"  Epoch {epoch + 1}, Batch {batch_idx}, loss={loss_val:.4f}")
+
+                # Update EMA after each training step
+                if ema is not None:
+                    ema.update(model)
+
+            avg_loss = total_loss / max(n_batches, 1)
+
+            epoch_metrics = loss_fn.log_epoch_metrics()
+            print(f"  Epoch {epoch + 1}/{args.n_epochs}: loss={avg_loss:.4f} "
+                  f"X_CE={epoch_metrics['x_CE']:.4f} E_CE={epoch_metrics['E_CE']:.4f}")
+
+            should_validate = (
+                args.check_val_every_n_epochs > 0 and
+                (epoch + 1) % args.check_val_every_n_epochs == 0
+            )
+
+            if should_validate:
+                val_counter += 1
+                save_model_snapshot(model, ema, args.save_dir, 'last', output_dims)
+
+                if args.sample_every_val > 0 and val_counter % args.sample_every_val == 0:
+                    print(f"\nValidation sampling at epoch {epoch + 1}...")
+                    val_batch_size = max(1, int(args.val_num_samples))
+                    cond_labels = None
+                    if conditional and val_labels is not None:
+                        perm = np.random.permutation(tlx.convert_to_numpy(val_labels).shape[0])
+                        idx = perm[:val_batch_size]
+                        cond_labels = val_labels[idx]
+                        print(f"  Using classifier-free guidance (weight={args.guidance_weight})")
+
+                    if ema is not None:
+                        ema.swap_in(model)
+                    try:
+                        sample_bs = getattr(args, 'sample_batch_size', 0) or val_batch_size
+                        if sample_bs >= val_batch_size:
+                            generated_val = sample_batch(
+                                model=model,
+                                noise_dist=noise_dist,
+                                rate_matrix_designer=rate_designer,
+                                time_distorter=time_distorter,
+                                extra_features=extra_features,
+                                domain_features=domain_features,
+                                node_dist=dataset_infos['node_dist'],
+                                sample_steps=args.sample_steps,
+                                batch_size=val_batch_size,
+                                conditional=conditional,
+                                cond_labels=cond_labels,
+                                guidance_weight=args.guidance_weight,
+                            )
+                        else:
+                            all_generated_batches = []
+                            num_batches = (val_batch_size + sample_bs - 1) // sample_bs
+                            for b_idx in range(num_batches):
+                                current_bs = min(sample_bs, val_batch_size - len(all_generated_batches))
+                                print(f"  Validation batch {b_idx + 1}/{num_batches} (size={current_bs})...")
+                                batch_cond = None
+                                if cond_labels is not None:
+                                    start_idx = b_idx * sample_bs
+                                    batch_cond = cond_labels[start_idx:start_idx + current_bs]
+                                batch_generated = sample_batch(
+                                    model=model,
+                                    noise_dist=noise_dist,
+                                    rate_matrix_designer=rate_designer,
+                                    time_distorter=time_distorter,
+                                    extra_features=extra_features,
+                                    domain_features=domain_features,
+                                    node_dist=dataset_infos['node_dist'],
+                                    sample_steps=args.sample_steps,
+                                    batch_size=current_bs,
+                                    conditional=conditional,
+                                    cond_labels=batch_cond,
+                                    guidance_weight=args.guidance_weight,
+                                )
+                                all_generated_batches.extend(batch_generated)
+                            generated_val = all_generated_batches
+
+                        val_metrics = evaluate_generated_graphs(
+                            generated_val,
+                            args.dataset,
+                            graphs,
+                            val_ds,
+                            dataset_infos,
+                            reference_graphs=[val_ds[i] for i in range(min(len(val_ds), 200))] if val_ds is not None else None,
+                            train_graphs=graphs,
+                            cache_dir=args.save_dir,
+                            cond_labels=cond_labels,
+                        )
+                    finally:
+                        if ema is not None:
+                            ema.swap_out(model)
+
+                    # Save rolling N checkpoints instead of relying on selection_score
+                    ckpt_prefix = f'epoch_{epoch + 1}'
+                    save_model_snapshot(model, ema, args.save_dir, ckpt_prefix, output_dims)
+                    saved_checkpoints.append(ckpt_prefix)
+                    print(f"  Saved checkpoint at epoch {epoch + 1}")
+                    
+                    if len(saved_checkpoints) > max_saved_checkpoints:
+                        old_prefix = saved_checkpoints.pop(0)
+                        old_model_path = os.path.join(args.save_dir, f'{old_prefix}_model.npz')
+                        old_ema_path = os.path.join(args.save_dir, f'{old_prefix}_ema.pkl')
+                        if os.path.exists(old_model_path):
+                            os.remove(old_model_path)
+                        if os.path.exists(old_ema_path):
+                            os.remove(old_ema_path)
+                        print(f"  Removed old checkpoint {old_prefix}")
+
+        save_model_snapshot(model, ema, args.save_dir, 'last', output_dims)
+        print("\nTraining complete. Last snapshot saved.")
+
+    # ------- Sampling & Evaluation -------
+    if args.sample:
+        _, sampling_ema = load_model_snapshot_for_sampling(
+            model, args.save_dir, ema_decay=args.ema_decay)
+
+        num_folds = max(1, args.num_sample_fold)
+        all_fold_metrics = []
+
+        for fold in range(num_folds):
+            print(f"\n--- Sampling fold {fold + 1}/{num_folds} ---")
+            print(f"Generating {args.num_samples} graphs with {args.sample_steps} steps...")
+
+            cond_labels = None
+            if conditional and test_labels is not None:
+                perm = np.random.permutation(tlx.convert_to_numpy(test_labels).shape[0])
+                idx = perm[:args.num_samples]
+                cond_labels = test_labels[idx]
+                print(f"  Using classifier-free guidance (weight={args.guidance_weight})")
+
+            # Support memory-constrained sampling via sample_batch_size
+            sample_bs = getattr(args, 'sample_batch_size', 0) or args.num_samples
+            if sample_bs >= args.num_samples:
+                generated = sample_batch(
+                    model=model,
+                    noise_dist=noise_dist,
+                    rate_matrix_designer=rate_designer,
+                    time_distorter=time_distorter,
+                    extra_features=extra_features,
+                    domain_features=domain_features,
+                    node_dist=dataset_infos['node_dist'],
+                    sample_steps=args.sample_steps,
+                    batch_size=args.num_samples,
+                    conditional=conditional,
+                    cond_labels=cond_labels,
+                    guidance_weight=args.guidance_weight,
+                )
+            else:
+                all_generated_batches = []
+                num_batches = (args.num_samples + sample_bs - 1) // sample_bs
+                for b_idx in range(num_batches):
+                    current_bs = min(sample_bs, args.num_samples - len(all_generated_batches))
+                    print(f"  Sampling batch {b_idx + 1}/{num_batches} (size={current_bs})...")
+                    batch_cond = None
+                    if cond_labels is not None:
+                        start = b_idx * sample_bs
+                        end = start + current_bs
+                        batch_cond = cond_labels[start:end]
+                    batch_generated = sample_batch(
+                        model=model,
+                        noise_dist=noise_dist,
+                        rate_matrix_designer=rate_designer,
+                        time_distorter=time_distorter,
+                        extra_features=extra_features,
+                        domain_features=domain_features,
+                        node_dist=dataset_infos['node_dist'],
+                        sample_steps=args.sample_steps,
+                        batch_size=current_bs,
+                        conditional=conditional,
+                        cond_labels=batch_cond,
+                        guidance_weight=args.guidance_weight,
+                    )
+                    all_generated_batches.extend(batch_generated)
+                generated = all_generated_batches
+
+            print(f"Generated {len(generated)} graphs:")
+            for i, (x, e) in enumerate(generated[:5]):
+                n_edges = int(np.sum(e > 0)) // 2
+                print(f"  Graph {i}: {len(x)} nodes, {n_edges} edges")
+
+            save_name = f'generated_graphs_fold{fold}.npy' if num_folds > 1 else 'generated_graphs.npy'
+            save_path = os.path.join(args.save_dir, save_name)
+            import pickle
+            with open(save_path, 'wb') as f:
+                pickle.dump(generated, f)
+            print(f"Saved to {save_path}")
+
+            if args.evaluate:
+                print("\n" + "=" * 60)
+                print(f"EVALUATION (fold {fold + 1}/{num_folds})")
+                print("=" * 60)
+                fold_metrics = evaluate_generated_graphs(
+                    generated,
+                    args.dataset,
+                    graphs,
+                    test_ds,
+                    dataset_infos,
+                    cache_dir=args.save_dir,
+                    cond_labels=cond_labels,
+                )
+                all_fold_metrics.append(fold_metrics)
+
+        if num_folds > 1 and len(all_fold_metrics) > 1:
+            print("\n" + "=" * 60)
+            print(f"MULTI-FOLD SUMMARY ({num_folds} folds)")
+            print("=" * 60)
+            all_keys = sorted(set().union(*all_fold_metrics))
+            for key in all_keys:
+                vals = [m[key] for m in all_fold_metrics if key in m]
+                if vals:
+                    mean_val = np.mean(vals)
+                    std_val = np.std(vals)
+                    print(f"  {key}: {mean_val:.6f} +/- {std_val:.6f}")
+
+        if sampling_ema is not None:
+            sampling_ema.swap_out(model)
+
+    print("Done!")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='DeFoG: Discrete Flow Matching for Graph Generation')
+
+    # Dataset
+    parser.add_argument('--dataset', type=str, default='synthetic',
+                        choices=['synthetic', 'planar', 'tree', 'sbm', 'comm20', 'qm9',
+                                 'guacamol', 'zinc250k', 'moses', 'tls'],
+                        help='Dataset to use')
+    parser.add_argument('--data_root', type=str, default=None,
+                        help='Root directory for real datasets')
+    qm9_h_group = parser.add_mutually_exclusive_group()
+    qm9_h_group.add_argument('--remove_h', dest='remove_h', action='store_true',
+                             help='Use QM9 without hydrogens')
+    qm9_h_group.add_argument('--with_h', dest='remove_h', action='store_false',
+                             help='Use QM9 with hydrogens')
+    parser.set_defaults(remove_h=None)
+    parser.add_argument('--use_defog_split', action='store_true',
+                        help='Use DeFoG original CSV split for QM9 instead of random split')
+    parser.add_argument('--num_graphs', type=int, default=200)
+    parser.add_argument('--min_nodes', type=int, default=10)
+    parser.add_argument('--max_nodes', type=int, default=20)
+    parser.add_argument('--num_node_types', type=int, default=2)
+    parser.add_argument('--num_edge_types', type=int, default=2)
+    parser.add_argument('--p_edge', type=float, default=0.3)
+
+    # Conditional generation (classifier-free guidance)
+    parser.add_argument('--conditional', action='store_true',
+                        help='Enable classifier-free guidance conditional generation')
+    parser.add_argument('--target', type=str, default='mu',
+                        choices=['mu', 'homo', 'both', 'k2'],
+                        help='Target property for conditional generation (QM9: mu/homo/both; TLS: k2)')
+    parser.add_argument('--guidance_weight', type=float, default=2.0,
+                        help='Classifier-free guidance weight')
+
+    # Model
+    parser.add_argument('--n_layers', type=int, default=5)
+    parser.add_argument('--hidden_mlp_X', type=int, default=256)
+    parser.add_argument('--hidden_mlp_E', type=int, default=128)
+    parser.add_argument('--hidden_mlp_y', type=int, default=128)
+    parser.add_argument('--dx', type=int, default=256)
+    parser.add_argument('--de', type=int, default=64)
+    parser.add_argument('--dy', type=int, default=64)
+    parser.add_argument('--n_head', type=int, default=8)
+    parser.add_argument('--dim_ffX', type=int, default=256)
+    parser.add_argument('--dim_ffE', type=int, default=128)
+    parser.add_argument('--dim_ffy', type=int, default=128)
+
+    # Training
+    parser.add_argument('--n_epochs', type=int, default=100)
+    parser.add_argument('--batch_size', type=int, default=32)
+    parser.add_argument('--lr', type=float, default=2e-4)
+    parser.add_argument('--weight_decay', type=float, default=1e-12)
+    parser.add_argument('--grad_clip_norm', type=float, default=None,
+                        help='Max norm for gradient clipping (default: disabled, matching original DeFoG)')
+    parser.add_argument('--ema_decay', type=float, default=0.0,
+                        help='EMA decay (0 = disabled, typical: 0.999)')
+    parser.add_argument('--kld', action='store_true',
+                        help='Use KL-divergence loss instead of cross-entropy')
+    parser.add_argument('--lambda_E', type=float, default=5.0)
+    parser.add_argument('--lambda_y', type=float, default=0.0)
+    parser.add_argument('--transition', type=str, default='marginal')
+    parser.add_argument('--extra_features', type=str, default='rrwp')
+    parser.add_argument('--rrwp_steps', type=int, default=12)
+    parser.add_argument('--train_distortion', type=str, default='identity')
+
+    # Sampling
+    parser.add_argument('--sample', action='store_true')
+    parser.add_argument('--evaluate', action='store_true',
+                        help='Evaluate generated graphs (molecular or synthetic metrics)')
+    parser.add_argument('--sample_steps', type=int, default=100)
+    parser.add_argument('--sample_distortion', type=str, default='identity')
+    parser.add_argument('--eta', type=float, default=0.0)
+    parser.add_argument('--omega', type=float, default=0.0)
+    parser.add_argument('--rdb', type=str, default='general')
+    parser.add_argument('--rdb_crit', type=str, default='max_marginal')
+    parser.add_argument('--num_samples', type=int, default=20)
+    parser.add_argument('--num_sample_fold', type=int, default=1,
+                        help='Number of sampling folds for evaluation (reports mean±std)')
+    parser.add_argument('--sample_batch_size', type=int, default=0,
+                        help='Batch size for sampling (0 = use num_samples, for memory-constrained evaluation)')
+    parser.add_argument('--sample_every_val', type=int, default=0,
+                        help='Run validation sampling every N validation events (0 = disabled)')
+    parser.add_argument('--check_val_every_n_epochs', type=int, default=0,
+                        help='Run validation cadence every N epochs (0 = disabled)')
+    parser.add_argument('--val_num_samples', type=int, default=40,
+                        help='Number of samples used during validation selection')
+
+    # Resume
+    parser.add_argument('--resume_from', type=str, default=None,
+                        help='Checkpoint directory to resume from (loads last_model.npz)')
+    parser.add_argument('--start_epoch', type=int, default=0,
+                        help='Epoch to start from (0-indexed, used with --resume_from)')
+
+    # System
+    parser.add_argument('--gpu', type=int, default=0)
+    parser.add_argument('--save_dir', type=str, default='./checkpoints')
+    parser.add_argument('--seed', type=int, default=42)
+
+    args = parser.parse_args()
+    args = apply_dataset_preset(args, parser)
+
+    # Set random seed
+    np.random.seed(args.seed)
+    random.seed(args.seed)
+    import torch
+    torch.manual_seed(args.seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(args.seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+    # Set device
+    if args.gpu >= 0:
+        tlx.set_device('GPU', args.gpu)
+    else:
+        tlx.set_device('CPU')
+
+    # Create save directory
+    os.makedirs(args.save_dir, exist_ok=True)
+
+    main(args)
diff --git a/examples/defog/defog_utils.py b/examples/defog/defog_utils.py
new file mode 100644
index 000000000..36d3980ba
--- /dev/null
+++ b/examples/defog/defog_utils.py
@@ -0,0 +1,285 @@
+import math
+import numpy as np
+import tensorlayerx as tlx
+
+def _to_dense_batch(x, batch):
+    r"""Backend-agnostic conversion from sparse batched node features to dense.
+
+    Parameters
+    ----------
+    x : tensor
+        Node features ``(N_total, dx)``.
+    batch : tensor
+        Batch assignment vector ``(N_total,)``.
+
+    Returns
+    -------
+    tuple
+        ``(X, mask)`` where X is ``(bs, n_max, dx)`` and mask is ``(bs, n_max)`` bool.
+    """
+    x_np = tlx.convert_to_numpy(x)
+    if batch is None:
+        X = x_np[np.newaxis, :, :]
+        mask = np.ones((1, x_np.shape[0]), dtype=bool)
+        return tlx.convert_to_tensor(X.astype(np.float32)), \
+               tlx.convert_to_tensor(mask)
+
+    batch_np = tlx.convert_to_numpy(batch).astype(np.int64)
+    batch_size = int(batch_np.max()) + 1
+    num_nodes = np.bincount(batch_np, minlength=batch_size)
+    max_num_nodes = int(num_nodes.max())
+    dx = x_np.shape[-1]
+
+    cum_nodes = np.concatenate([[0], np.cumsum(num_nodes)[:-1]])
+
+    X = np.zeros((batch_size, max_num_nodes, dx), dtype=np.float32)
+    mask = np.zeros((batch_size, max_num_nodes), dtype=bool)
+
+    for i in range(len(batch_np)):
+        b = batch_np[i]
+        local_idx = i - cum_nodes[b]
+        X[b, local_idx] = x_np[i]
+        mask[b, local_idx] = True
+
+    return tlx.convert_to_tensor(X), tlx.convert_to_tensor(mask)
+
+class PlaceHolder:
+    def __init__(self, X, E, y=None):
+        self.X = X
+        self.E = E
+        self.y = y
+
+    def mask(self, node_mask):
+        X, E = apply_node_mask(self.X, self.E, node_mask)
+        return PlaceHolder(X=X, E=E, y=self.y)
+
+    def split(self, node_mask):
+        r"""Split a batched PlaceHolder into a list of individual graphs."""
+        bs = node_mask.shape[0]
+        n_nodes = tlx.reduce_sum(tlx.cast(node_mask, tlx.int64), axis=1)
+        n_nodes = tlx.convert_to_numpy(n_nodes)
+        graphs = []
+        X_np = tlx.convert_to_numpy(self.X)
+        E_np = tlx.convert_to_numpy(self.E)
+        for i in range(bs):
+            n = int(n_nodes[i])
+            xi = tlx.convert_to_tensor(X_np[i, :n])
+            ei = tlx.convert_to_tensor(E_np[i, :n, :n])
+            graphs.append((xi, ei))
+        return graphs
+
+    def __repr__(self):
+        x_shape = self.X.shape if hasattr(self.X, 'shape') else self.X
+        e_shape = self.E.shape if hasattr(self.E, 'shape') else self.E
+        y_shape = self.y.shape if (self.y is not None and hasattr(self.y, 'shape')) else self.y
+        return f"PlaceHolder(X={x_shape}, E={e_shape}, y={y_shape})"
+
+
+# ============================================================
+# Dense conversion utilities
+# ============================================================
+
+def apply_node_mask(X, E, node_mask):
+    """Zero-out features of padded nodes (node_mask=False)."""
+    x_mask = tlx.expand_dims(tlx.cast(node_mask, X.dtype), axis=-1)
+    e_mask = tlx.expand_dims(x_mask, axis=2) * tlx.expand_dims(x_mask, axis=1)
+    
+    X_masked = X * x_mask
+    E_masked = E * e_mask
+    return X_masked, E_masked
+
+
+def _to_dense_adj(edge_index, batch, edge_attr=None, max_num_nodes=None):
+    """Robust conversion of sparse adjacency to dense using numpy."""
+    batch_np = tlx.convert_to_numpy(batch)
+    ei_np = tlx.convert_to_numpy(edge_index)
+    ea_np = tlx.convert_to_numpy(edge_attr) if edge_attr is not None else None
+    
+    bs = int(np.max(batch_np)) + 1 if len(batch_np) > 0 else 1
+    if max_num_nodes is None:
+        nodes_per_graph = np.bincount(batch_np, minlength=bs)
+        max_num_nodes = int(np.max(nodes_per_graph))
+        
+    de = 1 if ea_np is None else (ea_np.shape[-1] if len(ea_np.shape) > 1 else 1)
+    adj = np.zeros((bs, max_num_nodes, max_num_nodes, de), dtype=np.float32)
+    
+    # Compute node offsets within each graph
+    cum_nodes = np.zeros(bs + 1, dtype=np.int64)
+    nodes_per_graph = np.bincount(batch_np, minlength=bs)
+    cum_nodes[1:] = np.cumsum(nodes_per_graph)
+    
+    if len(ei_np[0]) > 0:
+        graph_idx = batch_np[ei_np[0]]
+        src = ei_np[0] - cum_nodes[graph_idx]
+        dst = ei_np[1] - cum_nodes[graph_idx]
+        
+        valid = (src < max_num_nodes) & (dst < max_num_nodes)
+        graph_idx = graph_idx[valid]
+        src = src[valid]
+        dst = dst[valid]
+        
+        if ea_np is not None:
+            vals = ea_np[valid]
+            if len(vals.shape) == 1:
+                vals = vals.reshape(-1, 1)
+        else:
+            vals = np.ones((len(src), 1), dtype=np.float32)
+            
+        adj[graph_idx, src, dst, :] = vals
+        
+    if ea_np is None or (len(ea_np.shape) == 1):
+        adj = adj.squeeze(-1)
+        
+    return tlx.convert_to_tensor(adj)
+
+
+def to_dense(x, edge_index, edge_attr, batch, num_nodes=None):
+    r"""Convert sparse graph to dense representation.
+
+    Parameters
+    ----------
+    x : tensor
+        Node features ``(N_total, dx)``.
+    edge_index : tensor
+        Edge indices ``(2, E_total)``.
+    edge_attr : tensor
+        Edge attributes ``(E_total, de)``.
+    batch : tensor
+        Batch assignment vector ``(N_total,)``.
+
+    Returns
+    -------
+    PlaceHolder, node_mask
+        Dense graph data and boolean node mask.
+    """
+    X, node_mask = _to_dense_batch(x, batch)
+
+    max_num_nodes = X.shape[1]
+
+    # Remove self-loops
+    src = edge_index[0]
+    dst = edge_index[1]
+    mask = src != dst
+    edge_index_clean = edge_index[:, mask]
+    edge_attr_clean = edge_attr[mask] if edge_attr is not None else None
+
+    E = _to_dense_adj(edge_index_clean, batch, edge_attr_clean,
+                       max_num_nodes=max_num_nodes)
+
+    if len(E.shape) == 3:
+        E = tlx.expand_dims(E, axis=-1)
+
+    E = encode_no_edge(E)
+
+    # Apply node_mask to zero out padding positions
+    X, E = apply_node_mask(X, E, node_mask)
+
+    node_mask = tlx.cast(node_mask, tlx.bool)
+    return PlaceHolder(X=X, E=E, y=None), node_mask
+
+
+def encode_no_edge(E):
+    r"""Encode 'no-edge' as the first channel (index 0).
+
+    Parameters
+    ----------
+    E : tensor
+        Edge features ``(bs, n, n, de)``.
+
+    Returns
+    -------
+    tensor
+        Modified E with ``E[:,:,:,0] = 1`` where no edge exists.
+    """
+    if len(E.shape) != 4:
+        return E
+    if E.shape[-1] == 0:
+        return E
+
+    E_np = tlx.convert_to_numpy(E)
+    no_edge = np.sum(E_np, axis=3) == 0
+    E_np[:, :, :, 0][no_edge] = 1.0
+
+    n = E_np.shape[1]
+    for i in range(n):
+        E_np[:, i, i, :] = 0.0
+
+    return tlx.convert_to_tensor(E_np, dtype=E.dtype)
+
+
+# ============================================================
+# EMA (Exponential Moving Average)
+# ============================================================
+
+class EMA:
+    r"""Exponential Moving Average of model parameters.
+
+    Maintains shadow copies of all trainable parameters and updates them
+    after each training step: ``shadow = decay * shadow + (1 - decay) * param``.
+
+    During sampling/evaluation, the EMA weights are swapped in place of the
+    original weights, then restored afterwards.
+
+    Parameters
+    ----------
+    model : tlx.nn.Module
+        The model whose parameters to track.
+    decay : float
+        EMA decay factor (e.g. 0.999). Higher = slower update.
+    """
+    def __init__(self, model, decay=0.999):
+        self.decay = decay
+        self.shadow_params = {}
+        self._backup_params = None
+        # Initialize shadow parameters as clones of model params
+        for name, param in model.named_parameters():
+            self.shadow_params[name] = param.clone().detach()
+
+    def update(self, model):
+        """Update shadow parameters after a training step."""
+        for name, param in model.named_parameters():
+            if name in self.shadow_params:
+                new_val = self.decay * self.shadow_params[name] + (1.0 - self.decay) * param.detach()
+                self.shadow_params[name] = new_val
+            else:
+                self.shadow_params[name] = param.clone().detach()
+
+    def swap_in(self, model):
+        """Replace model parameters with EMA shadow parameters."""
+        self._backup_params = {}
+        for name, param in model.named_parameters():
+            self._backup_params[name] = param.clone().detach()
+            param.data.copy_(self.shadow_params[name])
+
+    def swap_out(self, model):
+        """Restore original model parameters from backup."""
+        if self._backup_params is not None:
+            for name, param in model.named_parameters():
+                param.data.copy_(self._backup_params[name])
+            self._backup_params = None
+
+    def state_dict(self):
+        """Return state for saving."""
+        import pickle, io
+        buf = io.BytesIO()
+        shadow_np = {k: tlx.convert_to_numpy(v) for k, v in self.shadow_params.items()}
+        pickle.dump({'shadow_params': shadow_np, 'decay': self.decay}, buf)
+        return buf.getvalue()
+
+    def load_state_dict(self, state_bytes):
+        """Load state from saved bytes."""
+        import pickle, io
+        buf = io.BytesIO(state_bytes)
+        data = pickle.loads(buf.read())
+
+        if isinstance(data, dict) and 'shadow_params' in data:
+            self.decay = data.get('decay', self.decay)
+            shadow_params = data['shadow_params']
+        else:
+            shadow_params = data
+
+        for k, v in shadow_params.items():
+            self.shadow_params[k] = tlx.convert_to_tensor(v)
+
+
+
diff --git a/examples/defog/evaluator.py b/examples/defog/evaluator.py
new file mode 100644
index 000000000..df057ac1c
--- /dev/null
+++ b/examples/defog/evaluator.py
@@ -0,0 +1,354 @@
+"""Evaluation metrics orchestration for DeFoG.
+
+This module contains the evaluation pipeline extracted from defog_trainer.py,
+including molecular metrics (validity, uniqueness, novelty, FCD) and
+synthetic graph metrics (SPECTRE MMD-based statistics).
+"""
+
+import os
+import numpy as np
+import tensorlayerx as tlx
+
+from rdkit_functions import compute_molecular_metrics
+
+
+def compute_selection_score(dataset_name, metrics):
+    """Compute a single scalar selection score from evaluation metrics.
+
+    Used for best-checkpoint selection during training validation.
+    """
+    if dataset_name in ('planar', 'tree', 'sbm', 'comm20'):
+        for key in (
+            'sampling/frac_unic_non_iso_valid',
+            'sampling/frac_unique_non_iso',
+            'sampling/frac_non_iso',
+            'sampling/frac_unique',
+            'frac_unic_non_iso_valid',
+            'frac_unique_non_iso',
+            'frac_non_iso',
+            'frac_unique',
+            'valid',
+            'planar_acc',
+            'tree_acc',
+            'V.U.N.',
+            'tls_validity',
+        ):
+            if key in metrics:
+                return float(metrics[key])
+        return float('-inf')
+
+    if dataset_name in ('qm9', 'guacamol', 'zinc250k', 'moses'):
+        score_parts = []
+        for key in ('Validity', 'Relaxed Validity', 'Uniqueness', 'Novelty'):
+            value = metrics.get(key)
+            if value is not None and value >= 0:
+                score_parts.append(float(value))
+        return float(np.mean(score_parts)) if score_parts else float('-inf')
+
+    degree = metrics.get('degree')
+    return -float(degree) if degree is not None else float('-inf')
+
+
+def graphs_to_networkx(graph_list):
+    """Convert a list of GammaGL Graph objects to networkx graphs.
+
+    Parameters
+    ----------
+    graph_list : list of Graph
+        Each graph has .x (one-hot node features), .edge_index, .edge_attr.
+
+    Returns
+    -------
+    list of nx.Graph
+    """
+    import networkx as nx
+    nx_graphs = []
+    for g in graph_list:
+        x_np = g.x if isinstance(g.x, np.ndarray) else tlx.convert_to_numpy(g.x)
+        edge_np = g.edge_index if isinstance(g.edge_index, np.ndarray) else tlx.convert_to_numpy(g.edge_index)
+
+        nx_g = nx.Graph()
+        n = x_np.shape[0]
+        nx_g.add_nodes_from(range(n))
+
+        if edge_np.shape[1] > 0:
+            src = edge_np[0].astype(int)
+            dst = edge_np[1].astype(int)
+            for s, d in zip(src, dst):
+                if s < d:
+                    nx_g.add_edge(int(s), int(d))
+        nx_graphs.append(nx_g)
+    return nx_graphs
+
+
+def collect_train_smiles_molecular(train_graphs, atom_decoder):
+    """Convert training molecular graphs to canonical SMILES.
+
+    Parameters
+    ----------
+    train_graphs : list of Graph
+        Training graphs with .x (one-hot), .edge_index, .edge_attr.
+    atom_decoder : list of str
+        Atom decoder aligned with the dataset preprocessing.
+
+    Returns
+    -------
+    list of str
+        Canonical SMILES for each valid training graph.
+    """
+    from rdkit_functions import build_molecule_with_partial_charges, mol2smiles
+
+    if atom_decoder is None:
+        return None
+
+    smiles_list = []
+    for g in train_graphs:
+        x_np = g.x if isinstance(g.x, np.ndarray) else tlx.convert_to_numpy(g.x)
+        edge_index_np = g.edge_index if isinstance(g.edge_index, np.ndarray) else tlx.convert_to_numpy(g.edge_index)
+        ea_np = g.edge_attr if isinstance(g.edge_attr, np.ndarray) else tlx.convert_to_numpy(g.edge_attr)
+
+        n = x_np.shape[0]
+        # Dense adjacency with edge types
+        adj = np.zeros((n, n), dtype=int)
+        if edge_index_np.shape[1] > 0:
+            src = edge_index_np[0].astype(int)
+            dst = edge_index_np[1].astype(int)
+            for idx in range(len(src)):
+                s, d = int(src[idx]), int(dst[idx])
+                if ea_np is not None and ea_np.ndim == 2 and ea_np.shape[0] == len(src):
+                    bond_type = int(np.argmax(ea_np[idx]))
+                else:
+                    bond_type = 1
+                adj[s, d] = bond_type
+
+        atom_types = np.argmax(x_np, axis=-1)
+        mol = build_molecule_with_partial_charges(atom_types, adj, atom_decoder)
+        smi = mol2smiles(mol)
+        if smi is not None:
+            smiles_list.append(smi)
+
+    print(f"  Collected {len(smiles_list)} valid SMILES from {len(train_graphs)} training graphs")
+    return smiles_list
+
+
+def evaluate_generated_graphs(generated, dataset_name, graphs, test_ds,
+                              dataset_infos, reference_graphs=None,
+                              train_graphs=None, cache_dir=None, cond_labels=None):
+    """Full evaluation pipeline for generated graphs.
+
+    For molecular datasets: computes validity, uniqueness, novelty, FCD, etc.
+    For synthetic datasets: computes SPECTRE MMD-based graph statistics.
+
+    Parameters
+    ----------
+    generated : list of (atom_types, edge_types)
+        Generated graphs as integer arrays.
+    dataset_name : str
+        Name of the dataset.
+    graphs : list
+        Training graphs.
+    test_ds : dataset
+        Test dataset.
+    dataset_infos : dict
+        Dataset information dictionary.
+    reference_graphs : list, optional
+        Reference graphs for FCD computation.
+    train_graphs : list, optional
+        Training graphs (overrides `graphs` for SMILES collection).
+    cache_dir : str, optional
+        Directory for caching SMILES.
+
+    Returns
+    -------
+    dict
+        Evaluation metrics.
+    """
+    fold_metrics = {}
+    is_molecular = dataset_name in ('qm9', 'guacamol', 'zinc250k', 'moses')
+
+    if is_molecular:
+        atom_decoder = dataset_infos.get('atom_decoder')
+        remove_h = dataset_infos.get('remove_h', True)
+        train_graph_source = train_graphs if train_graphs is not None else graphs
+
+        # 1. Check in-memory cache
+        train_smiles = dataset_infos.get('train_smiles')
+        reference_smiles = dataset_infos.get('reference_smiles')
+
+        # 2. Check disk cache if not in memory
+        cache_file = None
+        ref_cache_file = None
+        if cache_dir is not None:
+            os.makedirs(cache_dir, exist_ok=True)
+            cache_file = os.path.join(cache_dir, f"train_smiles_cache_{dataset_name}.pkl")
+            ref_cache_file = os.path.join(cache_dir, f"ref_smiles_cache_{dataset_name}.pkl")
+
+        if train_smiles is None and cache_file is not None and os.path.exists(cache_file):
+            import pickle
+            with open(cache_file, 'rb') as f:
+                train_smiles = pickle.load(f)
+            dataset_infos['train_smiles'] = train_smiles
+            print(f"Loaded {len(train_smiles)} training SMILES from disk cache")
+
+        if reference_smiles is None and ref_cache_file is not None and os.path.exists(ref_cache_file):
+            import pickle
+            with open(ref_cache_file, 'rb') as f:
+                reference_smiles = pickle.load(f)
+            dataset_infos['reference_smiles'] = reference_smiles
+            print(f"Loaded {len(reference_smiles)} reference SMILES from disk cache")
+
+        if atom_decoder is not None and train_smiles is None:
+            print("Collecting training SMILES...")
+            train_smiles = collect_train_smiles_molecular(train_graph_source, atom_decoder)
+            dataset_infos['train_smiles'] = train_smiles
+            if cache_file is not None:
+                import pickle
+                with open(cache_file, 'wb') as f:
+                    pickle.dump(train_smiles, f)
+                print(f"Cached training SMILES to disk: {cache_file}")
+
+        if atom_decoder is not None and reference_smiles is None:
+            if reference_graphs is not None:
+                print("Collecting reference SMILES for FCD...")
+                reference_smiles = collect_train_smiles_molecular(reference_graphs, atom_decoder)
+            elif test_ds is not None:
+                print("Collecting reference SMILES from test_ds...")
+                ref_graphs = [test_ds[i] for i in range(len(test_ds))]
+                reference_smiles = collect_train_smiles_molecular(ref_graphs, atom_decoder)
+            else:
+                reference_smiles = train_smiles
+            dataset_infos['reference_smiles'] = reference_smiles
+            if ref_cache_file is not None:
+                import pickle
+                with open(ref_cache_file, 'wb') as f:
+                    pickle.dump(reference_smiles, f)
+                print(f"Cached reference SMILES to disk: {ref_cache_file}")
+
+        if atom_decoder is not None:
+            atom_counts = np.zeros(len(atom_decoder), dtype=np.int64)
+            max_edge_type = 0
+            for atom_types, edge_types in generated:
+                atom_types_np = np.asarray(atom_types, dtype=np.int64)
+                edge_types_np = np.asarray(edge_types, dtype=np.int64)
+                valid_atom_mask = (atom_types_np >= 0) & (atom_types_np < len(atom_decoder))
+                if valid_atom_mask.any():
+                    atom_counts += np.bincount(atom_types_np[valid_atom_mask], minlength=len(atom_decoder))
+                if edge_types_np.size > 0:
+                    max_edge_type = max(max_edge_type, int(edge_types_np.max()))
+
+            bond_counts = np.zeros(max_edge_type + 1, dtype=np.int64)
+            for _, edge_types in generated:
+                edge_types_np = np.asarray(edge_types, dtype=np.int64)
+                if edge_types_np.size == 0:
+                    continue
+                upper = np.triu(edge_types_np, k=1).reshape(-1)
+                valid_bonds = upper[upper >= 0]
+                if valid_bonds.size > 0:
+                    bond_counts += np.bincount(valid_bonds, minlength=len(bond_counts))
+
+            atom_summary = {atom_decoder[i]: int(atom_counts[i]) for i in range(len(atom_decoder))}
+            bond_summary = {int(i): int(bond_counts[i]) for i in range(len(bond_counts))}
+            print(f"  Generated atom type counts: {atom_summary}")
+            print(f"  Generated bond type counts: {bond_summary}")
+
+            print(f"\nEvaluating molecular metrics on {len(generated)} generated graphs...")
+            stability_dict, rdkit_metrics, all_smiles, summary = compute_molecular_metrics(
+                generated, train_smiles, atom_decoder, remove_h)
+
+            print(f"\n  Stability: {stability_dict}")
+            print(f"  Summary: {summary}")
+            fold_metrics.update(summary)
+            fold_metrics.update(stability_dict)
+
+            try:
+                from rdkit_functions import compute_distribution_metrics
+                dist_mae = compute_distribution_metrics(
+                    generated, dataset_infos, dataset_name)
+                fold_metrics.update(dist_mae)
+                print(f"  Distribution MAE: {dist_mae}")
+            except Exception as e:
+                print(f"  Distribution MAE skipped: {e}")
+
+            try:
+                from rdkit_functions import compute_fcd
+                fcd_score = compute_fcd(all_smiles, reference_smiles)
+                fold_metrics['fcd'] = fcd_score
+                print(f"  FCD: {fcd_score:.4f}")
+            except Exception as e:
+                print(f"  FCD skipped: {e}")
+
+            fold_metrics['selection_score'] = compute_selection_score(dataset_name, fold_metrics)
+
+    elif dataset_name == 'tls':
+        from tls_metrics import compute_tls_metrics
+        print('\nEvaluating TLS conditional generation metrics...')
+        metrics = compute_tls_metrics(generated, cond_labels, train_graphs)
+        metrics['selection_score'] = compute_selection_score(dataset_name, metrics)
+        fold_metrics.update(metrics)
+        print('\n  TLS Evaluation Results:')
+        for k, v in metrics.items():
+            if isinstance(v, (int, float)):
+                print(f'    {k}: {v:.6f}')
+            else:
+                print(f'    {k}: {v}')
+
+    else:
+        from spectre_utils import evaluate_synthetic_graphs
+
+        print("Converting reference graphs to networkx...")
+        if reference_graphs is not None:
+            reference_nx = graphs_to_networkx(reference_graphs)
+        elif test_ds is not None:
+            reference_nx = graphs_to_networkx(
+                [test_ds[i] for i in range(min(len(test_ds), 200))])
+        else:
+            reference_nx = graphs_to_networkx(graphs[:min(len(graphs), 200)])
+
+        if train_graphs is not None:
+            train_nx = graphs_to_networkx(train_graphs)
+        else:
+            train_nx = graphs_to_networkx(graphs[:min(len(graphs), 200)])
+
+        metrics = evaluate_synthetic_graphs(
+            generated_graphs=generated,
+            reference_graphs=reference_nx,
+            train_graphs=train_nx,
+            dataset_name=dataset_name,
+            compute_emd=(dataset_name == 'comm20'),
+        )
+
+        alias_pairs = [
+            ('sampling/frac_unique', 'frac_unique'),
+            ('sampling/frac_non_iso', 'frac_non_iso'),
+            ('sampling/frac_unique_non_iso', 'frac_unique_non_iso'),
+            ('sampling/frac_unic_non_iso_valid', 'frac_unic_non_iso_valid'),
+        ]
+        for new_key, old_key in alias_pairs:
+            value = metrics.get(old_key)
+            if value is not None:
+                metrics[new_key] = value
+
+        if dataset_name == 'tree' and 'tree_acc' in metrics:
+            metrics.setdefault('valid', metrics['tree_acc'])
+        if dataset_name == 'planar' and 'planar_acc' in metrics:
+            metrics.setdefault('valid', metrics['planar_acc'])
+
+        if 'valid' in metrics:
+            metrics.setdefault('sampling/frac_unic_non_iso_valid', metrics['valid'])
+            metrics.setdefault('frac_unic_non_iso_valid', metrics['valid'])
+
+        metrics['selection_score'] = compute_selection_score(dataset_name, metrics)
+
+        fold_metrics.update(metrics)
+
+        print("\n  Evaluation Results:")
+        for k, v in metrics.items():
+            if isinstance(v, (int, float, np.floating)):
+                print(f"    {k}: {float(v):.6f}")
+            else:
+                print(f"    {k}: {v}")
+
+    if 'selection_score' not in fold_metrics:
+        fold_metrics['selection_score'] = compute_selection_score(dataset_name, fold_metrics)
+
+    return fold_metrics
diff --git a/examples/defog/extra_features.py b/examples/defog/extra_features.py
new file mode 100644
index 000000000..354e6265e
--- /dev/null
+++ b/examples/defog/extra_features.py
@@ -0,0 +1,855 @@
+import numpy as np
+import tensorlayerx as tlx
+class DenseFeaturePlaceHolder:
+    def __init__(self, X, E, y):
+        self.X = X
+        self.E = E
+        self.y = y
+
+
+
+class DummyExtraFeatures:
+    r"""Dummy extra features that return empty tensors."""
+    def __call__(self, noisy_data):
+        X_t = noisy_data['X_t']
+        bs = X_t.shape[0]
+        n = X_t.shape[1]
+        return DenseFeaturePlaceHolder(
+            X=tlx.zeros([bs, n, 0], dtype=tlx.float32),
+            E=tlx.zeros([bs, n, n, 0], dtype=tlx.float32),
+            y=tlx.zeros([bs, 0], dtype=tlx.float32),
+        )
+
+
+class RRWPFeatures:
+    r"""Random Walk Return Probability features.
+
+    Computes ``k`` powers of the (optionally row-normalized) adjacency matrix.
+
+    Parameters
+    ----------
+    k : int
+        Number of random walk steps.
+    normalize : bool
+        If True, row-normalize the adjacency (random walk matrix).
+    """
+    def __init__(self, k=10, normalize=True):
+        self.k = k
+        self.normalize = normalize
+
+    def __call__(self, E, k=None):
+        """
+        Parameters
+        ----------
+        E : tensor
+            Adjacency ``(bs, n, n)`` (already summed over edge types).
+            If E has 4 dims ``(bs, n, n, de)``, the non-edge channels are summed.
+
+        Returns
+        -------
+        tensor
+            RRWP features ``(bs, n, n, k)``.
+        """
+        if k is None:
+            k = self.k
+
+        # Handle both 3-dim (adj) and 4-dim (one-hot edges) inputs
+        if len(E.shape) == 4:
+            adj = tlx.reduce_sum(E[:, :, :, 1:], axis=-1)  # (bs, n, n)
+        else:
+            adj = E  # (bs, n, n)
+            
+        # Ensure float32 for calculations
+        adj = tlx.cast(adj, tlx.float32)
+
+        bs, n = adj.shape[0], adj.shape[1]
+
+        if self.normalize:
+            deg = tlx.reduce_sum(adj, axis=-1, keepdims=True)
+            deg = tlx.where(deg == 0.0, tlx.ones_like(deg), deg)
+            A_norm = adj / deg
+        else:
+            A_norm = adj
+
+        power = tlx.eye(n, dtype=tlx.float32)
+        power = tlx.expand_dims(power, 0)
+        power = tlx.tile(power, [bs, 1, 1])
+        
+        results = [power]
+
+        for i in range(1, k):
+            power = tlx.bmm(power, A_norm)
+            results.append(power)
+
+        return tlx.stack(results, axis=-1)
+
+
+# ============================================================
+# K-Node Cycle Counting (matches original DeFoG formulas)
+# ============================================================
+
+class KNodeCycles:
+    r"""Compute 3, 4, 5, 6-cycle counts per node and graph-level.
+
+    Formulas match the original DeFoG implementation exactly.
+    """
+
+    def k_cycles(self, adj):
+        """
+        Parameters
+        ----------
+        adj : ndarray
+            Adjacency matrix ``(n, n)``.
+
+        Returns
+        -------
+        tuple
+            ``(x_cycles, y_cycles)`` where ``x_cycles`` has shape ``(n, 3)``
+            (per-node k3, k4, k5) and ``y_cycles`` has shape ``(4,)``
+            (graph-level k3, k4, k5, k6).
+        """
+        A = adj.astype(np.float64)
+        n = A.shape[0]
+        d = A.sum(axis=1)  # degree vector
+
+        A2 = A @ A
+        A3 = A2 @ A
+        A4 = A3 @ A
+        A5 = A4 @ A
+        A6 = A5 @ A
+
+        # ---- k3 ----
+        k3_node = np.diag(A3) / 2.0
+        k3_graph = np.trace(A3) / 6.0
+
+        # ---- k4 (matches original: d*(d-1) - (A @ d.unsqueeze(-1)).sum(-1)) ----
+        k4_node = (
+            np.diag(A4)
+            - d * (d - 1)
+            - (A @ d[:, np.newaxis]).sum(axis=-1)
+        ) / 2.0
+        k4_graph = (
+            np.trace(A4)
+            - np.sum(d * (d - 1))
+            - np.sum(A @ d[:, np.newaxis])
+        ) / 8.0
+
+        # ---- k5 (matches original: 3 terms) ----
+        triangles = np.diag(A3)
+        k5_node = (
+            np.diag(A5)
+            - 2.0 * triangles * d
+            - (A @ triangles[:, np.newaxis]).sum(axis=-1)
+            + triangles
+        ) / 2.0
+        k5_graph = (
+            np.trace(A5)
+            - 2.0 * np.sum(triangles * d)
+            - np.sum(A @ triangles[:, np.newaxis])
+            + np.sum(triangles)
+        ) / 10.0
+
+        # ---- k6 graph only (matches original 10-term formula) ----
+        d2 = np.diag(A2)
+        a4_diag = np.diag(A4)
+        k6_graph = (
+            np.trace(A6)
+            - 3.0 * np.trace(A3 @ A3)
+            + 9.0 * np.sum(A * (A2 ** 2))
+            - 6.0 * np.sum(d2 * a4_diag)
+            + 6.0 * np.trace(A4)
+            - 4.0 * np.trace(A3)
+            + 4.0 * np.sum(d2 ** 3 / (d2 + 1e-12) * d2)  # d2^3 summed
+            + 3.0 * np.sum(A3)
+            - 12.0 * np.sum(d2 ** 2)
+            + 4.0 * np.trace(A2)
+        ) / 12.0
+
+        # Fix k6 formula to exactly match original batch_trace operations
+        term_1 = np.trace(A6)
+        term_2 = np.trace(A3 @ A3)
+        term_3 = np.sum(A * (A2 ** 2))
+        term_4 = np.sum(d2 * a4_diag)
+        term_5 = np.trace(A4)
+        term_6 = np.trace(A3)
+        term_7 = np.sum(d2 ** 3)
+        term_8 = np.sum(A3)
+        term_9 = np.sum(d2 ** 2)
+        term_10 = np.trace(A2)
+
+        k6_graph = (
+            term_1
+            - 3.0 * term_2
+            + 9.0 * term_3
+            - 6.0 * term_4
+            + 6.0 * term_5
+            - 4.0 * term_6
+            + 4.0 * term_7
+            + 3.0 * term_8
+            - 12.0 * term_9
+            + 4.0 * term_10
+        ) / 12.0
+
+        x_cycles = np.stack([k3_node, k4_node, k5_node], axis=-1).astype(np.float32)
+        x_cycles = np.clip(x_cycles, 0, None)
+
+        y_cycles = np.array([k3_graph, k4_graph, k5_graph, k6_graph],
+                            dtype=np.float32)
+        y_cycles = np.clip(y_cycles, 0, None)
+
+        return x_cycles, y_cycles
+
+
+class NodeCycleFeatures:
+    r"""Compute k-cycle counts per node."""
+    def __init__(self):
+        self.cycle_counter = KNodeCycles()
+
+    def __call__(self, noisy_data):
+        """
+        Parameters
+        ----------
+        noisy_data : dict
+            Must contain ``'E_t'`` and ``'node_mask'``.
+
+        Returns
+        -------
+        tuple
+            ``(x_cycles, y_cycles)`` as tensors.
+        """
+        E = noisy_data['E_t']
+        node_mask = noisy_data['node_mask']
+        E_np = tlx.convert_to_numpy(E)
+        mask_np = tlx.convert_to_numpy(node_mask)
+        bs, n, _, de = E_np.shape
+
+        adj = np.sum(E_np[:, :, :, 1:], axis=-1)  # (bs, n, n)
+
+        all_x_cycles = []
+        all_y_cycles = []
+        for b in range(bs):
+            x_cyc, y_cyc = self.cycle_counter.k_cycles(adj[b])
+            all_x_cycles.append(x_cyc)
+            all_y_cycles.append(y_cyc)
+
+        x_cycles = np.stack(all_x_cycles, axis=0)  # (bs, n, 3) for k3,k4,k5
+        y_cycles = np.stack(all_y_cycles, axis=0)  # (bs, 4) for k3,k4,k5,k6
+
+        # Normalize and clamp (matching original)
+        x_cycles = x_cycles / 10.0
+        y_cycles = y_cycles / 10.0
+        x_cycles = np.clip(x_cycles, 0, 1)
+        y_cycles = np.clip(y_cycles, 0, 1)
+
+        # Apply node mask to zero out padded nodes (Bug 12 fix)
+        x_cycles = x_cycles * mask_np[:, :, np.newaxis]
+
+        return (tlx.convert_to_tensor(x_cycles.astype(np.float32)),
+                tlx.convert_to_tensor(y_cycles.astype(np.float32)))
+
+
+# ============================================================
+# EigenFeatures (Laplacian eigenvalue / eigenvector features)
+# ============================================================
+
+class EigenFeatures:
+    r"""Compute Laplacian eigenvalue and eigenvector features.
+
+    Parameters
+    ----------
+    mode : str
+        ``'eigenvalues'`` for eigenvalue features only.
+        ``'all'`` for eigenvalues + eigenvectors + connected component features.
+    """
+    def __init__(self, mode='eigenvalues'):
+        self.mode = mode
+
+    def __call__(self, noisy_data):
+        E_t = noisy_data['E_t']
+        mask = noisy_data['node_mask']
+
+        E_np = tlx.convert_to_numpy(E_t)
+        mask_np = tlx.convert_to_numpy(mask)
+
+        A = np.sum(E_np[:, :, :, 1:], axis=-1).astype(np.float64)  # (bs, n, n)
+        bs, n = A.shape[0], A.shape[1]
+
+        # Apply mask to adjacency
+        A = A * mask_np[:, np.newaxis, :] * mask_np[:, :, np.newaxis]
+
+        # Compute Laplacian
+        L = _compute_laplacian(A, normalize=False)
+
+        # Mask out padding nodes: add large diagonal for non-existent nodes
+        eye_n = np.eye(n)[np.newaxis, :, :]  # (1, n, n)
+        mask_diag = 2.0 * n * eye_n * (1.0 - mask_np[:, np.newaxis, :]) * (1.0 - mask_np[:, :, np.newaxis])
+        L = L * mask_np[:, np.newaxis, :] * mask_np[:, :, np.newaxis] + mask_diag
+
+        if self.mode == 'eigenvalues':
+            eigvals = np.linalg.eigvalsh(L)  # (bs, n)
+            eigvals = eigvals / (mask_np.sum(axis=1, keepdims=True) + 1e-8)
+
+            n_connected, batch_eigenvalues = _get_eigenvalues_features(eigvals)
+            return n_connected, batch_eigenvalues
+
+        elif self.mode == 'all':
+            eigvals, eigvectors = np.linalg.eigh(L)
+            eigvals = eigvals / (mask_np.sum(axis=1, keepdims=True) + 1e-8)
+            eigvectors = eigvectors * mask_np[:, :, np.newaxis] * mask_np[:, np.newaxis, :]
+
+            n_connected, batch_eigenvalues = _get_eigenvalues_features(eigvals)
+            nonlcc_indicator, k_lowest_eigvec = _get_eigenvectors_features(
+                eigvectors, mask_np, n_connected
+            )
+            return n_connected, batch_eigenvalues, nonlcc_indicator, k_lowest_eigvec
+        else:
+            raise ValueError(f"Mode {self.mode} not implemented")
+
+
+def _compute_laplacian(adjacency, normalize=False):
+    """Compute Laplacian from adjacency matrix.
+
+    Parameters
+    ----------
+    adjacency : ndarray
+        ``(bs, n, n)``
+    normalize : bool
+        If True, compute symmetric normalized Laplacian.
+
+    Returns
+    -------
+    ndarray
+        Laplacian ``(bs, n, n)``.
+    """
+    diag = adjacency.sum(axis=-1)  # (bs, n)
+    n = diag.shape[-1]
+    D = np.zeros_like(adjacency)
+    for b in range(diag.shape[0]):
+        np.fill_diagonal(D[b], diag[b])
+    combinatorial = D - adjacency
+
+    if not normalize:
+        return (combinatorial + np.transpose(combinatorial, (0, 2, 1))) / 2.0
+
+    diag0 = diag.copy()
+    diag[diag == 0] = 1e-12
+    diag_norm = 1.0 / np.sqrt(diag)
+    D_norm = np.zeros_like(adjacency)
+    for b in range(diag.shape[0]):
+        np.fill_diagonal(D_norm[b], diag_norm[b])
+
+    eye = np.eye(n)[np.newaxis, :, :]
+    L = eye - D_norm @ adjacency @ D_norm
+    L[diag0 == 0] = 0
+    return (L + np.transpose(L, (0, 2, 1))) / 2.0
+
+
+def _get_eigenvalues_features(eigenvalues, k=5):
+    """Extract eigenvalue features.
+
+    Parameters
+    ----------
+    eigenvalues : ndarray
+        ``(bs, n)`` eigenvalues.
+    k : int
+        Number of non-zero eigenvalues to keep.
+
+    Returns
+    -------
+    tuple
+        ``(n_connected_components, first_k_eigenvalues)`` with shapes
+        ``(bs, 1)`` and ``(bs, k)``.
+    """
+    bs, n = eigenvalues.shape
+    n_connected_components = (eigenvalues < 1e-5).sum(axis=-1)  # (bs,)
+    n_connected_components = np.maximum(n_connected_components, 1)
+
+    to_extend = max(n_connected_components) + k - n
+    if to_extend > 0:
+        eigenvalues = np.hstack(
+            [eigenvalues, 2.0 * np.ones((bs, to_extend), dtype=eigenvalues.dtype)]
+        )
+
+    indices = np.arange(k)[np.newaxis, :] + n_connected_components[:, np.newaxis]
+    first_k_ev = np.take_along_axis(eigenvalues, indices, axis=1)  # (bs, k)
+
+    return (n_connected_components[:, np.newaxis].astype(np.float32),
+            first_k_ev.astype(np.float32))
+
+
+def _get_eigenvectors_features(vectors, node_mask, n_connected, k=2):
+    """Extract eigenvector features.
+
+    Parameters
+    ----------
+    vectors : ndarray
+        Eigenvectors ``(bs, n, n)`` in columns.
+    node_mask : ndarray
+        ``(bs, n)`` bool.
+    n_connected : ndarray
+        ``(bs, 1)`` number of connected components.
+    k : int
+        Number of eigenvectors to keep.
+
+    Returns
+    -------
+    tuple
+        ``(not_lcc_indicator, k_lowest_eigvec)`` with shapes ``(bs, n, 1)``
+        and ``(bs, n, k)``.
+    """
+    bs, n = vectors.shape[0], vectors.shape[1]
+
+    # Create indicator for nodes outside the largest connected component
+    first_ev = np.round(vectors[:, :, 0], decimals=3) * node_mask
+    random_noise = np.random.randn(bs, n) * (1.0 - node_mask)
+    first_ev = first_ev + random_noise
+
+    most_common = np.array([np.median(first_ev[b][node_mask[b].astype(bool)])
+                            for b in range(bs)])
+    # Use a simpler mode computation
+    most_common_list = []
+    for b in range(bs):
+        vals = first_ev[b][node_mask[b].astype(bool)]
+        if len(vals) > 0:
+            # Find most common value
+            unique_vals, counts = np.unique(np.round(vals, decimals=2), return_counts=True)
+            most_common_list.append(unique_vals[counts.argmax()])
+        else:
+            most_common_list.append(0.0)
+    most_common = np.array(most_common_list)
+
+    mask_lcc = ~(np.round(first_ev, decimals=2) == most_common[:, np.newaxis])
+    not_lcc_indicator = (mask_lcc * node_mask).astype(np.float32)[:, :, np.newaxis]
+
+    # Get k lowest eigenvectors after connected component eigenvectors
+    to_extend = max(int(n_connected.max())) + k - n
+    if to_extend > 0:
+        vectors = np.concatenate(
+            [vectors, np.zeros((bs, n, to_extend), dtype=vectors.dtype)],
+            axis=2
+        )
+
+    indices = np.arange(k)[np.newaxis, np.newaxis, :] + n_connected[:, :, np.newaxis]
+    indices = np.broadcast_to(indices, (bs, n, k)).astype(np.int64)
+    first_k_ev = np.take_along_axis(vectors, indices, axis=2)  # (bs, n, k)
+    first_k_ev = first_k_ev * node_mask[:, :, np.newaxis]
+
+    return not_lcc_indicator, first_k_ev.astype(np.float32)
+
+
+# ============================================================
+# ExtraFeatures (main dispatcher)
+# ============================================================
+
+class ExtraFeatures:
+    r"""Compute extra structural features for the DeFoG model.
+
+    Parameters
+    ----------
+    extra_features_type : str
+        Feature type: ``'rrwp'``, ``'rrwp_double'``, ``'rrwp_only'``,
+        ``'rrwp_comp'``, ``'cycles'``, ``'eigenvalues'``, ``'all'``,
+        or ``None``.
+    rrwp_steps : int
+        Number of RRWP steps.
+    dataset_info : dict
+        Dataset information with ``'max_n_nodes'`` key.
+    """
+    def __init__(self, extra_features_type='rrwp', rrwp_steps=12, dataset_info=None):
+        self.features_type = extra_features_type
+        self.max_n_nodes = dataset_info.get('max_n_nodes', 100) if dataset_info else 100
+        self.rrwp_steps = rrwp_steps
+
+        self.rrwp = RRWPFeatures(k=rrwp_steps, normalize=True)
+        self.rwp = RRWPFeatures(k=rrwp_steps, normalize=False)
+        self.cycle_features = NodeCycleFeatures()
+
+        if extra_features_type in ('eigenvalues', 'all'):
+            self.eigenfeatures = EigenFeatures(mode=extra_features_type)
+
+    def __call__(self, noisy_data):
+        """
+        Parameters
+        ----------
+        noisy_data : dict
+            Must contain ``'X_t'``, ``'E_t'``, ``'node_mask'``.
+
+        Returns
+        -------
+        DenseFeaturePlaceHolder
+            Extra features for X, E, and y.
+        """
+        E_t = noisy_data['E_t']
+        node_mask = noisy_data['node_mask']
+        bs = E_t.shape[0]
+        n = E_t.shape[1]
+
+        # n_nodes normalized by max
+        n_nodes = tlx.reduce_sum(tlx.cast(node_mask, tlx.float32), axis=1, keepdims=True)
+        n_norm = n_nodes / self.max_n_nodes
+
+        if self.features_type is None or self.features_type == 'none':
+            return DenseFeaturePlaceHolder(
+                X=tlx.zeros([bs, n, 0], dtype=tlx.float32),
+                E=tlx.zeros([bs, n, n, 0], dtype=tlx.float32),
+                y=tlx.zeros([bs, 0], dtype=tlx.float32),
+            )
+
+        if self.features_type == 'cycles':
+            x_cycles, y_cycles = self.cycle_features(noisy_data)
+            extra_y = tlx.concat([n_norm, y_cycles], axis=-1)
+            return DenseFeaturePlaceHolder(
+                X=x_cycles,
+                E=tlx.zeros([bs, n, n, 0], dtype=tlx.float32),
+                y=extra_y,
+            )
+
+        if self.features_type == 'eigenvalues':
+            eigen_out = self.eigenfeatures(noisy_data)
+            n_components, batch_eigenvalues = eigen_out
+            x_cycles, y_cycles = self.cycle_features(noisy_data)
+            extra_y = tlx.concat([n_norm, y_cycles, n_components, batch_eigenvalues], axis=-1)
+            return DenseFeaturePlaceHolder(
+                X=x_cycles,
+                E=tlx.zeros([bs, n, n, 0], dtype=tlx.float32),
+                y=extra_y,
+            )
+
+        if self.features_type == 'rrwp':
+            # Extract adjacency from one-hot edges: sum of non-"no-edge" channels
+            E_adj = _extract_adjacency(E_t)
+            rrwp_edge = self.rrwp(E_adj, k=self.rrwp_steps)  # (bs, n, n, k)
+            rrwp_node = _extract_diagonal(rrwp_edge)  # (bs, n, k)
+
+            # Initialize eigenfeatures for potential later use
+            self.eigenfeatures = EigenFeatures(mode='all')
+
+            # Cycle features for y only (Bug 14 fix: X = rrwp_node only, no cycles)
+            x_cycles, y_cycles = self.cycle_features(noisy_data)
+            extra_y = tlx.concat([n_norm, y_cycles], axis=-1)
+
+            return DenseFeaturePlaceHolder(
+                X=rrwp_node,
+                E=rrwp_edge,
+                y=extra_y,
+            )
+
+        if self.features_type == 'rrwp_double':
+            E_adj = _extract_adjacency(E_t)
+            rrwp_edge = self.rrwp(E_adj, k=self.rrwp_steps)
+            rrwp_edge_wo_norm = self.rwp(E_adj, k=self.rrwp_steps)
+
+            # Normalize unnormalized RRWP
+            rrwp_np = tlx.convert_to_numpy(rrwp_edge_wo_norm)
+            max_val = rrwp_np.max(axis=(1, 2), keepdims=True)
+            max_val = np.where(max_val == 0, 1.0, max_val)
+            rrwp_np = rrwp_np / max_val
+            rrwp_edge_wo_norm = tlx.convert_to_tensor(rrwp_np)
+
+            rrwp_edge = tlx.concat([rrwp_edge, rrwp_edge_wo_norm], axis=-1)
+            rrwp_node = _extract_diagonal(rrwp_edge)
+
+            x_cycles, y_cycles = self.cycle_features(noisy_data)
+            extra_y = tlx.concat([n_norm, y_cycles], axis=-1)
+
+            return DenseFeaturePlaceHolder(
+                X=rrwp_node,
+                E=rrwp_edge,
+                y=extra_y,
+            )
+
+        if self.features_type == 'rrwp_only':
+            E_adj = _extract_adjacency(E_t)
+            rrwp_edge = self.rrwp(E_adj, k=self.rrwp_steps)
+            rrwp_node = _extract_diagonal(rrwp_edge)
+
+            # No cycle features in y (only n_norm)
+            return DenseFeaturePlaceHolder(
+                X=rrwp_node,
+                E=rrwp_edge,
+                y=n_norm,
+            )
+
+        if self.features_type == 'rrwp_comp':
+            E_adj = _extract_adjacency(E_t)
+            half_k = max(1, self.rrwp_steps // 2)
+            rrwp_edge = self.rrwp(E_adj, k=half_k)
+            rrwp_node = _extract_diagonal(rrwp_edge)
+
+            # Complement adjacency
+            comp_adj = 1.0 - _extract_adjacency(E_t)
+            comp_rrwp_edge = self.rrwp(comp_adj, k=half_k)
+            comp_rrwp_node = _extract_diagonal(comp_rrwp_edge)
+
+            x_cycles, y_cycles = self.cycle_features(noisy_data)
+            extra_y = tlx.concat([n_norm, y_cycles], axis=-1)
+
+            return DenseFeaturePlaceHolder(
+                X=tlx.concat([rrwp_node, comp_rrwp_node], axis=-1),
+                E=tlx.concat([rrwp_edge, comp_rrwp_edge], axis=-1),
+                y=extra_y,
+            )
+
+        if self.features_type == 'all':
+            eigen_out = self.eigenfeatures(noisy_data)
+            n_components, batch_eigenvalues, nonlcc_indicator, k_lowest_eigvec = eigen_out
+
+            x_cycles, y_cycles = self.cycle_features(noisy_data)
+
+            # X = cycles + nonlcc_indicator + eigenvectors
+            X_feat = tlx.concat([x_cycles,
+                                 tlx.convert_to_tensor(nonlcc_indicator),
+                                 tlx.convert_to_tensor(k_lowest_eigvec)], axis=-1)
+
+            extra_y = tlx.concat([n_norm, y_cycles,
+                                  tlx.convert_to_tensor(n_components),
+                                  tlx.convert_to_tensor(batch_eigenvalues)], axis=-1)
+
+            return DenseFeaturePlaceHolder(
+                X=X_feat,
+                E=tlx.zeros([bs, n, n, 0], dtype=tlx.float32),
+                y=extra_y,
+            )
+
+        raise ValueError(f"Unknown extra feature type: {self.features_type}")
+
+
+def _extract_adjacency(E_t):
+    """Extract adjacency matrix from one-hot edge features.
+
+    Parameters
+    ----------
+    E_t : tensor
+        Edge features ``(bs, n, n, de)``.
+
+    Returns
+    -------
+    tensor
+        Adjacency ``(bs, n, n)``.
+    """
+    E_np = tlx.convert_to_numpy(E_t)
+    adj = np.sum(E_np[:, :, :, 1:], axis=-1).astype(np.float32)  # (bs, n, n)
+    return tlx.convert_to_tensor(adj)
+
+
+def _extract_diagonal(rrwp_edge):
+    """Extract diagonal (node-level) features from RRWP edge features.
+
+    Parameters
+    ----------
+    rrwp_edge : tensor
+        RRWP features ``(bs, n, n, k)``.
+
+    Returns
+    -------
+    tensor
+        Node features ``(bs, n, k)``.
+    """
+    # TODO: Performance Optimization
+    # Converting to numpy and back is inefficient. 
+    # Can be replaced with advanced tensor indexing or torch.diagonal equivalent.
+    rrwp_np = tlx.convert_to_numpy(rrwp_edge)
+    bs, n, _, k = rrwp_np.shape
+    diag_idx = np.arange(n)
+    result = rrwp_np[:, diag_idx, diag_idx, :]  # (bs, n, k)
+    return tlx.convert_to_tensor(result)
+
+
+def compute_extra_data(noisy_data, extra_features, domain_features, noise_dist):
+    r"""Compute extra features for the model input.
+
+    Parameters
+    ----------
+    noisy_data : dict
+        Noisy data from ``apply_noise()``.
+    extra_features : callable
+        Structural feature computer.
+    domain_features : callable
+        Domain-specific feature computer.
+    noise_dist : NoiseDistribution
+        Noise distribution (for removing virtual classes before feature computation).
+
+    Returns
+    -------
+    DenseFeaturePlaceHolder
+        Extra features for X, E, y.
+    """
+    # Strip virtual classes before computing features
+    X_t = noisy_data['X_t']
+    E_t = noisy_data['E_t']
+
+    noisy_for_features = dict(noisy_data)
+    result = noise_dist.ignore_virtual_classes(X_t, E_t)
+    noisy_for_features['X_t'] = result[0]
+    noisy_for_features['E_t'] = result[1]
+
+    # Compute structural features
+    extra = extra_features(noisy_for_features)
+
+    # Compute domain features
+    domain = domain_features(noisy_for_features)
+
+    # Concatenate
+    extra_X = tlx.concat([extra.X, domain.X], axis=-1) if domain.X.shape[-1] > 0 else extra.X
+    extra_E = tlx.concat([extra.E, domain.E], axis=-1) if domain.E.shape[-1] > 0 else extra.E
+
+    # Append timestep to y
+    t = noisy_data['t']  # (bs, 1)
+    extra_y_parts = []
+    if extra.y.shape[-1] > 0:
+        extra_y_parts.append(extra.y)
+    if domain.y.shape[-1] > 0:
+        extra_y_parts.append(domain.y)
+    extra_y_parts.append(t)  # timestep always last
+
+    extra_y = tlx.concat(extra_y_parts, axis=-1) if len(extra_y_parts) > 1 else t
+
+    return DenseFeaturePlaceHolder(X=extra_X, E=extra_E, y=extra_y)
+
+import numpy as np
+import tensorlayerx as tlx
+class DenseFeaturePlaceHolder:
+    def __init__(self, X, E, y):
+        self.X = X
+        self.E = E
+        self.y = y
+
+
+
+class ChargeFeature:
+    r"""Compute per-node charge using the original DeFoG molecular feature logic.
+
+    Parameters
+    ----------
+    remove_h : bool
+        Whether hydrogens are removed.
+    valencies : list
+        Expected valency for each atom type.
+    """
+    def __init__(self, remove_h=True, valencies=None):
+        self.remove_h = remove_h
+        if valencies is None:
+            valencies = [4, 3, 2, 1]
+        self.valencies = np.array(valencies, dtype=np.float32)
+
+    def __call__(self, noisy_data):
+        X = tlx.convert_to_numpy(noisy_data['X_t'])
+        E = tlx.convert_to_numpy(noisy_data['E_t'])
+        dx = X.shape[-1]
+        de = E.shape[-1]
+
+        if de == 5:
+            bond_orders = np.array([0, 1, 2, 3, 1.5], dtype=np.float32).reshape(1, 1, 1, -1)
+        else:
+            bond_orders = np.array([0, 1, 2, 3], dtype=np.float32).reshape(1, 1, 1, -1)
+
+        weighted_E = E * bond_orders
+        current_valencies = np.argmax(weighted_E, axis=-1).sum(axis=-1).astype(np.float32)
+
+        valencies = self.valencies
+        if len(valencies) < dx:
+            valencies = np.pad(valencies, (0, dx - len(valencies)))
+        valencies = valencies.reshape(1, 1, -1)
+        weighted_X = X * valencies
+        normal_valencies = np.argmax(weighted_X, axis=-1).astype(np.float32)
+
+        charge = (normal_valencies - current_valencies).astype(np.float32)
+        return tlx.convert_to_tensor(charge)
+
+
+class ValencyFeature:
+    r"""Compute per-node valency using the original DeFoG molecular feature logic."""
+    def __call__(self, noisy_data):
+        E = tlx.convert_to_numpy(noisy_data['E_t'])
+        de = E.shape[-1]
+
+        if de == 5:
+            bond_orders = np.array([0, 1, 2, 3, 1.5], dtype=np.float32).reshape(1, 1, 1, -1)
+        else:
+            bond_orders = np.array([0, 1, 2, 3], dtype=np.float32).reshape(1, 1, 1, -1)
+
+        weighted_E = E * bond_orders
+        valencies = np.argmax(weighted_E, axis=-1).sum(axis=-1).astype(np.float32)
+        return tlx.convert_to_tensor(valencies)
+
+
+class WeightFeature:
+    r"""Compute normalized molecular weight.
+
+    Parameters
+    ----------
+    max_weight : float
+        Maximum molecular weight for normalization.
+    atom_weights : list or dict
+        Atomic weight for each atom type.
+    """
+    def __init__(self, max_weight=500.0, atom_weights=None):
+        self.max_weight = max_weight
+        if atom_weights is None:
+            atom_weights = [12.0, 14.0, 16.0, 19.0]
+        if isinstance(atom_weights, dict):
+            atom_weights = [atom_weights[k] for k in sorted(atom_weights.keys())]
+        self.atom_weights = np.array(atom_weights, dtype=np.float32)
+
+    def __call__(self, noisy_data):
+        X = tlx.convert_to_numpy(noisy_data['X_t'])
+        bs, n, dx = X.shape
+
+        atom_types = np.argmax(X, axis=-1)
+        aw = self.atom_weights
+        if len(aw) < dx:
+            aw = np.pad(aw, (0, dx - len(aw)))
+
+        weights = aw[atom_types]
+        mol_weight = np.sum(weights, axis=1, keepdims=True).astype(np.float32)
+        mol_weight = mol_weight / self.max_weight
+        return tlx.convert_to_tensor(mol_weight)
+
+
+class ExtraMolecularFeatures:
+    r"""Molecular-specific extra features: charge, valency, and weight.
+
+    Parameters
+    ----------
+    dataset_infos : dict
+        Dataset information with keys ``'remove_h'``, ``'valencies'``,
+        ``'max_weight'``, ``'atom_weights'``.
+    """
+    def __init__(self, dataset_infos=None):
+        if dataset_infos is None:
+            dataset_infos = {}
+        self.charge = ChargeFeature(
+            remove_h=dataset_infos.get('remove_h', True),
+            valencies=dataset_infos.get('valencies', None),
+        )
+        self.valency = ValencyFeature()
+        self.weight = WeightFeature(
+            max_weight=dataset_infos.get('max_weight', 500.0),
+            atom_weights=dataset_infos.get('atom_weights', None),
+        )
+
+    def __call__(self, noisy_data):
+        """
+        Returns
+        -------
+        DenseFeaturePlaceHolder
+            X: ``(bs, n, 2)`` (charge + valency), E: empty, y: ``(bs, 1)`` (weight).
+        """
+        charge = self.charge(noisy_data)    # (bs, n, 1)
+        valency = self.valency(noisy_data)  # (bs, n, 1)
+        weight = self.weight(noisy_data)    # (bs, 1)
+
+        bs = charge.shape[0]
+        n = charge.shape[1]
+
+        return DenseFeaturePlaceHolder(
+            X=tlx.concat([
+                tlx.expand_dims(charge, axis=-1),
+                tlx.expand_dims(valency, axis=-1),
+            ], axis=-1),
+            E=tlx.zeros([bs, n, n, 0], dtype=tlx.float32),
+            y=weight,
+        )
+
+
+
diff --git a/examples/defog/flow_matching.py b/examples/defog/flow_matching.py
new file mode 100644
index 000000000..dcf8287c3
--- /dev/null
+++ b/examples/defog/flow_matching.py
@@ -0,0 +1,873 @@
+from defog_utils import PlaceHolder, apply_node_mask
+import torch
+import math
+import numpy as np
+import tensorlayerx as tlx
+
+
+
+
+def p_xt_g_x1(X1, E1, t, limit_dist):
+    r"""Compute the conditional probability ``p(x_t | x_1)`` under linear flow.
+
+    Linear interpolation: ``p(x_t | x_1) = t * onehot(x_1) + (1-t) * limit_dist``.
+
+    Parameters
+    ----------
+    X1 : tensor
+        Integer node labels, shape ``(bs, n)``.
+    E1 : tensor
+        Integer edge labels, shape ``(bs, n, n)``.
+    t : tensor
+        Time values in [0,1], shape ``(bs, 1)``.
+    limit_dist : PlaceHolder
+        Noise distribution with ``.X`` of shape ``(dx,)`` and ``.E`` of shape ``(de,)``.
+
+    Returns
+    -------
+    tuple
+        ``(prob_X, prob_E)`` with shapes ``(bs, n, dx)`` and ``(bs, n, n, de)``.
+    """
+    dx = len(limit_dist.X)
+    de = len(limit_dist.E)
+
+    # t_time shape: (bs, 1, 1) for broadcasting with X
+    t_squeezed = torch.reshape(t, [-1])  # (bs,)
+    t_x = torch.reshape(t_squeezed, [-1, 1, 1])       # (bs, 1, 1)
+    t_e = torch.reshape(t_squeezed, [-1, 1, 1, 1])    # (bs, 1, 1, 1)
+
+    X1_onehot = torch.nn.functional.one_hot(X1, num_classes=dx).float()  # (bs, n, dx)
+    E1_onehot = torch.nn.functional.one_hot(E1, num_classes=de).float()  # (bs, n, n, de)
+
+    limit_X = torch.reshape(limit_dist.X.to(torch.float32), [1, 1, dx])    # (1, 1, dx)
+    limit_E = torch.reshape(limit_dist.E.to(torch.float32), [1, 1, 1, de]) # (1, 1, 1, de)
+
+    prob_X = t_x * X1_onehot + (1.0 - t_x) * limit_X
+    prob_E = t_e * E1_onehot + (1.0 - t_e) * limit_E
+
+    prob_X = torch.clamp(prob_X, min=0.0, max=1.0)
+    prob_E = torch.clamp(prob_E, min=0.0, max=1.0)
+
+    return prob_X, prob_E
+
+
+def dt_p_xt_g_x1(X1, E1, limit_dist):
+    r"""Compute the time derivative ``d/dt p(x_t | x_1)``.
+
+    Since the interpolation is linear, the derivative is constant:
+    ``d/dt p(x_t | x_1) = onehot(x_1) - limit_dist``.
+
+    Parameters
+    ----------
+    X1 : tensor
+        Integer node labels, shape ``(bs, n)``.
+    E1 : tensor
+        Integer edge labels, shape ``(bs, n, n)``.
+    limit_dist : PlaceHolder
+        Noise distribution.
+
+    Returns
+    -------
+    tuple
+        ``(dX, dE)`` with shapes ``(bs, n, dx)`` and ``(bs, n, n, de)``.
+    """
+    dx = len(limit_dist.X)
+    de = len(limit_dist.E)
+
+    X1_onehot = torch.nn.functional.one_hot(X1, num_classes=dx).float()  # (bs, n, dx)
+    E1_onehot = torch.nn.functional.one_hot(E1, num_classes=de).float()  # (bs, n, n, de)
+
+    limit_X = torch.reshape(limit_dist.X.to(torch.float32), [1, 1, dx])
+    limit_E = torch.reshape(limit_dist.E.to(torch.float32), [1, 1, 1, de])
+
+    dX = X1_onehot - limit_X
+    dE = E1_onehot - limit_E
+
+    return dX, dE
+
+
+
+
+def assert_correctly_masked(variable, node_mask):
+    r"""Debug assertion: masked positions should be near zero.
+
+    Parameters
+    ----------
+    variable : tensor
+        Tensor to check.
+    node_mask : tensor
+        Boolean mask ``(bs, n)``.
+    """
+    x_mask = torch.unsqueeze(node_mask.to(torch.float32), dim=-1)
+    inv_mask = 1.0 - x_mask
+    masked_vals = variable * inv_mask
+    max_val = float(torch.max(torch.abs(masked_vals)))
+    assert max_val < 1e-4, f"Masked values not zero: max={max_val}"
+
+
+def sample_discrete_feature_noise(limit_dist, node_mask):
+    r"""Sample noise from the limit distribution.
+
+    Parameters
+    ----------
+    limit_dist : PlaceHolder
+        Noise distribution with ``.X`` shape ``(dx,)``, ``.E`` shape ``(de,)``.
+    node_mask : tensor
+        Boolean mask ``(bs, n)``.
+
+    Returns
+    -------
+    PlaceHolder
+        Sampled one-hot noise ``(X, E, y)`` masked.
+    """
+    nm = node_mask.to(torch.float32)
+    bs = nm.shape[0]
+    n = nm.shape[1]
+
+    dx = len(limit_dist.X)
+    de = len(limit_dist.E)
+
+    # Sample nodes
+    x_probs = limit_dist.X.to(torch.float32)
+    x_probs = torch.tile(torch.reshape(x_probs, [1, dx]), [bs * n, 1])  # (bs*n, dx)
+    U_X = torch.multinomial(x_probs, 1)  # (bs*n, 1)
+    U_X = torch.reshape(U_X, [bs, n])  # (bs, n)
+
+    # Sample edges
+    e_probs = limit_dist.E.to(torch.float32)
+    e_probs = torch.tile(torch.reshape(e_probs, [1, de]), [bs * n * n, 1])
+    U_E = torch.multinomial(e_probs, 1)
+    U_E = torch.reshape(U_E, [bs, n, n])
+
+    # Convert to one-hot
+    U_X_oh = torch.nn.functional.one_hot(U_X, num_classes=dx).float()  # (bs, n, dx)
+    U_E_oh = torch.nn.functional.one_hot(U_E, num_classes=de).float()  # (bs, n, n, de)
+
+    # Symmetrize E: keep upper triangle, add transpose
+    U_E_oh = torch.triu(U_E_oh, diagonal=1)
+    U_E_oh = U_E_oh + U_E_oh.permute(*[0, 2, 1, 3])
+
+    U_y = torch.zeros([bs, 0], dtype=torch.float32)
+
+    return PlaceHolder(X=U_X_oh, E=U_E_oh, y=U_y).mask(node_mask)
+
+
+def sample_discrete_features(probX, probE, node_mask, mask=False):
+    r"""Sample discrete features (integer class labels) from probability distributions.
+
+    Parameters
+    ----------
+    probX : tensor
+        Node class probabilities ``(bs, n, dx)``.
+    probE : tensor
+        Edge class probabilities ``(bs, n, n, de)``.
+    node_mask : tensor
+        Boolean mask ``(bs, n)``.
+    mask : bool
+        If True, zero out masked positions after sampling.
+
+    Returns
+    -------
+    PlaceHolder
+        Sampled integer indices ``X: (bs, n)``, ``E: (bs, n, n)``.
+    """
+    bs = probX.shape[0]
+    n = probX.shape[1]
+    dx = probX.shape[2]
+    de = probE.shape[3]
+
+    # Handle masked nodes: set uniform probability
+    nm = node_mask.to(torch.float32)
+    nm_exp = torch.unsqueeze(nm, dim=-1)  # (bs, n, 1)
+    uniform_x = torch.ones_like(probX) / dx
+    probX = probX * nm_exp + uniform_x * (1.0 - nm_exp)
+
+    # Sample nodes
+    probX_flat = torch.reshape(probX, [bs * n, dx])
+    X_t = torch.multinomial(probX_flat, 1)  # (bs*n, 1)
+    X_t = torch.reshape(X_t, [bs, n])
+
+    # Handle masked edges: set uniform probability for invalid pairs and diagonal
+    inv_edge_mask = 1.0 - torch.unsqueeze(nm, dim=2) * torch.unsqueeze(nm, dim=1)
+    inv_edge_mask = torch.unsqueeze(inv_edge_mask, dim=-1)  # (bs, n, n, 1)
+
+    eye_n = torch.eye(n)
+    diag_mask = torch.reshape(eye_n, [1, n, n, 1])
+    diag_mask = torch.tile(diag_mask, [bs, 1, 1, 1])
+
+    uniform_e = torch.ones_like(probE) / de
+    probE = probE * (1.0 - inv_edge_mask) + uniform_e * inv_edge_mask
+    probE = probE * (1.0 - diag_mask) + uniform_e * diag_mask
+
+    # Sample edges
+    probE_flat = torch.reshape(probE, [bs * n * n, de])
+    E_t = torch.multinomial(probE_flat, 1)
+    E_t = torch.reshape(E_t, [bs, n, n])
+
+    # Symmetrize: keep upper triangle, mirror to lower
+    E_t_int = E_t.to(torch.int64)
+    E_t_int = torch.triu(E_t_int, diagonal=1)
+
+    E_t_int = E_t_int + E_t_int.permute(*[0, 2, 1])
+
+    if mask:
+        nm_int = node_mask.to(torch.int64)
+        X_t = X_t.to(torch.int64) * nm_int
+        em = torch.unsqueeze(nm_int, dim=2) * torch.unsqueeze(nm_int, dim=1)
+        E_t_int = E_t_int * em
+
+    X_t = X_t.to(torch.int64)
+    E_t_int = E_t_int.to(torch.int64)
+    U_y = torch.zeros([bs, 0], dtype=torch.float32)
+
+    return PlaceHolder(X=X_t, E=E_t_int, y=U_y)
+
+
+
+
+class TimeDistorter:
+    r"""Time distortion for training and sampling schedules.
+
+    Transforms uniform time ``t in [0, 1]`` to reshape the
+    training/sampling schedule.
+
+    Parameters
+    ----------
+    train_distortion : str
+        Distortion type for training. One of ``'identity'``, ``'cos'``,
+        ``'revcos'``, ``'polyinc'``, ``'polydec'``.
+    sample_distortion : str
+        Distortion type for sampling.
+    """
+    def __init__(self, train_distortion='identity', sample_distortion='identity'):
+        self.train_distortion = train_distortion
+        self.sample_distortion = sample_distortion
+
+    def train_ft(self, batch_size):
+        r"""Sample distorted time values for training.
+
+        Parameters
+        ----------
+        batch_size : int
+            Number of time samples to generate.
+
+        Returns
+        -------
+        tensor
+            Distorted time values of shape ``(batch_size, 1)``.
+        """
+        t_uniform = torch.tensor(
+            np.random.uniform(0, 1, size=(batch_size, 1)).astype(np.float32)
+        )
+        return self.apply_distortion(t_uniform, self.train_distortion)
+
+    def sample_ft(self, t, sample_distortion=None):
+        r"""Apply distortion to a time value during sampling.
+
+        Parameters
+        ----------
+        t : tensor
+            Time values.
+        sample_distortion : str, optional
+            Override distortion type. Defaults to ``self.sample_distortion``.
+
+        Returns
+        -------
+        tensor
+            Distorted time values.
+        """
+        if sample_distortion is None:
+            sample_distortion = self.sample_distortion
+        return self.apply_distortion(t, sample_distortion)
+
+    def apply_distortion(self, t, distortion_type):
+        r"""Apply a time distortion function.
+
+        Parameters
+        ----------
+        t : tensor
+            Time values in [0, 1].
+        distortion_type : str
+            One of ``'identity'``, ``'cos'``, ``'revcos'``, ``'polyinc'``, ``'polydec'``.
+
+        Returns
+        -------
+        tensor
+            Distorted time values in [0, 1].
+        """
+        if distortion_type == 'identity':
+            return t
+        elif distortion_type == 'cos':
+            return (1.0 - torch.cos(t * math.pi)) / 2.0
+        elif distortion_type == 'revcos':
+            return 2.0 * t - (1.0 - torch.cos(t * math.pi)) / 2.0
+        elif distortion_type == 'polyinc':
+            return t ** 2
+        elif distortion_type == 'polydec':
+            return 2.0 * t - t ** 2
+        else:
+            raise ValueError(f"Unknown distortion type: {distortion_type}")
+
+
+
+
+class NoiseDistribution:
+    r"""Noise/limit distribution for the discrete flow matching process.
+
+    Supports multiple transition types that define the noise distribution at ``t=0``.
+
+    Parameters
+    ----------
+    model_transition : str
+        Transition type. One of ``'uniform'``, ``'marginal'``, ``'absorbing'``,
+        ``'absorbfirst'``, ``'argmax'``, ``'edge_marginal'``, ``'node_marginal'``.
+    dataset_infos : object
+        Dataset info object with attributes ``output_dims``, ``node_types``,
+        ``edge_types``.
+    """
+    def __init__(self, model_transition, dataset_infos):
+        self.transition = model_transition
+
+        output_dims = dataset_infos['output_dims']
+        x_num_classes = output_dims['X']
+        e_num_classes = output_dims['E']
+        y_num_classes = output_dims.get('y', 0)
+
+        self.x_num_classes = x_num_classes
+        self.e_num_classes = e_num_classes
+        self.y_num_classes = y_num_classes
+
+        self.x_added_classes = 0
+        self.e_added_classes = 0
+        self.y_added_classes = 0
+
+        node_types = dataset_infos.get('node_types', None)
+        edge_types = dataset_infos.get('edge_types', None)
+
+        if model_transition == 'uniform':
+            x_limit = np.ones(x_num_classes, dtype=np.float32) / x_num_classes
+            e_limit = np.ones(e_num_classes, dtype=np.float32) / e_num_classes
+
+        elif model_transition == 'absorbfirst':
+            x_limit = np.zeros(x_num_classes, dtype=np.float32)
+            x_limit[0] = 1.0
+            e_limit = np.zeros(e_num_classes, dtype=np.float32)
+            e_limit[0] = 1.0
+
+        elif model_transition == 'argmax':
+            node_marginal = node_types / node_types.sum()
+            edge_marginal = edge_types / edge_types.sum()
+            x_limit = np.zeros(x_num_classes, dtype=np.float32)
+            x_limit[np.argmax(node_marginal)] = 1.0
+            e_limit = np.zeros(e_num_classes, dtype=np.float32)
+            e_limit[np.argmax(edge_marginal)] = 1.0
+
+        elif model_transition == 'absorbing':
+            if x_num_classes > 1:
+                self.x_added_classes = 1
+                self.x_num_classes = x_num_classes + 1
+            if e_num_classes > 1:
+                self.e_added_classes = 1
+                self.e_num_classes = e_num_classes + 1
+            x_limit = np.zeros(self.x_num_classes, dtype=np.float32)
+            x_limit[-1] = 1.0
+            e_limit = np.zeros(self.e_num_classes, dtype=np.float32)
+            e_limit[-1] = 1.0
+
+        elif model_transition == 'marginal':
+            x_limit = (node_types / node_types.sum()).astype(np.float32)
+            e_limit = (edge_types / edge_types.sum()).astype(np.float32)
+
+        elif model_transition == 'edge_marginal':
+            x_limit = np.ones(x_num_classes, dtype=np.float32) / x_num_classes
+            e_limit = (edge_types / edge_types.sum()).astype(np.float32)
+
+        elif model_transition == 'node_marginal':
+            x_limit = (node_types / node_types.sum()).astype(np.float32)
+            e_limit = np.ones(e_num_classes, dtype=np.float32) / e_num_classes
+
+        else:
+            raise ValueError(f"Unknown transition type: {model_transition}")
+
+        if y_num_classes > 0:
+            y_limit = np.ones(y_num_classes, dtype=np.float32) / y_num_classes
+        else:
+            y_limit = np.zeros(0, dtype=np.float32)
+
+        self.limit_dist = PlaceHolder(
+            X=torch.tensor(x_limit),
+            E=torch.tensor(e_limit),
+            y=torch.tensor(y_limit),
+        )
+
+        print(f"[NoiseDistribution] transition={model_transition}")
+        print(f"  X limit: {x_limit}")
+        print(f"  E limit: {e_limit}")
+
+    def update_dataset_infos(self, dataset_infos):
+        r"""Update dataset_infos to account for virtual absorbing classes.
+
+        When using the absorbing transition, the atom decoder is extended with
+        a virtual token so downstream molecular tooling sees the correct node
+        vocabulary.
+
+        Parameters
+        ----------
+        dataset_infos : dict
+            Dataset info dict (modified in-place).
+        """
+        if self.transition != 'absorbing':
+            return
+
+        if dataset_infos.get('atom_decoder', None) is not None and self.x_added_classes > 0:
+            dataset_infos['atom_decoder'] = list(dataset_infos['atom_decoder']) + ['Y'] * self.x_added_classes
+
+    def update_input_output_dims(self, input_dims):
+        r"""Update input dims to account for added virtual classes."""
+        input_dims['X'] = input_dims['X'] + self.x_added_classes
+        input_dims['E'] = input_dims['E'] + self.e_added_classes
+        input_dims['y'] = input_dims['y'] + self.y_added_classes
+
+    def get_limit_dist(self):
+        r"""Return the limit distribution."""
+        return self.limit_dist
+
+    def get_noise_dims(self):
+        r"""Return the noise distribution dimensions."""
+        return {
+            'X': len(self.limit_dist.X),
+            'E': len(self.limit_dist.E),
+            'y': len(self.limit_dist.y),
+        }
+
+    def ignore_virtual_classes(self, X, E, y=None):
+        r"""Remove virtual absorbing-state classes from X and E."""
+        if self.transition != 'absorbing':
+            return (X, E, y) if y is not None else (X, E)
+
+        if self.x_added_classes > 0:
+            X = X[..., :-self.x_added_classes]
+        if self.e_added_classes > 0:
+            E = E[..., :-self.e_added_classes]
+        if y is not None:
+            if self.y_added_classes > 0:
+                y = y[..., :-self.y_added_classes]
+            return X, E, y
+        return X, E
+
+    def add_virtual_classes(self, X, E, y=None):
+        r"""Add virtual absorbing-state classes to X and E."""
+        if self.transition != 'absorbing':
+            return (X, E, y) if y is not None else (X, E)
+
+        if self.x_added_classes > 0:
+            zeros_x = torch.zeros(list(X.shape[:-1]) + [self.x_added_classes],
+                                dtype=X.dtype)
+            X = torch.cat([X, zeros_x], dim=-1)
+        if self.e_added_classes > 0:
+            zeros_e = torch.zeros(list(E.shape[:-1]) + [self.e_added_classes],
+                                dtype=E.dtype)
+            E = torch.cat([E, zeros_e], dim=-1)
+        if y is not None:
+            return X, E, y
+        return X, E
+
+
+def apply_noise(X, E, y, node_mask, limit_dist, time_distorter):
+    r"""Apply noise to clean graph data for training.
+
+    Parameters
+    ----------
+    X : tensor
+        Clean node features ``(bs, n, dx)`` (one-hot).
+    E : tensor
+        Clean edge features ``(bs, n, n, de)`` (one-hot).
+    y : tensor
+        Global features ``(bs, dy)``.
+    node_mask : tensor
+        Boolean mask ``(bs, n)``.
+    limit_dist : PlaceHolder
+        Noise limit distribution.
+    time_distorter : TimeDistorter
+        Time distortion function.
+
+    Returns
+    -------
+    dict
+        Noisy data with keys ``'t'``, ``'X_t'``, ``'E_t'``, ``'y_t'``, ``'node_mask'``.
+    """
+    bs = X.shape[0]
+
+    # Move limit_dist to the same device as input (needed for DataParallel)
+    device = X.device if hasattr(X, 'device') else None
+    if device is not None and hasattr(limit_dist, 'X') and hasattr(limit_dist.X, 'to'):
+        if limit_dist.X.device != device:
+            limit_dist = PlaceHolder(X=limit_dist.X.to(device), E=limit_dist.E.to(device), y=limit_dist.y)
+
+    # Sample time
+    t = time_distorter.train_ft(bs)  # (bs, 1)
+
+    # Get clean integer labels
+    X_1 = torch.argmax(X, dim=-1)   # (bs, n)
+    E_1 = torch.argmax(E, dim=-1)   # (bs, n, n)
+
+    # Compute transition probabilities
+    prob_X, prob_E = p_xt_g_x1(X_1, E_1, t, limit_dist)
+    # prob_X: (bs, n, dx), prob_E: (bs, n, n, de)
+
+    # Sample noisy features
+    sampled = sample_discrete_features(prob_X, prob_E, node_mask)
+    X_t_int = sampled.X  # (bs, n)
+    E_t_int = sampled.E  # (bs, n, n)
+
+    dx = len(limit_dist.X)
+    de = len(limit_dist.E)
+
+    # One-hot encode
+    X_t = torch.nn.functional.one_hot(X_t_int, num_classes=dx).float()  # (bs, n, dx)
+    E_t = torch.nn.functional.one_hot(E_t_int, num_classes=de).float()  # (bs, n, n, de)
+
+    # Mask
+    X_t, E_t = apply_node_mask(X_t, E_t, node_mask)
+
+    y_t = y if y is not None else torch.zeros([bs, 0], dtype=torch.float32)
+
+    return {
+        't': t,
+        'X_t': X_t,
+        'E_t': E_t,
+        'y_t': y_t,
+        'node_mask': node_mask,
+    }
+
+
+
+
+
+class RateMatrixDesigner:
+    r"""Designs the rate matrix for CTMC-based sampling.
+
+    Decomposes the rate matrix as ``R_t = R* + R^db + R^tg``:
+
+    - ``R*``: Deterministic optimal flow rate.
+    - ``R^db``: Detailed-balance stochastic rate (controlled by ``eta``).
+    - ``R^tg``: Target guidance rate (controlled by ``omega``).
+
+    Parameters
+    ----------
+    rdb : str
+        Detailed-balance design type: ``'general'``, ``'marginal'``,
+        ``'column'``, or ``'entry'``.
+    rdb_crit : str
+        Sub-criterion for ``rdb`` design (for ``column``: ``'max_marginal'``,
+        ``'x_t'``, ``'abs_state'``, ``'p_x1_g_xt'``, ``'x_1'``,
+        ``'p_xt_g_x1'``, ``'xhat_t'``; for ``entry``: ``'abs_state'``,
+        ``'first'``.
+    eta : float
+        Stochasticity strength for ``R^db``.
+    omega : float
+        Target guidance strength for ``R^tg``.
+    limit_dist : PlaceHolder
+        Noise limit distribution.
+    """
+    def __init__(self, rdb='general', rdb_crit='max_marginal',
+                 eta=0.0, omega=0.0, limit_dist=None):
+        self.rdb = rdb
+        self.rdb_crit = rdb_crit
+        self.eta = eta
+        self.omega = omega
+        self.limit_dist = limit_dist
+        self.num_classes_X = len(limit_dist.X) if limit_dist is not None else 0
+        self.num_classes_E = len(limit_dist.E) if limit_dist is not None else 0
+
+    def compute_graph_rate_matrix(self, t, node_mask, G_t, G_1_pred):
+        r"""Compute the full rate matrix ``R_t = R* + R^db + R^tg``."""
+        X_t, E_t = G_t
+        X_1_pred, E_1_pred = G_1_pred
+
+        # Get integer labels
+        X_t_label = torch.unsqueeze(torch.argmax(X_t, dim=-1), dim=-1)  # (bs, n, 1)
+        E_t_label = torch.unsqueeze(torch.argmax(E_t, dim=-1), dim=-1)  # (bs, n, n, 1)
+
+        # Sample x_1 from predicted distributions
+        sampled = sample_discrete_features(X_1_pred, E_1_pred, node_mask)
+        X_1_sampled = sampled.X  # (bs, n) int
+        E_1_sampled = sampled.E  # (bs, n, n) int
+
+        # Compute shared variables
+        dfm_vars = self._compute_dfm_variables(t, X_t_label, E_t_label,
+                                                X_1_sampled, E_1_sampled)
+
+        # R*
+        Rstar_X, Rstar_E = self._compute_Rstar(dfm_vars)
+
+        # R^db
+        Rdb_X, Rdb_E = self._compute_RDB(X_t_label, E_t_label,
+                                           X_1_pred, E_1_pred,
+                                           X_1_sampled, E_1_sampled,
+                                           node_mask, t, dfm_vars)
+
+        # R^tg
+        Rtg_X, Rtg_E = self._compute_R_tg(X_1_sampled, E_1_sampled,
+                                            X_t_label, E_t_label, dfm_vars)
+
+        R_t_X = Rstar_X + Rdb_X + Rtg_X
+        R_t_E = Rstar_E + Rdb_E + Rtg_E
+
+        R_t_X, R_t_E = self._stabilize(R_t_X, R_t_E, dfm_vars)
+
+        return R_t_X, R_t_E
+
+    def _compute_dfm_variables(self, t, X_t_label, E_t_label,
+                                X_1_sampled, E_1_sampled):
+        """Precompute shared quantities for rate matrix computation."""
+        dX, dE = dt_p_xt_g_x1(X_1_sampled, E_1_sampled, self.limit_dist)
+
+        dt_at_Xt = torch.gather(dX, -1, X_t_label)
+        dt_at_Et = torch.gather(dE, -1, E_t_label)
+
+        pX, pE = p_xt_g_x1(X_1_sampled, E_1_sampled, t, self.limit_dist)
+
+        pt_at_Xt = torch.gather(pX, -1, X_t_label)
+        pt_at_Et = torch.gather(pE, -1, E_t_label)
+
+        Z_X = (pX != 0).sum(dim=-1)
+        Z_E = (pE != 0).sum(dim=-1)
+
+        return {
+            'dt_p_vals_X': dX, 'dt_p_vals_E': dE,
+            'dt_at_Xt': dt_at_Xt, 'dt_at_Et': dt_at_Et,
+            'pt_vals_X': pX, 'pt_vals_E': pE,
+            'pt_at_Xt': pt_at_Xt, 'pt_at_Et': pt_at_Et,
+            'Z_X': Z_X, 'Z_E': Z_E,
+        }
+
+    def _compute_Rstar(self, dfm_vars):
+        """Compute the deterministic optimal rate R*."""
+        dX = dfm_vars['dt_p_vals_X']
+        dE = dfm_vars['dt_p_vals_E']
+        dt_at_Xt = dfm_vars['dt_at_Xt']
+        dt_at_Et = dfm_vars['dt_at_Et']
+        Z_X = dfm_vars['Z_X']
+        Z_E = dfm_vars['Z_E']
+        pt_at_Xt = dfm_vars['pt_at_Xt']
+        pt_at_Et = dfm_vars['pt_at_Et']
+
+        inner_X = dX - dt_at_Xt
+        Rstar_numer_X = torch.relu(inner_X)
+        denom_X = torch.unsqueeze(Z_X, dim=-1) * pt_at_Xt
+        denom_X = torch.where(denom_X == 0, torch.ones_like(denom_X), denom_X)
+        Rstar_X = Rstar_numer_X / denom_X
+
+        inner_E = dE - dt_at_Et
+        Rstar_numer_E = torch.relu(inner_E)
+        denom_E = torch.unsqueeze(Z_E, dim=-1) * pt_at_Et
+        denom_E = torch.where(denom_E == 0, torch.ones_like(denom_E), denom_E)
+        Rstar_E = Rstar_numer_E / denom_E
+
+        return Rstar_X, Rstar_E
+
+    def _compute_RDB(self, X_t_label, E_t_label, X_1_pred, E_1_pred,
+                     X_1_sampled, E_1_sampled, node_mask, t, dfm_vars):
+        """Compute the detailed-balance stochastic rate R^db.
+
+        Supports ``'general'``, ``'marginal'``, ``'column'`` (with 7 sub-criteria),
+        and ``'entry'`` (with ``'abs_state'`` and ``'first'``) designs.
+        """
+        if self.eta == 0:
+            zeros_X = torch.zeros_like(dfm_vars['pt_vals_X'])
+            zeros_E = torch.zeros_like(dfm_vars['pt_vals_E'])
+            return zeros_X, zeros_E
+
+        pX = dfm_vars['pt_vals_X']  # (bs, n, dx)
+        pE = dfm_vars['pt_vals_E']  # (bs, n, n, de)
+
+        dx = pX.shape[-1]
+        de = pE.shape[-1]
+
+        if self.rdb == 'general':
+            x_mask = torch.ones_like(pX)
+            e_mask = torch.ones_like(pE)
+
+        elif self.rdb == 'marginal':
+            limit_X = torch.reshape(self.limit_dist.X, [1, 1, -1])
+            limit_E = torch.reshape(self.limit_dist.E, [1, 1, 1, -1])
+            limit_at_Xt = torch.gather(
+                torch.tile(limit_X, [pX.shape[0], pX.shape[1], 1]), -1, X_t_label
+            )
+            limit_at_Et = torch.gather(
+                torch.tile(limit_E, [pE.shape[0], pE.shape[1], pE.shape[2], 1]), -1, E_t_label
+            )
+            x_mask = (limit_X > limit_at_Xt).to(torch.float32)
+            e_mask = (limit_E > limit_at_Et).to(torch.float32)
+
+        elif self.rdb == 'column':
+            # Determine column indices based on sub-criterion
+            if self.rdb_crit == 'max_marginal':
+                limit_X_np = self.limit_dist.X.detach().cpu().numpy()
+                limit_E_np = self.limit_dist.E.detach().cpu().numpy()
+                x_col = np.full_like(X_t_label.detach().cpu().numpy().squeeze(-1),
+                                     limit_X_np.argmax(), dtype=np.int64)
+                e_col = np.full_like(E_t_label.detach().cpu().numpy().squeeze(-1),
+                                     limit_E_np.argmax(), dtype=np.int64)
+                x_column_idxs = torch.tensor(x_col[..., np.newaxis])
+                e_column_idxs = torch.tensor(e_col[..., np.newaxis])
+
+            elif self.rdb_crit == 'x_t':
+                x_column_idxs = X_t_label
+                e_column_idxs = E_t_label
+
+            elif self.rdb_crit == 'abs_state':
+                x_column_idxs = torch.ones_like(X_t_label) * (dx - 1)
+                e_column_idxs = torch.ones_like(E_t_label) * (de - 1)
+
+            elif self.rdb_crit == 'p_x1_g_xt':
+                x_column_idxs = torch.unsqueeze(torch.argmax(X_1_pred, dim=-1), dim=-1)
+                e_column_idxs = torch.unsqueeze(torch.argmax(E_1_pred, dim=-1), dim=-1)
+
+            elif self.rdb_crit == 'x_1':
+                x_column_idxs = torch.unsqueeze(X_1_sampled, dim=-1)
+                e_column_idxs = torch.unsqueeze(E_1_sampled, dim=-1)
+
+            elif self.rdb_crit == 'p_xt_g_x1':
+                x_column_idxs = torch.unsqueeze(torch.argmax(pX, dim=-1), dim=-1)
+                e_column_idxs = torch.unsqueeze(torch.argmax(pE, dim=-1), dim=-1)
+
+            elif self.rdb_crit == 'xhat_t':
+                sampled_hat = sample_discrete_features(pX, pE, node_mask)
+                x_column_idxs = torch.unsqueeze(sampled_hat.X, dim=-1)
+                e_column_idxs = torch.unsqueeze(sampled_hat.E, dim=-1)
+
+            else:
+                raise NotImplementedError(f"rdb_crit '{self.rdb_crit}' not implemented for column")
+
+            # Create mask: one-hot at column_idx, plus keep current state
+            x_col_squeezed = torch.reshape(x_column_idxs, [pX.shape[0], pX.shape[1]])
+            e_col_squeezed = torch.reshape(e_column_idxs, pE.shape[:3])
+
+            x_mask = torch.nn.functional.one_hot(x_col_squeezed, num_classes=dx).float()
+            e_mask = torch.nn.functional.one_hot(e_col_squeezed, num_classes=de).float()
+
+            # Also keep current state
+            eq_x = torch.reshape(x_column_idxs == X_t_label.to(x_mask.shape[:-1] + (1,)),
+                            torch.float32)
+            eq_e = torch.reshape(e_column_idxs == E_t_label.to(e_mask.shape[:-1] + (1,)),
+                            torch.float32)
+            x_mask = torch.maximum(x_mask, eq_x)
+            e_mask = torch.maximum(e_mask, eq_e)
+
+        elif self.rdb == 'entry':
+            if self.rdb_crit == 'abs_state':
+                x_masked_idx = dx - 1
+                e_masked_idx = de - 1
+                x1_idxs = torch.unsqueeze(X_1_sampled, dim=-1)  # (bs, n, 1)
+                e1_idxs = torch.unsqueeze(E_1_sampled, dim=-1)  # (bs, n, n, 1)
+            elif self.rdb_crit == 'first':
+                x_masked_idx = 0
+                e_masked_idx = 0
+                x1_idxs = torch.unsqueeze(X_1_sampled, dim=-1)
+                e1_idxs = torch.unsqueeze(E_1_sampled, dim=-1)
+            else:
+                raise NotImplementedError(f"rdb_crit '{self.rdb_crit}' not implemented for entry")
+
+            # Build mask via numpy for advanced indexing
+            x_mask_np = np.zeros_like(pX.detach().cpu().numpy())
+            e_mask_np = np.zeros_like(pE.detach().cpu().numpy())
+
+            X_t_np = X_t_label.detach().cpu().numpy().squeeze(-1).astype(np.int64)
+            E_t_np = E_t_label.detach().cpu().numpy().squeeze(-1).astype(np.int64)
+            x1_np = x1_idxs.detach().cpu().numpy().squeeze(-1).astype(np.int64)
+            e1_np = e1_idxs.detach().cpu().numpy().squeeze(-1).astype(np.int64)
+
+            bs = x_mask_np.shape[0]
+
+            # X: swap between current and target
+            for b in range(bs):
+                n = x_mask_np.shape[1]
+                for i in range(n):
+                    if X_t_np[b, i] == x1_np[b, i]:
+                        # Current == target: mark the absorbing state
+                        if x_masked_idx < dx:
+                            x_mask_np[b, i, x_masked_idx] = 1.0
+                    elif X_t_np[b, i] == x_masked_idx:
+                        # Current is the absorbing state: mark target
+                        x_mask_np[b, i, x1_np[b, i]] = 1.0
+
+            for b in range(bs):
+                ni, nj = e_mask_np.shape[1], e_mask_np.shape[2]
+                for i in range(ni):
+                    for j in range(nj):
+                        if E_t_np[b, i, j] == e1_np[b, i, j]:
+                            if e_masked_idx < de:
+                                e_mask_np[b, i, j, e_masked_idx] = 1.0
+                        elif E_t_np[b, i, j] == e_masked_idx:
+                            e_mask_np[b, i, j, e1_np[b, i, j]] = 1.0
+
+            x_mask = torch.tensor(x_mask_np)
+            e_mask = torch.tensor(e_mask_np)
+
+        else:
+            raise NotImplementedError(f"rdb type '{self.rdb}' not implemented")
+
+        Rdb_X = pX * x_mask * self.eta
+        Rdb_E = pE * e_mask * self.eta
+
+        return Rdb_X, Rdb_E
+
+    def _compute_R_tg(self, X_1_sampled, E_1_sampled, X_t_label, E_t_label,
+                      dfm_vars):
+        """Compute the target guidance rate R^tg."""
+        if self.omega == 0:
+            zeros_X = torch.zeros_like(dfm_vars['pt_vals_X'])
+            zeros_E = torch.zeros_like(dfm_vars['pt_vals_E'])
+            return zeros_X, zeros_E
+
+        dx = self.num_classes_X
+        de = self.num_classes_E
+
+        X1_oh = torch.nn.functional.one_hot(X_1_sampled, num_classes=dx).float()
+        E1_oh = torch.nn.functional.one_hot(E_1_sampled, num_classes=de).float()
+
+        X1_exp = torch.unsqueeze(X_1_sampled, dim=-1)
+        mask_X =( X1_exp != X_t_label).to(torch.float32)
+        E1_exp = torch.unsqueeze(E_1_sampled, dim=-1)
+        mask_E =( E1_exp != E_t_label).to(torch.float32)
+
+        numer_X = X1_oh * self.omega * mask_X
+        denom_X = torch.unsqueeze(dfm_vars['Z_X'], dim=-1) * dfm_vars['pt_at_Xt']
+        denom_X = torch.where(denom_X == 0, torch.ones_like(denom_X), denom_X)
+        Rtg_X = numer_X / denom_X
+
+        numer_E = E1_oh * self.omega * mask_E
+        denom_E = torch.unsqueeze(dfm_vars['Z_E'], dim=-1) * dfm_vars['pt_at_Et']
+        denom_E = torch.where(denom_E == 0, torch.ones_like(denom_E), denom_E)
+        Rtg_E = numer_E / denom_E
+
+        return Rtg_X, Rtg_E
+
+    def _stabilize(self, R_X, R_E, dfm_vars):
+        """Post-processing to avoid numerical instabilities."""
+        R_X = torch.nan_to_num(R_X, nan=0.0)
+        R_E = torch.nan_to_num(R_E, nan=0.0)
+
+        R_X = torch.where(R_X > 1e5, torch.zeros_like(R_X), R_X)
+        R_E = torch.where(R_E > 1e5, torch.zeros_like(R_E), R_E)
+
+        pt_at_Xt = dfm_vars['pt_at_Xt']
+        zero_mask_X =( pt_at_Xt == 0).to(torch.float32)
+        R_X = R_X * (1.0 - zero_mask_X)
+
+        pt_at_Et = dfm_vars['pt_at_Et']
+        zero_mask_E =( pt_at_Et == 0).to(torch.float32)
+        R_E = R_E * (1.0 - zero_mask_E)
+
+        pX = dfm_vars['pt_vals_X']
+        pE = dfm_vars['pt_vals_E']
+        col_mask_X =( pX == 0).to(torch.float32)
+        R_X = R_X * (1.0 - col_mask_X)
+        col_mask_E =( pE == 0).to(torch.float32)
+        R_E = R_E * (1.0 - col_mask_E)
+
+        return R_X, R_E
+
+
diff --git a/examples/defog/rdkit_functions.py b/examples/defog/rdkit_functions.py
new file mode 100644
index 000000000..dbfa6ce83
--- /dev/null
+++ b/examples/defog/rdkit_functions.py
@@ -0,0 +1,549 @@
+"""
+Graph-to-SMILES conversion and molecular evaluation metrics.
+Ported from DeFoG src/analysis/rdkit_functions.py for GammaGL (TensorLayerX).
+
+Provides:
+- build_molecule / build_molecule_with_partial_charges
+- mol2smiles
+- BasicMolecularMetrics (validity, uniqueness, novelty, relaxed validity)
+- check_stability / compute_molecular_metrics
+"""
+import re
+import numpy as np
+
+try:
+    from rdkit import Chem
+    use_rdkit = True
+except ImportError:
+    use_rdkit = False
+    import warnings
+    warnings.warn("rdkit not found, molecular evaluation will fail")
+
+# ============================================================
+# Molecular constants
+# ============================================================
+
+allowed_bonds = {
+    "H": 1, "C": 4, "N": 3, "O": 2, "F": 1,
+    "B": 3, "Al": 3, "Si": 4, "P": [3, 5], "S": 4,
+    "Cl": 1, "As": 3, "Br": 1, "I": 1,
+    "Hg": [1, 2], "Bi": [3, 5], "Se": [2, 4, 6],
+}
+
+bond_dict = [
+    None,
+    Chem.rdchem.BondType.SINGLE if use_rdkit else None,
+    Chem.rdchem.BondType.DOUBLE if use_rdkit else None,
+    Chem.rdchem.BondType.TRIPLE if use_rdkit else None,
+    Chem.rdchem.BondType.AROMATIC if use_rdkit else None,
+]
+
+ATOM_VALENCY = {6: 4, 7: 3, 8: 2, 9: 1, 15: 3, 16: 2, 17: 1, 35: 1, 53: 1}
+BOND_ORDER_BY_TYPE = {
+    0: 0.0,
+    1: 1.0,
+    2: 2.0,
+    3: 3.0,
+    4: 1.5,
+}
+
+
+def _bond_order_from_type(bond_type):
+    bond_type = int(bond_type)
+    return BOND_ORDER_BY_TYPE.get(bond_type, 0.0)
+
+
+# ============================================================
+# Core molecule building
+# ============================================================
+
+def mol2smiles(mol):
+    """RDKit Mol -> canonical SMILES string (or None)."""
+    if mol is None:
+        return None
+    try:
+        Chem.SanitizeMol(mol)
+    except ValueError:
+        return None
+    return Chem.MolToSmiles(mol)
+
+
+def build_molecule(atom_types, edge_types, atom_decoder, verbose=False):
+    """Build an RDKit RWMol from integer atom types and integer edge types.
+
+    Parameters
+    ----------
+    atom_types : array-like, shape (n,)
+        Integer node type indices.
+    edge_types : array-like, shape (n, n)
+        Integer edge type matrix (upper triangle used).
+    atom_decoder : list of str
+        Maps integer index -> element symbol.
+    verbose : bool
+
+    Returns
+    -------
+    rdkit.Chem.RWMol
+    """
+    mol = Chem.RWMol()
+    for atom in atom_types:
+        a = Chem.Atom(atom_decoder[int(atom)])
+        mol.AddAtom(a)
+
+    edge_types = np.triu(edge_types)
+    edge_types = np.copy(edge_types)
+    edge_types[edge_types >= len(bond_dict)] = 0  # virtual state → no bond
+    rows, cols = np.nonzero(edge_types)
+    for i, j in zip(rows, cols):
+        if i != j:
+            etype = int(edge_types[i, j])
+            if etype > 0:
+                mol.AddBond(int(i), int(j), bond_dict[etype])
+
+    return mol
+
+
+def build_molecule_with_partial_charges(atom_types, edge_types, atom_decoder,
+                                         verbose=False):
+    """Build molecule and add formal charges to fix valence errors."""
+    mol = Chem.RWMol()
+    for atom in atom_types:
+        a = Chem.Atom(atom_decoder[int(atom)])
+        mol.AddAtom(a)
+
+    edge_types = np.triu(edge_types)
+    edge_types = np.copy(edge_types)
+    edge_types[edge_types >= len(bond_dict)] = 0
+    rows, cols = np.nonzero(edge_types)
+
+    for i, j in zip(rows, cols):
+        if i != j:
+            etype = int(edge_types[i, j])
+            if etype > 0:
+                mol.AddBond(int(i), int(j), bond_dict[etype])
+            flag, atomid_valence = check_valency(mol)
+            if flag:
+                continue
+            else:
+                if atomid_valence and len(atomid_valence) >= 2:
+                    idx = atomid_valence[0]
+                    v = atomid_valence[1]
+                    an = mol.GetAtomWithIdx(idx).GetAtomicNum()
+                    if an in (7, 8, 16) and (v - ATOM_VALENCY.get(an, 0)) == 1:
+                        mol.GetAtomWithIdx(idx).SetFormalCharge(1)
+    return mol
+
+
+# ============================================================
+# GDSS valence helpers
+# ============================================================
+
+def check_valency(mol):
+    """Check valence validity. Returns (ok, atomid_valence_or_None)."""
+    try:
+        Chem.SanitizeMol(mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES)
+        return True, None
+    except ValueError as e:
+        e = str(e)
+        p = e.find("#")
+        e_sub = e[p:]
+        atomid_valence = list(map(int, re.findall(r"\d+", e_sub)))
+        return False, atomid_valence
+
+
+
+
+def check_stability(atom_types, edge_types, atom_decoder):
+    """Check molecular stability (correct bond counts per atom).
+
+    Returns (molecule_stable, n_stable_bonds, n_atoms).
+    """
+    n = len(atom_types)
+    n_bonds = np.zeros(n, dtype=np.float32)
+
+    for i in range(n):
+        for j in range(i + 1, n):
+            bond_type = int(round(float(edge_types[i, j] + edge_types[j, i]) / 2.0))
+            val = abs(_bond_order_from_type(bond_type))
+            n_bonds[i] += val
+            n_bonds[j] += val
+
+    n_stable = 0
+    for atom_type, n_bond in zip(atom_types, n_bonds):
+        possible = allowed_bonds.get(atom_decoder[int(atom_type)], 0)
+        if isinstance(possible, int):
+            is_stable = np.isclose(float(possible), float(n_bond))
+        else:
+            is_stable = any(np.isclose(float(candidate), float(n_bond))
+                            for candidate in possible)
+        n_stable += int(is_stable)
+
+    molecule_stable = n_stable == n
+    return molecule_stable, n_stable, n
+
+
+# ============================================================
+# BasicMolecularMetrics
+# ============================================================
+
+class BasicMolecularMetrics:
+    """Compute validity, uniqueness, novelty on generated molecules."""
+
+    def __init__(self, atom_decoder, train_smiles=None, remove_h=True):
+        self.atom_decoder = atom_decoder
+        self.train_smiles = train_smiles
+        self.remove_h = remove_h
+
+    def compute_validity(self, generated):
+        """Check which generated graphs yield valid SMILES.
+
+        Parameters
+        ----------
+        generated : list of (atom_types, edge_types) tuples
+            atom_types: ndarray (n,), edge_types: ndarray (n, n)
+
+        Returns
+        -------
+        valid : list of str
+            Valid SMILES (largest component).
+        validity : float
+            Fraction valid.
+        num_components : ndarray
+            Connected components per graph.
+        all_smiles : list
+            All SMILES (including None for invalid).
+        """
+        valid = []
+        num_components = []
+        all_smiles = []
+
+        for graph in generated:
+            atom_types, edge_types = graph
+            mol = build_molecule(atom_types, edge_types, self.atom_decoder)
+            smiles = mol2smiles(mol)
+
+            try:
+                mol_frags = Chem.rdmolops.GetMolFrags(mol, asMols=True, sanitizeFrags=True)
+                num_components.append(len(mol_frags))
+            except Exception:
+                pass
+
+            if smiles is not None:
+                try:
+                    mol_frags = Chem.rdmolops.GetMolFrags(mol, asMols=True, sanitizeFrags=True)
+                    largest_mol = max(mol_frags, default=mol, key=lambda m: m.GetNumAtoms())
+                    smiles = mol2smiles(largest_mol)
+                    valid.append(smiles)
+                    all_smiles.append(smiles)
+                except Chem.rdchem.AtomValenceException:
+                    all_smiles.append(None)
+                except Chem.rdchem.KekulizeException:
+                    all_smiles.append(None)
+            else:
+                all_smiles.append(None)
+
+        validity = len(valid) / max(len(generated), 1)
+        return valid, validity, np.array(num_components), all_smiles
+
+    def compute_uniqueness(self, valid):
+        """Deduplicate valid SMILES."""
+        unique = list(set(valid))
+        uniqueness = len(unique) / max(len(valid), 1)
+        return unique, uniqueness
+
+    def compute_novelty(self, unique):
+        """Check how many unique SMILES are not in training set."""
+        if self.train_smiles is None:
+            print("Dataset smiles is None, novelty computation skipped")
+            return [], 1.0
+        novel = [s for s in unique if s not in self.train_smiles]
+        novelty = len(novel) / max(len(unique), 1)
+        return novel, novelty
+
+    def compute_relaxed_validity(self, generated):
+        """Validity with partial charge correction."""
+        valid = []
+        for graph in generated:
+            atom_types, edge_types = graph
+            mol = build_molecule_with_partial_charges(
+                atom_types, edge_types, self.atom_decoder)
+            smiles = mol2smiles(mol)
+            if smiles is not None:
+                try:
+                    mol_frags = Chem.rdmolops.GetMolFrags(
+                        mol, asMols=True, sanitizeFrags=True)
+                    largest_mol = max(mol_frags, default=mol,
+                                      key=lambda m: m.GetNumAtoms())
+                    smiles = mol2smiles(largest_mol)
+                    valid.append(smiles)
+                except (Chem.rdchem.AtomValenceException,
+                        Chem.rdchem.KekulizeException):
+                    pass
+        relaxed_validity = len(valid) / max(len(generated), 1)
+        return valid, relaxed_validity
+
+    def evaluate(self, generated):
+        """Run all metrics.
+
+        Parameters
+        ----------
+        generated : list of (atom_types, edge_types)
+
+        Returns
+        -------
+        metrics : list [validity, relaxed_validity, uniqueness, novelty]
+        unique : list of str
+        nc_dict : dict (nc_min, nc_max, nc_mu)
+        all_smiles : list
+        """
+        valid, validity, num_components, all_smiles = self.compute_validity(generated)
+
+        nc_mu = float(num_components.mean()) if len(num_components) > 0 else 0
+        nc_min = float(num_components.min()) if len(num_components) > 0 else 0
+        nc_max = float(num_components.max()) if len(num_components) > 0 else 0
+
+        print(f"  Validity: {validity * 100:.2f}%  ({len(generated)} molecules)")
+        print(f"  Connected components: min={nc_min:.2f} mean={nc_mu:.2f} max={nc_max:.2f}")
+
+        relaxed_valid, relaxed_validity = self.compute_relaxed_validity(generated)
+        print(f"  Relaxed validity: {relaxed_validity * 100:.2f}%")
+
+        if relaxed_validity > 0:
+            unique, uniqueness = self.compute_uniqueness(relaxed_valid)
+            print(f"  Uniqueness: {uniqueness * 100:.2f}%  ({len(relaxed_valid)} valid)")
+
+            if self.train_smiles is not None:
+                _, novelty = self.compute_novelty(unique)
+                print(f"  Novelty: {novelty * 100:.2f}%  ({len(unique)} unique)")
+            else:
+                novelty = -1.0
+        else:
+            novelty = -1.0
+            uniqueness = 0.0
+            unique = []
+
+        return (
+            [validity, relaxed_validity, uniqueness, novelty],
+            unique,
+            dict(nc_min=nc_min, nc_max=nc_max, nc_mu=nc_mu),
+            all_smiles,
+        )
+
+
+# ============================================================
+# Top-level molecular metrics function
+# ============================================================
+
+def compute_molecular_metrics(molecule_list, train_smiles, atom_decoder,
+                               remove_h=True):
+    """Compute all molecular metrics on generated graphs.
+
+    Parameters
+    ----------
+    molecule_list : list of (atom_types, edge_types)
+        Generated graphs.
+    train_smiles : list of str or None
+        Training set SMILES for novelty computation.
+    atom_decoder : list of str
+        Element symbol for each node type index.
+    remove_h : bool
+        Whether hydrogen was removed from the dataset.
+
+    Returns
+    -------
+    stability_dict : dict
+        mol_stable, atm_stable (or -1 if remove_h).
+    rdkit_metrics : tuple from BasicMolecularMetrics.evaluate().
+    all_smiles : list
+    summary : dict
+    """
+    if not remove_h:
+        print("Analyzing molecule stability...")
+        molecule_stable = 0
+        nr_stable_bonds = 0
+        n_atoms = 0
+
+        for mol_data in molecule_list:
+            atom_types, edge_types = mol_data
+            s, nb, na = check_stability(atom_types, edge_types, atom_decoder)
+            molecule_stable += int(s)
+            nr_stable_bonds += int(nb)
+            n_atoms += int(na)
+
+        stability_dict = {
+            "mol_stable": molecule_stable / max(len(molecule_list), 1),
+            "atm_stable": nr_stable_bonds / max(n_atoms, 1),
+        }
+    else:
+        stability_dict = {"mol_stable": -1, "atm_stable": -1}
+
+    metrics = BasicMolecularMetrics(atom_decoder, train_smiles, remove_h)
+    rdkit_metrics = metrics.evaluate(molecule_list)
+    metric_values, unique_smiles, nc, all_smiles = rdkit_metrics
+
+    summary = {
+        "Validity": metric_values[0],
+        "Relaxed Validity": metric_values[1],
+        "Uniqueness": metric_values[2],
+        "Novelty": metric_values[3],
+        "nc_max": nc["nc_max"],
+        "nc_mu": nc["nc_mu"],
+    }
+
+    return stability_dict, rdkit_metrics, all_smiles, summary
+
+
+# ============================================================
+# FCD (Frechet ChemNet Distance)
+# ============================================================
+
+def compute_fcd(generated_smiles, reference_smiles):
+    """Compute Frechet ChemNet Distance between generated and reference SMILES.
+
+    Parameters
+    ----------
+    generated_smiles : list of str
+        Generated canonical SMILES (None entries will be filtered).
+    reference_smiles : list of str
+        Reference (test/val) canonical SMILES.
+
+    Returns
+    -------
+    float
+        FCD score, or -1 if computation fails.
+    """
+    try:
+        from fcd import get_fcd
+    except ImportError:
+        print("  FCD: fcd package not installed. Install with: pip install fcd")
+        return -1.0
+
+    import time
+    print("  Starting FCD computation...")
+    start = time.time()
+
+    generated_smiles = [s for s in generated_smiles if s is not None]
+    try:
+        fcd_score = get_fcd(generated_smiles, reference_smiles)
+    except Exception as e:
+        print(f"  Error in FCD computation: {e}. Setting FCD to -1.")
+        fcd_score = -1.0
+
+    elapsed = time.time() - start
+    print(f"  FCD: {fcd_score:.4f} (took {elapsed:.1f}s)")
+    return fcd_score
+
+
+# ============================================================
+# Distribution MAE metrics (node/edge/valency/n distribution)
+# ============================================================
+
+def compute_distribution_metrics(generated_graphs, dataset_infos, dataset_name):
+    """Compute distribution MAE metrics: node count, atom type, bond type, valency.
+
+    Parameters
+    ----------
+    generated_graphs : list of (X_int, E_int) tuples
+        Generated graphs.
+    dataset_infos : dict
+        Dataset statistics with keys 'max_n_nodes', 'node_types', 'edge_types'.
+    dataset_name : str
+        Dataset name for atom decoder lookup.
+
+    Returns
+    -------
+    dict
+        Distribution MAE metrics.
+    """
+    max_n = dataset_infos['max_n_nodes']
+    num_node_types = len(dataset_infos['node_types'])
+    num_edge_types = len(dataset_infos['edge_types'])
+
+    # Reference distributions (normalized)
+    ref_node_types = dataset_infos['node_types'].copy()
+    ref_node_types = ref_node_types / (ref_node_types.sum() + 1e-8)
+
+    ref_edge_types = dataset_infos['edge_types'].copy()
+    ref_edge_types = ref_edge_types / (ref_edge_types.sum() + 1e-8)
+
+    # Node count distribution
+    ref_node_dist = dataset_infos['node_dist'].copy()
+
+    # Accumulate generated distributions
+    gen_n_dist = np.zeros_like(ref_node_dist)
+    gen_node_dist = np.zeros(num_node_types, dtype=np.float32)
+    gen_edge_dist = np.zeros(num_edge_types, dtype=np.float32)
+    gen_valency_dist = np.zeros(max(3 * max_n - 2, 1), dtype=np.float32)
+    bond_orders = np.array([0.0, 1.0, 2.0, 3.0, 1.5], dtype=np.float32)
+
+    for (x, e) in generated_graphs:
+        n = len(x)
+        if n < len(gen_n_dist):
+            gen_n_dist[n] += 1
+
+        # Atom type distribution
+        for atom_type in x:
+            at = int(atom_type)
+            if 0 <= at < num_node_types:
+                gen_node_dist[at] += 1
+
+        # Bond type distribution (upper triangle only)
+        for i in range(n):
+            for j in range(i + 1, n):
+                bt = int(e[i, j])
+                if 0 <= bt < num_edge_types:
+                    gen_edge_dist[bt] += 1
+
+        # Valency distribution
+        for i in range(n):
+            val = 0.0
+            for j in range(n):
+                bt = int(e[i, j])
+                if 0 <= bt < len(bond_orders):
+                    val += bond_orders[bt]
+            val_idx = int(val)
+            if 0 <= val_idx < len(gen_valency_dist):
+                gen_valency_dist[val_idx] += 1
+
+    # Normalize
+    gen_n_dist_norm = gen_n_dist / (gen_n_dist.sum() + 1e-8)
+    ref_n_dist_norm = ref_node_dist / (ref_node_dist.sum() + 1e-8)
+
+    gen_node_norm = gen_node_dist / (gen_node_dist.sum() + 1e-8)
+    gen_edge_norm = gen_edge_dist / (gen_edge_dist.sum() + 1e-8)
+    gen_val_norm = gen_valency_dist / (gen_valency_dist.sum() + 1e-8)
+
+    ref_val_norm = np.zeros_like(gen_valency_dist)
+    ref_valency = dataset_infos.get('valency_distribution')
+    if ref_valency is not None:
+        ref_valency = np.asarray(ref_valency, dtype=np.float32).reshape(-1)
+        ref_len = min(len(ref_valency), len(ref_val_norm))
+        ref_val_norm[:ref_len] = ref_valency[:ref_len]
+        ref_val_norm = ref_val_norm / (ref_val_norm.sum() + 1e-8)
+    else:
+        # Fallback for datasets without a stored reference valency histogram.
+        ref_val_norm[0] = ref_node_types[0]
+        ref_val_norm = ref_val_norm / (ref_val_norm.sum() + 1e-8)
+
+    metrics = {}
+
+    # Node count MAE
+    min_len = min(len(gen_n_dist_norm), len(ref_n_dist_norm))
+    metrics['n_dist_mae'] = float(np.abs(
+        gen_n_dist_norm[:min_len] - ref_n_dist_norm[:min_len]).mean())
+
+    # Atom type MAE
+    min_len = min(len(gen_node_norm), len(ref_node_types))
+    metrics['node_dist_mae'] = float(np.abs(
+        gen_node_norm[:min_len] - ref_node_types[:min_len]).mean())
+
+    # Bond type MAE
+    min_len = min(len(gen_edge_norm), len(ref_edge_types))
+    metrics['edge_dist_mae'] = float(np.abs(
+        gen_edge_norm[:min_len] - ref_edge_types[:min_len]).mean())
+
+    # Valency MAE
+    metrics['valency_dist_mae'] = float(np.abs(
+        gen_val_norm - ref_val_norm).mean())
+
+    return metrics
diff --git a/examples/defog/readme.md b/examples/defog/readme.md
new file mode 100644
index 000000000..0a8adedae
--- /dev/null
+++ b/examples/defog/readme.md
@@ -0,0 +1,362 @@
+# DeFoG: Discrete Flow Matching for Graph Generation
+
+- Paper: [https://arxiv.org/abs/2410.04263](https://arxiv.org/abs/2410.04263)
+- Original code: [https://github.com/manuelmlmadeira/DeFoG](https://github.com/manuelmlmadeira/DeFoG)
+
+This directory contains the GammaGL reproduction of DeFoG. The current implementation uses a shared training / validation-sampling / final-sampling pipeline, with validation-driven best-checkpoint selection and optional EMA swapping during evaluation and sampling.
+
+## Supported Datasets
+
+| Dataset | Type | Node Types | Edge Types | Conditional |
+|---------|------|------------|------------|-------------|
+| synthetic | Synthetic | configurable | configurable | -- |
+| planar | Synthetic | 2 | 2 | -- |
+| tree | Synthetic | 2 | 2 | -- |
+| sbm | Synthetic | 2 | 2 | -- |
+| qm9 | Molecular | 4 | 5 | mu / homo / both |
+| guacamol | Molecular | 12 | 5 | -- |
+| zinc250k | Molecular | 9 | 4 | -- |
+| moses | Molecular | 8 | 5 | -- |
+| tls | Molecular | 9 | 2 | half-half |
+
+## File Structure
+
+| File | Description |
+| ---- | ----------- |
+| `defog_trainer.py` | Main entry for training, validation sampling, final sampling, and evaluation |
+| `defog_utils.py` | Dense conversion, placeholder utilities, EMA, backend helpers |
+| `flow_matching.py` | Mathematical engine for CTMC rate matrix, discrete flow matching, noise distribution, and time distorter |
+
+| `dataset_utils.py` | Dataset loading and dataset-info computation |
+| `evaluator.py` | Evaluation driver (validity, uniqueness, novelty, FCD, SPECTRE) |
+| `sampler.py` | CTMC forward / backward sampling |
+| `extra_features.py` | Structural and molecular features (RRWP, cycles, eigen features, etc.) |
+| `train_metrics.py` | Cross-entropy / KLD training metrics |
+| `rdkit_functions.py` | Molecular metrics and SMILES helpers |
+| `spectre_utils.py` | Synthetic graph evaluation |
+| `multi_gpu.py` | Multi-GPU training helpers |
+
+## Model Components
+
+| File | Description |
+| ---- | ----------- |
+| `gammagl/models/defog.py` | `DeFoGModel` Graph Transformer denoiser |
+| `gammagl/layers/attention/defog_layer.py` | `XEyTransformerLayer`, `NodeEdgeBlock`, `Xtoy`, `Etoy` |
+
+## Preset Behavior
+
+For named datasets (`planar`, `tree`, `sbm`, `qm9`, `guacamol`, `zinc250k`, `moses`), `defog_trainer.py` and `defog_sample_only.py` automatically apply DeFoG-aligned dataset presets through `apply_dataset_preset()`.
+
+- Presets set dataset-specific values such as `n_layers`, `batch_size`, `sample_steps`, validation cadence, and sampling distortion.
+- Explicit CLI flags override the preset values.
+- `tls` and `synthetic` do not receive these preset overrides.
+
+## Checkpoint Semantics
+
+Training writes paired model / EMA snapshots:
+
+- `last_model.npz` / `last_ema.pkl`
+- `best_model.npz` / `best_ema.pkl`
+
+Best-checkpoint selection is validation-driven:
+
+1. save `last`
+2. run validation sampling
+3. evaluate generated graphs
+4. compute `selection_score`
+5. update `best` if the validation score improves
+
+Sampling prefers `best_model.npz`; if it does not exist, it falls back to `last_model.npz`.
+
+## Dependencies & Architecture Boundary
+
+**GammaGL Core Integration:**
+- The graph neural network structure (`gammagl/models/defog.py`, `gammagl/layers/attention/defog_layer.py`) and dataset classes (`gammagl/datasets/...`) are strictly isolated from heavy domain-specific dependencies.
+- You do **not** need `rdkit`, `graph-tool`, or `networkx` to import GammaGL core modules. They are gracefully caught or lazily imported.
+
+**Examples Sandbox (`examples/defog`):**
+- **Why is there so much custom code?** DeFoG is the *first* Discrete Flow Matching model in GammaGL. It requires a Continuous-Time Markov Chain (CTMC) flow matching solver, discrete loss computation, and complex evaluation protocols that are not standard classification/regression tasks. To avoid bloating the GammaGL core with flow-matching specifics, these components are kept in `examples/defog`.
+- **Optional Dependencies (see `requirements.txt`)**:
+  - `rdkit` and `fcd`: Required **only** for molecular dataset evaluation. If missing, it will gracefully warn and skip molecular metrics.
+  - `graph-tool` and `scipy`: Required **only** for SBM graph validation. If missing, SBM evaluation will raise an `ImportError` instructing the user to install them (preferably via `conda install -c conda-forge graph-tool scipy`).
+  - `pyemd`, `networkx`, `scipy`: Used for SPECTRE evaluation metrics.
+- Running complete generation evaluation on molecular datasets requires `rdkit` and potentially `orca` / `graph-tool` as specified by the original authors.
+
+**Supported Backend:**
+- This implementation currently supports **`TL_BACKEND=torch` only**. Other TensorLayerX backends (TensorFlow, PaddlePaddle, MindSpore) have not been tested and are not guaranteed to work. The model operations rely on specific PyTorch sparse and broadcasting behaviors.
+- Note: `multi_gpu.py` is an experimental, pure PyTorch multi-GPU wrapper intended only for users with heavy Torch environments.
+
+## Quick Start / Minimal Smoke Test
+
+We provide minimal, CPU-friendly smoke test commands (1 epoch, minimal synthetic data). This verifies that the entire flow-matching training loop runs properly on the currently supported **TensorLayerX `torch` backend** (`TL_BACKEND=torch`) without needing GPU or heavy dependencies like `rdkit`/`graph-tool`.
+
+```bash
+cd examples/defog
+
+# 方案A：两步（训练 + 采样）
+# 先训练 1 epoch
+TL_BACKEND="torch" python defog_trainer.py \
+  --dataset synthetic \
+  --n_layers 1 \
+  --n_epochs 1 \
+  --batch_size 2 \
+  --num_graphs 10 \
+  --gpu -1
+
+# 再基于刚保存的 checkpoint 做最小采样验证
+TL_BACKEND="torch" python defog_sample_only.py \
+  --dataset synthetic \
+  --n_layers 1 \
+  --sample_steps 2 \
+  --num_samples 2 \
+  --gpu -1 \
+  --model_path checkpoints/last_model.npz
+```
+
+## Advanced Examples
+
+Run commands from `examples/defog`.
+
+```bash
+# Quick smoke test on synthetic data
+TL_BACKEND="torch" python defog_trainer.py \
+  --n_layers 2 \
+  --n_epochs 3 \
+  --batch_size 4 \
+  --sample \
+  --sample_steps 5 \
+  --num_samples 3 \
+  --num_graphs 20
+
+# Planar training with preset hyperparameters (single run)
+TL_BACKEND="torch" python defog_trainer.py --dataset planar --data_root ./datasets --sample --evaluate
+
+# Planar 3-seed reproduction
+for seed in 43 44 45; do
+  TL_BACKEND="torch" python defog_trainer.py \
+    --dataset planar \
+    --data_root ./datasets \
+    --seed $seed \
+    --save_dir ./checkpoints_planar_seed${seed} \
+    --sample \
+    --evaluate
+done
+
+## Dependency Management
+To keep GammaGL lightweight, DeFoG evaluates synthetic graphs and molecules using specialized external libraries which are **not** installed by default.
+
+### Optional Evaluation Dependencies
+If you want to perform full evaluation on `spectre` or molecular datasets (`qm9`, `zinc250k`):
+```bash
+# For synthetic graph evaluation (SPECTRE)
+pip install pyemd scipy networkx
+
+# For molecular evaluation (QM9, ZINC250k)
+pip install rdkit fcd
+```
+If these dependencies are missing, the training will still run normally but the evaluation metrics will be skipped and output `-1` or `NaN`.
+
+## Minimal CPU Smoke Test
+You can verify the model is functioning correctly without any heavy dependencies by running a minimal smoke test on a small synthetic dataset:
+```bash
+TL_BACKEND="torch" python defog_trainer.py --dataset synthetic --n_epochs 1 --batch_size 2 --sample --sample_steps 2 --num_graphs 4 --n_layers 2 --gpu -1 --data_root ./_review_data --save_dir ./_review_outputs
+```
+Expected output will show the dataset building dynamically and the loss being printed, followed by completion without crashing. Alternatively, you can run the provided smoke test script:
+```bash
+python tests/models/test_defog_smoke.py
+```
+This ensures that the GammaGL core layers and flow matching engine are backend-neutral and do not suffer from any hard `torch` or `rdkit` import issues.
+
+# Tree / SBM training with preset hyperparameters
+TL_BACKEND="torch" python defog_trainer.py --dataset tree --data_root ./datasets --sample --evaluate
+TL_BACKEND="torch" python defog_trainer.py --dataset sbm --data_root ./datasets --sample --evaluate
+
+# QM9 training with preset hyperparameters
+TL_BACKEND="torch" python defog_trainer.py --dataset qm9 --data_root ./datasets --sample --evaluate
+
+# QM9 conditional generation
+TL_BACKEND="torch" python defog_trainer.py \
+  --dataset qm9 \
+  --data_root ./datasets \
+  --conditional \
+  --target mu \
+  --guidance_weight 2.0 \
+  --sample \
+  --evaluate
+
+# Sampling only from an existing checkpoint directory
+TL_BACKEND="torch" python defog_sample_only.py \
+  --dataset planar \
+  --data_root ./datasets \
+  --save_dir ./checkpoints_planar \
+  --evaluate
+
+# EMA + multi-fold sampling evaluation
+TL_BACKEND="torch" python defog_sample_only.py \
+  --dataset qm9 \
+  --data_root ./datasets \
+  --save_dir ./checkpoints_qm9 \
+  --ema_decay 0.999 \
+  --num_sample_fold 3 \
+  --evaluate
+```
+
+## Benchmark Results
+
+### Planar (3 seeds, completed)
+
+Trained for 100,000 epochs each. Results compared against the DeFoG paper (Table 7).
+
+| Metric | Paper (DeFoG) | Seed 0 | Seed 1 | Seed 2 | Mean ± Std |
+|--------|---------------|--------|--------|--------|------------|
+| Valid ↑ | 99.5 ± 1.0 | 99.0 | 97.7 | 99.5 | **98.7 ± 0.8** |
+| Unique ↑ | 100.0 ± 0.0 | 100.0 | 100.0 | 100.0 | **100.0 ± 0.0** |
+| Non-iso ↑ | 100.0 ± 0.0 | 100.0 | 100.0 | 100.0 | **100.0 ± 0.0** |
+| Planar Acc ↑ | — | 99.0 | 97.7 | 99.5 | **98.7 ± 0.8** |
+| Degree ↓ | 0.0005 ± 0.0002 | 0.000032 | 0.000038 | 0.000511 | **0.000194 ± 0.000221** |
+| Spectre ↓ | 0.0072 ± 0.0011 | 0.004791 | 0.004527 | 0.004335 | **0.004551 ± 0.000189** |
+| Clustering ↓ | 0.0501 ± 0.0149 | 0.020626 | 0.018320 | 0.037029 | **0.025325 ± 0.008415** |
+| Orbit ↓ | 0.0006 ± 0.0004 | 0.000059 | 0.001910 | 0.000223 | **0.000731 ± 0.000824** |
+| Wavelet ↓ | 0.0014 ± 0.0002 | 0.000016 | 0.000139 | 0.000167 | **0.000107 ± 0.000066** |
+
+### Tree (3 seeds, completed)
+
+Trained for 100,000 epochs each. Best checkpoint evaluated with 40 samples, 1000 denoising steps.
+
+| Metric | Paper (DeFoG) | Seed 0 (best) | Seed 1 | Seed 2 | Mean ± Std |
+|--------|---------------|---------------|--------|--------|------------|
+| Valid ↑ | 100.0 | 100.0 | 95.0 | 100.0 | **98.3 ± 2.4** |
+| Unique ↑ | 100.0 | 82.5 | 87.5 | 85.0 | **85.0 ± 2.0** |
+| Non-iso ↑ | 100.0 | 100.0 | 100.0 | 100.0 | **100.0 ± 0.0** |
+| Tree Acc ↑ | 100.0 | 100.0 | 95.0 | 100.0 | **98.3 ± 2.4** |
+| Degree ↓ | — | 0.000575 | 0.000582 | 0.000287 | **0.000481 ± 0.000137** |
+| Spectre ↓ | — | 0.010322 | 0.011042 | 0.011507 | **0.010957 ± 0.000488** |
+| Clustering ↓ | — | 0.000000 | 0.000000 | 0.000000 | **0.000000 ± 0.000000** |
+| Orbit ↓ | — | 0.000007 | 0.000013 | 0.000000 | **0.000007 ± 0.000005** |
+| Wavelet ↓ | — | 0.000613 | 0.000518 | 0.000586 | **0.000572 ± 0.000039** |
+
+*Training command (per seed):*
+```bash
+TL_BACKEND="torch" python defog_trainer.py \
+  --dataset tree \
+  --data_root ./datasets \
+  --save_dir ./checkpoints_tree_seed${seed}_final \
+  --seed ${seed} \
+  --gpu 0 \
+  --n_layers 10 --hidden_mlp_X 128 --hidden_mlp_E 64 --hidden_mlp_y 128 \
+  --dx 256 --de 64 --dy 64 --dim_ffX 256 --dim_ffE 64 --dim_ffy 256 \
+  --n_head 8 --n_epochs 100000 --batch_size 64 --lr 2e-4 \
+  --train_distortion polydec --sample_distortion polydec \
+  --omega 0 --eta 0 --sample_steps 1000 \
+  --check_val_every_n_epochs 2000 --sample_every_val 1 --val_num_samples 40
+```
+
+### QM9 no-H (3 seeds, completed)
+
+Trained for 1,000 epochs each without explicit hydrogens. Results compared against the DeFoG paper (no-H, 500 steps). **Best checkpoint evaluated with 10,000 samples, 500 denoising steps.**
+
+| Metric | Paper (DeFoG) | Seed 0 | Seed 1 | Seed 2 | Mean ± Std |
+|--------|---------------|--------|--------|--------|------------|
+| Validity ↑ | 99.3 ± 0.0 | 96.22 | 99.38 | 99.01 | **98.20 ± 1.41** |
+| Relaxed Validity ↑ | 99.4 ± 0.1 | 97.19 | 99.57 | 99.18 | **98.65 ± 1.04** |
+| Uniqueness ↑ | 96.3 ± 0.3 | 97.47 | 96.35 | 96.40 | **96.74 ± 0.52** |
+| Novelty ↑ | — | 59.54 | 33.17 | 33.47 | **42.06 ± 12.36** |
+| FCD ↓ | 0.12 ± 0.00 | 0.5782 | 0.1202 | 0.1033 | **0.2672 ± 0.2199** |
+
+*Training command:*
+```bash
+TL_BACKEND="torch" python defog_trainer.py \
+  --dataset qm9 \
+  --data_root ./datasets \
+  --save_dir ./checkpoints_qm9_noh_seed0_final \
+  --seed 0 \
+  --gpu 5 \
+  --n_layers 9 \
+  --n_epochs 1000 \
+  --batch_size 1024 \
+  --lr 2e-4 \
+  --train_distortion identity \
+  --sample_distortion polydec \
+  --sample_steps 500 \
+  --omega 0 \
+  --eta 0 \
+  --check_val_every_n_epochs 50 \
+  --sample_every_val 1 \
+  --val_num_samples 512 \
+  --remove_h
+```
+
+## Important Parameters
+
+The parser-level defaults are generic. For named datasets, presets may replace them unless you pass an explicit CLI override.
+
+| Parameter | Parser Default | Description |
+| --------- | -------------- | ----------- |
+| `--dataset` | `synthetic` | Dataset name |
+| `--data_root` | `None` | Root directory for real datasets |
+| `--seed` | `42` | Random seed |
+| `--use_defog_split` | off | Use DeFoG original CSV split for QM9 instead of random split |
+| `--remove_h` / `--with_h` | `None` | Use QM9 without/with hydrogens |
+| `--conditional` | off | Enable classifier-free guidance (QM9 only) |
+| `--target` | `mu` | Conditional target: `mu` / `homo` / `both` / `k2` |
+| `--guidance_weight` | `2.0` | CFG weight |
+| `--n_layers` | `5` | Transformer depth |
+| `--batch_size` | `32` | Training batch size |
+| `--sample_batch_size` | `0` | Sampling batch size (0 = use num_samples) |
+| `--n_epochs` | `100` | Training epochs |
+| `--lr` | `2e-4` | Learning rate |
+| `--weight_decay` | `1e-12` | AdamW weight decay |
+| `--ema_decay` | `0.0` | EMA decay (`0` disables EMA) |
+| `--grad_clip_norm` | `None` | Gradient clipping norm (disabled by default) |
+| `--kld` | off | Use KLD for node / edge losses |
+| `--lambda_E` | `5.0` | Edge loss weight |
+| `--lambda_y` | `0.0` | Global-property loss weight |
+| `--transition` | `marginal` | Noise transition |
+| `--extra_features` | `rrwp` | Extra structural features |
+| `--rrwp_steps` | `12` | RRWP steps |
+| `--train_distortion` | `identity` | Training time distortion |
+| `--sample` | off | Run final sampling after training |
+| `--evaluate` | off | Evaluate generated graphs |
+| `--sample_steps` | `100` | Number of denoising steps |
+| `--sample_distortion` | `identity` | Sampling time distortion |
+| `--num_samples` | `20` | Number of generated graphs |
+| `--num_sample_fold` | `1` | Number of sampling folds |
+| `--sample_every_val` | `0` | Run validation sampling every N validation events |
+| `--check_val_every_n_epochs` | `0` | Run validation cadence every N epochs |
+| `--val_num_samples` | `40` | Number of samples used in validation selection |
+| `--eta` | `0.0` | R^db strength |
+| `--omega` | `0.0` | R^tg strength |
+| `--rdb` | `general` | RDB design |
+| `--rdb_crit` | `max_marginal` | RDB sub-criterion |
+| `--save_dir` | `./checkpoints` | Output checkpoint directory |
+| `--resume_from` | `None` | Checkpoint directory to resume from (loads last_model.npz) |
+| `--start_epoch` | `0` | Epoch to start resuming from |
+
+## Evaluation Outputs
+
+### Synthetic (`planar`, `tree`, `sbm`)
+Typical outputs include:
+
+- `planar_acc` / `tree_acc`
+- `frac_unique`
+- `frac_non_iso`
+- `frac_unique_non_iso`
+- `frac_unic_non_iso_valid`
+- compatibility aliases under `sampling/...`
+- `selection_score`
+
+### Molecular (`qm9`, `guacamol`, `zinc250k`, `moses`, `tls`)
+Typical outputs include:
+
+- `Validity`
+- `Relaxed Validity`
+- `Uniqueness`
+- `Novelty`
+- `fcd`
+- distribution MAE terms
+- `selection_score`
+
+## Notes
+
+- `defog_sample_only.py` must use model hyperparameters compatible with the saved checkpoint. If you rely on dataset presets, keep the dataset name consistent with the training run.
+- For reproducibility checks, prefer evaluating checkpoints produced by the current training code rather than mixing in older checkpoints created before the validation / checkpoint / metric-key fixes.
diff --git a/examples/defog/requirements.txt b/examples/defog/requirements.txt
new file mode 100644
index 000000000..9edd4e2a0
--- /dev/null
+++ b/examples/defog/requirements.txt
@@ -0,0 +1,15 @@
+# Core Optional Dependencies for DeFoG Example
+# These are required for generating features or evaluating molecular/graph properties.
+
+# RDKit is strictly required for evaluating generated molecular validity (qm9, zinc)
+rdkit
+
+# Graph-Tool is strictly required for SBM structural community validation 
+# (Note: graph-tool installation via conda is recommended due to C++ compilation requirements)
+# conda install -c conda-forge graph-tool
+graph-tool
+
+# NetworkX is used across many heuristic verifications
+networkx
+
+
diff --git a/examples/defog/sampler.py b/examples/defog/sampler.py
new file mode 100644
index 000000000..3782fbbf8
--- /dev/null
+++ b/examples/defog/sampler.py
@@ -0,0 +1,307 @@
+"""CTMC sampling loop for DeFoG (Discrete Flow-matching for Graph Generation).
+
+This module contains the core sampling logic extracted from defog_trainer.py,
+including the CTMC one-step transition probability computation,
+classifier-free guidance helpers, and the main batch sampling loop.
+"""
+
+import os
+import numpy as np
+import tensorlayerx as tlx
+
+from defog_utils import PlaceHolder, apply_node_mask
+import torch.nn.functional as F
+from flow_matching import sample_discrete_features, sample_discrete_feature_noise
+from extra_features import compute_extra_data
+
+
+def compute_step_probs(R_t_X, R_t_E, X_t, E_t, dt):
+    r"""Convert rate matrices to one-step CTMC transition probabilities.
+
+    Matches the original DeFoG logic: zero the current-state column first,
+    then write back the stay probability so rows sum to 1.
+    """
+    step_X = R_t_X * dt
+    step_E = R_t_E * dt
+
+    cur_X = tlx.argmax(X_t, axis=-1)
+    cur_E = tlx.argmax(E_t, axis=-1)
+
+    # TODO: Performance Optimization
+    # Currently converting tensors to numpy for inplace modifications (setting current state 
+    # to 0 and writing back stay probability). Doing this entirely in pure tensors would 
+    # prevent CPU-GPU synchronization bottlenecks during sampling.
+    step_X_np = tlx.convert_to_numpy(step_X)
+    step_E_np = tlx.convert_to_numpy(step_E)
+    cur_X_np = tlx.convert_to_numpy(cur_X).astype(np.int64)
+    cur_E_np = tlx.convert_to_numpy(cur_E).astype(np.int64)
+
+    bs, n, dx = step_X_np.shape
+    _, n1, n2, de = step_E_np.shape
+
+    step_X_np[np.arange(bs)[:, None], np.arange(n)[None, :], cur_X_np] = 0.0
+    stay_X = np.clip(1.0 - step_X_np.sum(axis=-1, keepdims=True), a_min=0.0, a_max=None)
+    step_X_np[np.arange(bs)[:, None], np.arange(n)[None, :], cur_X_np] = stay_X[..., 0]
+
+    b_idx = np.arange(bs)[:, None, None]
+    i_idx = np.arange(n1)[None, :, None]
+    j_idx = np.arange(n2)[None, None, :]
+    step_E_np[b_idx, i_idx, j_idx, cur_E_np] = 0.0
+    stay_E = np.clip(1.0 - step_E_np.sum(axis=-1, keepdims=True), a_min=0.0, a_max=None)
+    step_E_np[b_idx, i_idx, j_idx, cur_E_np] = stay_E[..., 0]
+
+    prob_X = tlx.convert_to_tensor(step_X_np.astype(np.float32))
+    prob_E = tlx.convert_to_tensor(step_E_np.astype(np.float32))
+    return prob_X, prob_E
+
+
+def _cfg_unconditional_pred(model, X_in, E_in, extra_data, y_t, node_mask):
+    r"""Compute unconditional model predictions for classifier-free guidance."""
+    y_uncond = tlx.ones_like(y_t) * (-1.0)
+    y_in_uncond = tlx.concat([y_uncond, extra_data.y], axis=-1)
+    
+    import torch
+    with torch.no_grad():
+        pred_X_u, pred_E_u, _ = model(X_in, E_in, y_in_uncond, node_mask)
+    pred_X_soft_u = tlx.softmax(pred_X_u, axis=-1)
+    pred_E_soft_u = tlx.softmax(pred_E_u, axis=-1)
+    return pred_X_soft_u, pred_E_soft_u
+
+
+def sample_batch(model, noise_dist, rate_matrix_designer, time_distorter,
+                 extra_features, domain_features, node_dist,
+                 sample_steps, batch_size, num_nodes=None,
+                 conditional=False, cond_labels=None, guidance_weight=2.0):
+    r"""Generate graphs via CTMC sampling.
+
+    Parameters
+    ----------
+    model : DeFoGModel
+        The trained denoiser model.
+    noise_dist : NoiseDistribution
+        Noise distribution.
+    rate_matrix_designer : RateMatrixDesigner
+        Rate matrix computer.
+    time_distorter : TimeDistorter
+        Time distortion for sampling.
+    extra_features : callable
+        Structural extra features.
+    domain_features : callable
+        Domain-specific extra features.
+    node_dist : ndarray
+        Distribution over number of nodes.
+    sample_steps : int
+        Number of sampling steps.
+    batch_size : int
+        Number of graphs to generate.
+    num_nodes : list, optional
+        Pre-specified number of nodes per graph.
+    conditional : bool
+        Whether to use classifier-free guidance.
+    cond_labels : tensor, optional
+        Conditional labels ``(batch_size, n_cond)`` for guided generation.
+    guidance_weight : float
+        Classifier-free guidance weight. Default 2.0.
+
+    Returns
+    -------
+    list
+        List of tuples ``(X_int, E_int)`` for each generated graph.
+    """
+    model.set_eval()
+    import torch
+    limit_dist = noise_dist.get_limit_dist()
+
+    # Sample number of nodes
+    if num_nodes is None:
+        p = node_dist / node_dist.sum()
+        n_nodes = np.random.choice(len(node_dist), size=batch_size, p=p)
+    else:
+        n_nodes = np.array(num_nodes[:batch_size])
+
+    n_max = int(np.max(n_nodes))
+
+    # Build node mask
+    node_mask_np = np.zeros((batch_size, n_max), dtype=np.float32)
+    for i, n in enumerate(n_nodes):
+        node_mask_np[i, :n] = 1.0
+    node_mask = tlx.convert_to_tensor(node_mask_np.astype(bool))
+
+    # Sample initial noise from limit distribution
+    z = sample_discrete_feature_noise(limit_dist, node_mask)
+    X_t = z.X  # (bs, n_max, dx)
+    E_t = z.E  # (bs, n_max, n_max, de)
+
+    dx = len(limit_dist.X)
+    de = len(limit_dist.E)
+
+    debug_sampling = os.environ.get('DEFOG_DEBUG_SAMPLING', '0') == '1'
+    if debug_sampling:
+        node_mask_np = tlx.convert_to_numpy(node_mask).astype(bool)
+        upper_mask_np = np.triu(np.ones((n_max, n_max), dtype=bool), k=1)[None, :, :]
+        valid_edge_mask_np = (
+            node_mask_np[:, :, None] & node_mask_np[:, None, :] & upper_mask_np
+        )
+
+        def _debug_edge_probs(tag, tensor, step_idx):
+            arr = tlx.convert_to_numpy(tensor)
+            rows = arr[valid_edge_mask_np]
+            if rows.size == 0:
+                return
+            mean_probs = rows.mean(axis=0)
+            print(
+                f"edge_probs={np.array2string(mean_probs, precision=4, suppress_small=True)}",
+                flush=True,
+            )
+
+        def _debug_edge_labels(tag, tensor, step_idx):
+            arr = tlx.convert_to_numpy(tensor).astype(np.int64)
+            labels = arr[valid_edge_mask_np]
+            if labels.size == 0:
+                return
+            counts = np.bincount(labels, minlength=de)
+            print(
+                f"edge_counts={counts.tolist()}",
+                flush=True,
+            )
+
+    for step in range(sample_steps):
+        t_int = step
+        s_int = step + 1
+
+        t_norm = tlx.convert_to_tensor(
+            np.full((batch_size, 1), t_int / sample_steps, dtype=np.float32)
+        )
+        s_norm = tlx.convert_to_tensor(
+            np.full((batch_size, 1), s_int / sample_steps, dtype=np.float32)
+        )
+
+        # Avoid failure mode of absorbing transition at t=0
+        if noise_dist.transition in ('absorbing', 'absorbfirst') and t_int == 0:
+            t_norm = t_norm + 1e-6
+
+        t_dist = time_distorter.sample_ft(t_norm)
+        s_dist = time_distorter.sample_ft(s_norm)
+        dt = float(tlx.convert_to_numpy(s_dist[0, 0] - t_dist[0, 0]))
+
+        # Build noisy data dict
+        if conditional and cond_labels is not None:
+            y_t = cond_labels  # (batch_size, n_cond)
+        else:
+            y_t = tlx.zeros([batch_size, 0], dtype=tlx.float32)
+
+            # TLS half-half conditional sampling: half label=0, half label=1
+            if conditional and cond_labels is None:
+                half = batch_size // 2
+                y_t_np = np.zeros((batch_size, 1), dtype=np.float32)
+                y_t_np[half:, 0] = 1.0
+                y_t = tlx.convert_to_tensor(y_t_np)
+        noisy_data = {
+            't': t_dist,
+            'X_t': X_t,
+            'E_t': E_t,
+            'y_t': y_t,
+            'node_mask': node_mask,
+        }
+
+        # Compute extra features
+        extra_data = compute_extra_data(noisy_data, extra_features,
+                                         domain_features, noise_dist)
+
+        # Forward pass
+        X_in = tlx.concat([X_t, extra_data.X], axis=-1)
+        E_in = tlx.concat([E_t, extra_data.E], axis=-1)
+        y_in = tlx.concat([y_t, extra_data.y], axis=-1)
+
+        with torch.no_grad():
+            pred_X, pred_E, _ = model(X_in, E_in, y_in, node_mask)
+
+        # Softmax predictions
+        pred_X_soft = tlx.softmax(pred_X, axis=-1)
+        pred_E_soft = tlx.softmax(pred_E, axis=-1)
+
+        if debug_sampling and step < 2:
+            _debug_edge_probs('pred_E_soft', pred_E_soft, step)
+
+        is_last_step = (s_int == sample_steps)
+
+        if debug_sampling and step == sample_steps - 1:
+            _debug_edge_probs('pred_E_soft', pred_E_soft, step)
+
+        if is_last_step:
+            # Final step: sample directly from predictions
+            # Apply CFG at prediction level for the final step
+            if conditional and cond_labels is not None:
+                pred_X_soft_u, pred_E_soft_u = _cfg_unconditional_pred(
+                    model, X_in, E_in, extra_data, y_t, node_mask)
+                eps_cfg = 1e-6
+                w = guidance_weight
+                pred_X_soft = tlx.softmax(
+                    (1 - w) * tlx.log(pred_X_soft_u + eps_cfg)
+                    + w * tlx.log(pred_X_soft + eps_cfg), axis=-1)
+                pred_E_soft = tlx.softmax(
+                    (1 - w) * tlx.log(pred_E_soft_u + eps_cfg)
+                    + w * tlx.log(pred_E_soft + eps_cfg), axis=-1)
+
+            sampled = sample_discrete_features(pred_X_soft, pred_E_soft, node_mask)
+            X_t = F.one_hot(sampled.X, dx).float()
+            E_t = F.one_hot(sampled.E, de).float()
+        else:
+            # Compute conditional rate matrix
+            R_X, R_E = rate_matrix_designer.compute_graph_rate_matrix(
+                t_dist, node_mask, (X_t, E_t), (pred_X_soft, pred_E_soft)
+            )
+
+            # Classifier-free guidance: blend rate matrices in log-space
+            if conditional and cond_labels is not None:
+                pred_X_soft_u, pred_E_soft_u = _cfg_unconditional_pred(
+                    model, X_in, E_in, extra_data, y_t, node_mask)
+                R_X_u, R_E_u = rate_matrix_designer.compute_graph_rate_matrix(
+                    t_dist, node_mask, (X_t, E_t), (pred_X_soft_u, pred_E_soft_u)
+                )
+
+                # Log-space geometric interpolation of rate matrices
+                eps_cfg = 1e-6
+                w = guidance_weight
+                R_X = tlx.exp(
+                    (1 - w) * tlx.log(R_X_u + eps_cfg)
+                    + w * tlx.log(R_X + eps_cfg)
+                )
+                R_E = tlx.exp(
+                    (1 - w) * tlx.log(R_E_u + eps_cfg)
+                    + w * tlx.log(R_E + eps_cfg)
+                )
+
+            prob_X, prob_E = compute_step_probs(R_X, R_E, X_t, E_t, dt)
+
+            if debug_sampling and step < 2:
+                _debug_edge_probs('prob_E', prob_E, step)
+
+            # Match original DeFoG sampling path: sample directly from the
+            # CTMC one-step probabilities without extra post-processing.
+            sampled = sample_discrete_features(prob_X, prob_E, node_mask)
+            if debug_sampling and step < 2:
+                _debug_edge_labels('sampled_E', sampled.E, step)
+            X_t = F.one_hot(sampled.X, dx).float()
+            E_t = F.one_hot(sampled.E, de).float()
+
+        # Mask
+        X_t, E_t = apply_node_mask(X_t, E_t, node_mask)
+
+    # Remove virtual classes
+    result = noise_dist.ignore_virtual_classes(X_t, E_t)
+    X_final, E_final = result[0], result[1]
+
+    # Collapse to integer labels
+    X_int = tlx.argmax(X_final, axis=-1)
+    E_int = tlx.argmax(E_final, axis=-1)
+
+    # Split into individual graphs
+    graphs = []
+    for i in range(batch_size):
+        n = int(n_nodes[i])
+        xi = tlx.convert_to_numpy(X_int[i, :n])
+        ei = tlx.convert_to_numpy(E_int[i, :n, :n])
+        graphs.append((xi, ei))
+
+    return graphs
diff --git a/examples/defog/spectre_utils.py b/examples/defog/spectre_utils.py
new file mode 100644
index 000000000..000986bf6
--- /dev/null
+++ b/examples/defog/spectre_utils.py
@@ -0,0 +1,810 @@
+###############################################################################
+#
+# Adapted from https://github.com/lrjconan/GRAN/ which in turn is adapted from https://github.com/JiaxuanYou/graph-generation
+#
+###############################################################################
+import numpy as np
+import concurrent.futures
+from functools import partial
+
+try:
+    import pyemd
+    from scipy.linalg import toeplitz
+    SPECTRE_METRICS_AVAILABLE = True
+except ImportError:
+    SPECTRE_METRICS_AVAILABLE = False
+    import warnings
+    warnings.warn("pyemd or scipy not found, spectre evaluation metrics will fail.")
+
+
+
+
+
+def emd(x, y, sigma=1.0, distance_scaling=1.0):
+    """EMD
+    Args:
+        x, y: 1D pmf of two distributions with the same support
+        sigma: standard deviation
+    """
+    support_size = max(len(x), len(y))
+    d_mat = toeplitz(range(support_size)).astype(float)
+    distance_mat = d_mat / distance_scaling
+
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    return np.abs(pyemd.emd(x, y, distance_mat))
+
+
+def gaussian_emd(x, y, sigma=1.0, distance_scaling=1.0):
+    """Gaussian kernel with squared distance in exponential term replaced by EMD
+    Args:
+        x, y: 1D pmf of two distributions with the same support
+        sigma: standard deviation
+    """
+    support_size = max(len(x), len(y))
+    d_mat = toeplitz(range(support_size)).astype(float)
+    distance_mat = d_mat / distance_scaling
+
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    emd = pyemd.emd(x, y, distance_mat)
+    return np.exp(-emd * emd / (2 * sigma * sigma))
+
+
+def gaussian(x, y, sigma=1.0):
+    support_size = max(len(x), len(y))
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    dist = np.linalg.norm(x - y, 2)
+    return np.exp(-dist * dist / (2 * sigma * sigma))
+
+
+def gaussian_tv(x, y, sigma=1.0):
+    support_size = max(len(x), len(y))
+    # convert histogram values x and y to float, and make them equal len
+    x = x.astype(float)
+    y = y.astype(float)
+    if len(x) < len(y):
+        x = np.hstack((x, [0.0] * (support_size - len(x))))
+    elif len(y) < len(x):
+        y = np.hstack((y, [0.0] * (support_size - len(y))))
+
+    dist = np.abs(x - y).sum() / 2.0
+    return np.exp(-dist * dist / (2 * sigma * sigma))
+
+
+def kernel_parallel_unpacked(x, samples2, kernel):
+    d = 0
+    for s2 in samples2:
+        d += kernel(x, s2)
+    return d
+
+
+def kernel_parallel_worker(t):
+    return kernel_parallel_unpacked(*t)
+
+
+def disc(samples1, samples2, kernel, is_parallel=True, *args, **kwargs):
+    """Discrepancy between 2 samples"""
+    d = 0
+
+    if not is_parallel:
+        for s1 in samples1:
+            for s2 in samples2:
+                d += kernel(s1, s2, *args, **kwargs)
+    else:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for dist in executor.map(
+                kernel_parallel_worker,
+                [(s1, samples2, partial(kernel, *args, **kwargs)) for s1 in samples1],
+            ):
+                d += dist
+    if len(samples1) * len(samples2) > 0:
+        d /= len(samples1) * len(samples2)
+    else:
+        d = 1e6
+    return d
+
+
+def compute_mmd(samples1, samples2, kernel, is_hist=True, *args, **kwargs):
+    """MMD between two samples"""
+    # normalize histograms into pmf
+    if is_hist:
+        samples1 = [s1 / (np.sum(s1) + 1e-6) for s1 in samples1]
+        samples2 = [s2 / (np.sum(s2) + 1e-6) for s2 in samples2]
+    mmd = (
+        disc(samples1, samples1, kernel, *args, **kwargs)
+        + disc(samples2, samples2, kernel, *args, **kwargs)
+        - 2 * disc(samples1, samples2, kernel, *args, **kwargs)
+    )
+
+    mmd = np.abs(mmd)
+
+
+
+    return mmd
+
+
+
+
+
+"""
+Synthetic graph evaluation metrics (degree, spectral, clustering, orbit, etc.)
+Ported from DeFoG src/analysis/spectre_utils.py for GammaGL.
+
+Key changes from original:
+- Removed torch / torch_geometric dependency (pure numpy / networkx)
+- Removed wandb dependency (print to stdout)
+- Removed graph_tool dependency (SBM accuracy simplified)
+- Removed pygsp dependency (wavelet stats disabled by default)
+- Uses dist_helper.py from this directory for MMD computation
+"""
+import os
+import copy
+import signal
+import numpy as np
+import concurrent.futures
+from datetime import datetime
+
+try:
+    import networkx as nx
+    from scipy.linalg import eigvalsh
+except ImportError:
+    pass
+
+
+PRINT_TIME = False
+__all__ = [
+    "degree_stats",
+    "clustering_stats",
+    "orbit_stats_all",
+    "spectral_stats",
+    "eval_acc_planar_graph",
+    "eval_acc_tree_graph",
+]
+
+
+# ============================================================
+# Degree statistics
+# ============================================================
+
+def degree_worker(G):
+    return np.array(nx.degree_histogram(G))
+
+
+def degree_stats(graph_ref_list, graph_pred_list, is_parallel=True, compute_emd=False):
+    """MMD between degree distributions of two sets of graphs."""
+    sample_ref = []
+    sample_pred = []
+    graph_pred_list_remove_empty = [
+        G for G in graph_pred_list if not G.number_of_nodes() == 0
+    ]
+
+    prev = datetime.now()
+    if is_parallel:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for deg_hist in executor.map(degree_worker, graph_ref_list):
+                sample_ref.append(deg_hist)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for deg_hist in executor.map(degree_worker, graph_pred_list_remove_empty):
+                sample_pred.append(deg_hist)
+    else:
+        for G in graph_ref_list:
+            sample_ref.append(np.array(nx.degree_histogram(G)))
+        for G in graph_pred_list_remove_empty:
+            sample_pred.append(np.array(nx.degree_histogram(G)))
+
+    if compute_emd:
+        mmd_dist = compute_mmd(sample_ref, sample_pred, kernel=gaussian_emd)
+    else:
+        mmd_dist = compute_mmd(sample_ref, sample_pred, kernel=gaussian_tv)
+
+    elapsed = datetime.now() - prev
+    if PRINT_TIME:
+        print("Time computing degree mmd: ", elapsed)
+    return mmd_dist
+
+
+# ============================================================
+# Spectral statistics
+# ============================================================
+
+def spectral_worker(G, n_eigvals=-1):
+    try:
+        eigs = eigvalsh(nx.normalized_laplacian_matrix(G).todense())
+    except Exception:
+        eigs = np.zeros(G.number_of_nodes())
+    if n_eigvals > 0:
+        eigs = eigs[1 : n_eigvals + 1]
+    spectral_pmf, _ = np.histogram(eigs, bins=200, range=(-1e-5, 2), density=False)
+    spectral_pmf = spectral_pmf / spectral_pmf.sum()
+    return spectral_pmf
+
+
+def spectral_stats(graph_ref_list, graph_pred_list, is_parallel=True,
+                   n_eigvals=-1, compute_emd=False):
+    """MMD between spectral distributions of two sets of graphs."""
+    sample_ref = []
+    sample_pred = []
+    graph_pred_list_remove_empty = [
+        G for G in graph_pred_list if not G.number_of_nodes() == 0
+    ]
+
+    prev = datetime.now()
+    if is_parallel:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for spectral_density in executor.map(
+                spectral_worker, graph_ref_list, [n_eigvals] * len(graph_ref_list)
+            ):
+                sample_ref.append(spectral_density)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for spectral_density in executor.map(
+                spectral_worker, graph_pred_list_remove_empty,
+                [n_eigvals] * len(graph_pred_list_remove_empty),
+            ):
+                sample_pred.append(spectral_density)
+    else:
+        for G in graph_ref_list:
+            sample_ref.append(spectral_worker(G, n_eigvals))
+        for G in graph_pred_list_remove_empty:
+            sample_pred.append(spectral_worker(G, n_eigvals))
+
+    if compute_emd:
+        mmd_dist = compute_mmd(sample_ref, sample_pred, kernel=gaussian_emd)
+    else:
+        mmd_dist = compute_mmd(sample_ref, sample_pred, kernel=gaussian_tv)
+
+    elapsed = datetime.now() - prev
+    if PRINT_TIME:
+        print("Time computing spectral mmd: ", elapsed)
+    return mmd_dist
+
+
+# ============================================================
+# Clustering statistics
+# ============================================================
+
+def clustering_worker(param):
+    G, bins = param
+    clustering_coeffs_list = list(nx.clustering(G).values())
+    hist, _ = np.histogram(clustering_coeffs_list, bins=bins, range=(0.0, 1.0), density=False)
+    return hist
+
+
+def clustering_stats(graph_ref_list, graph_pred_list, bins=100,
+                     is_parallel=True, compute_emd=False):
+    """MMD between clustering coefficient distributions."""
+    sample_ref = []
+    sample_pred = []
+    graph_pred_list_remove_empty = [
+        G for G in graph_pred_list if not G.number_of_nodes() == 0
+    ]
+
+    prev = datetime.now()
+    if is_parallel:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for clustering_hist in executor.map(
+                clustering_worker, [(G, bins) for G in graph_ref_list]
+            ):
+                sample_ref.append(clustering_hist)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for clustering_hist in executor.map(
+                clustering_worker, [(G, bins) for G in graph_pred_list_remove_empty]
+            ):
+                sample_pred.append(clustering_hist)
+    else:
+        for G in graph_ref_list:
+            clustering_coeffs_list = list(nx.clustering(G).values())
+            hist, _ = np.histogram(clustering_coeffs_list, bins=bins, range=(0.0, 1.0), density=False)
+            sample_ref.append(hist)
+        for G in graph_pred_list_remove_empty:
+            clustering_coeffs_list = list(nx.clustering(G).values())
+            hist, _ = np.histogram(clustering_coeffs_list, bins=bins, range=(0.0, 1.0), density=False)
+            sample_pred.append(hist)
+
+    if compute_emd:
+        mmd_dist = compute_mmd(
+            sample_ref, sample_pred, kernel=gaussian_emd,
+            sigma=1.0 / 10, distance_scaling=bins,
+        )
+    else:
+        mmd_dist = compute_mmd(
+            sample_ref, sample_pred, kernel=gaussian_tv, sigma=1.0 / 10,
+        )
+
+    elapsed = datetime.now() - prev
+    if PRINT_TIME:
+        print("Time computing clustering mmd: ", elapsed)
+    return mmd_dist
+
+
+# ============================================================
+# Orbit / Motif statistics (requires compiled orca binary)
+# ============================================================
+
+motif_to_indices = {
+    "3path": [1, 2],
+    "4cycle": [8],
+}
+COUNT_START_STR = "orbit counts:"
+
+
+def edge_list_reindexed(G):
+    idx = 0
+    id2idx = dict()
+    for u in G.nodes():
+        id2idx[str(u)] = idx
+        idx += 1
+    edges = []
+    for u, v in G.edges():
+        edges.append((id2idx[str(u)], id2idx[str(v)]))
+    return edges
+
+
+def orca(graph):
+    """Run the orca binary to compute orbit counts for a graph."""
+    from secrets import choice
+    from string import ascii_uppercase, digits
+    import subprocess as sp
+
+    tmp_fname = 'orca/tmp_{}.txt'.format(''.join(choice(ascii_uppercase + digits) for _ in range(8)))
+    tmp_fname = os.path.join(os.path.dirname(os.path.realpath(__file__)), tmp_fname)
+
+    f = open(tmp_fname, "w")
+    f.write(str(graph.number_of_nodes()) + " " + str(graph.number_of_edges()) + "\n")
+    for u, v in edge_list_reindexed(graph):
+        f.write(str(u) + " " + str(v) + "\n")
+    f.close()
+
+    output = sp.check_output([
+        str(os.path.join(os.path.dirname(os.path.realpath(__file__)), "orca/orca")),
+        "node", "4", tmp_fname, "std",
+    ])
+    output = output.decode("utf8").strip()
+    idx = output.find(COUNT_START_STR) + len(COUNT_START_STR) + 2
+    output = output[idx:]
+    node_orbit_counts = np.array([
+        list(map(int, node_cnts.strip().split(" ")))
+        for node_cnts in output.strip("\n").split("\n")
+    ])
+
+    try:
+        os.remove(tmp_fname)
+    except OSError:
+        pass
+
+    return node_orbit_counts
+
+
+
+
+
+def orbit_stats_all(graph_ref_list, graph_pred_list, compute_emd=False):
+    total_counts_ref = []
+    total_counts_pred = []
+    graph_pred_list_remove_empty = [
+        G for G in graph_pred_list if not G.number_of_nodes() == 0
+    ]
+
+    for G in graph_ref_list:
+        orbit_counts = orca(G)
+        orbit_counts_graph = np.sum(orbit_counts, axis=0) / G.number_of_nodes()
+        total_counts_ref.append(orbit_counts_graph)
+
+    for G in graph_pred_list:
+        orbit_counts = orca(G)
+        orbit_counts_graph = np.sum(orbit_counts, axis=0) / G.number_of_nodes()
+        total_counts_pred.append(orbit_counts_graph)
+
+    total_counts_ref = np.array(total_counts_ref)
+    total_counts_pred = np.array(total_counts_pred)
+
+    if compute_emd:
+        mmd_dist = compute_mmd(
+            total_counts_ref, total_counts_pred, kernel=gaussian, is_hist=False, sigma=30.0,
+        )
+    else:
+        mmd_dist = compute_mmd(
+            total_counts_ref, total_counts_pred, kernel=gaussian_tv, is_hist=False, sigma=30.0,
+        )
+    return mmd_dist
+
+
+# ============================================================
+# Graph accuracy checks
+# ============================================================
+
+def eval_acc_planar_graph(G_list):
+    count = 0
+    for gg in G_list:
+        if nx.is_connected(gg) and nx.check_planarity(gg)[0]:
+            count += 1
+    return count / float(len(G_list))
+
+
+def eval_acc_tree_graph(G_list):
+    count = 0
+    for gg in G_list:
+        if nx.is_tree(gg):
+            count += 1
+    return count / float(len(G_list))
+
+
+
+
+
+# ============================================================
+# Uniqueness / isomorphism fractions
+# ============================================================
+
+def eval_fraction_unique(fake_graphs, precise=False):
+    count_non_unique = 0
+    fake_evaluated = []
+    for fake_g in fake_graphs:
+        unique = True
+        if not fake_g.number_of_nodes() == 0:
+            for fake_old in fake_evaluated:
+                if precise:
+                    if nx.faster_could_be_isomorphic(fake_g, fake_old):
+                        if nx.is_isomorphic(fake_g, fake_old):
+                            count_non_unique += 1
+                            unique = False
+                            break
+                else:
+                    if nx.faster_could_be_isomorphic(fake_g, fake_old):
+                        if nx.could_be_isomorphic(fake_g, fake_old):
+                            count_non_unique += 1
+                            unique = False
+                            break
+        if unique:
+            fake_evaluated.append(fake_g)
+    return (float(len(fake_graphs)) - count_non_unique) / float(len(fake_graphs))
+
+
+def eval_fraction_isomorphic(fake_graphs, train_graphs):
+    count = 0
+    for fake_g in fake_graphs:
+        for train_g in train_graphs:
+            if nx.faster_could_be_isomorphic(fake_g, train_g):
+                if nx.is_isomorphic(fake_g, train_g):
+                    count += 1
+                    break
+    return count / float(len(fake_graphs))
+
+
+def eval_fraction_unique_non_isomorphic_valid(fake_graphs, train_graphs, validity_func):
+    count_non_unique = 0
+    count_isomorphic = 0
+    count_valid = 0
+    fake_evaluated = []
+
+    for fake_g in fake_graphs:
+        unique = True
+        if fake_g.number_of_nodes() != 0:
+            for fake_old in fake_evaluated:
+                if nx.faster_could_be_isomorphic(fake_g, fake_old):
+                    if nx.is_isomorphic(fake_g, fake_old):
+                        count_non_unique += 1
+                        unique = False
+                        break
+        if unique:
+            fake_evaluated.append(fake_g)
+            non_isomorphic = True
+            for train_g in train_graphs:
+                if nx.faster_could_be_isomorphic(fake_g, train_g):
+                    if nx.is_isomorphic(fake_g, train_g):
+                        count_isomorphic += 1
+                        non_isomorphic = False
+                        break
+            if non_isomorphic and validity_func is not None and validity_func(fake_g):
+                count_valid += 1
+
+    total = float(len(fake_graphs))
+    frac_unique = (total - count_non_unique) / total
+    frac_unique_non_isomorphic = (total - count_non_unique - count_isomorphic) / total
+    frac_unique_non_isomorphic_valid = count_valid / total
+    return frac_unique, frac_unique_non_isomorphic, frac_unique_non_isomorphic_valid
+
+
+# ============================================================
+# Top-level evaluation function
+# ============================================================
+
+def evaluate_synthetic_graphs(generated_graphs, reference_graphs, train_graphs,
+                               dataset_name, compute_emd=False):
+    """Evaluate generated synthetic graphs against reference/test graphs.
+
+    Parameters
+    ----------
+    generated_graphs : list of (X_int, E_int) tuples
+        Generated graphs from the model.
+    reference_graphs : list of nx.Graph
+        Reference (test/val) graphs as networkx objects.
+    train_graphs : list of nx.Graph
+        Training graphs (for novelty computation).
+    dataset_name : str
+        One of 'planar', 'tree', 'sbm'.
+    compute_emd : bool
+        Whether to use EMD-based MMD (slower but more accurate).
+
+    Returns
+    -------
+    dict
+        Evaluation metrics.
+    """
+    # Convert generated (X_int, E_int) tuples to networkx graphs
+    networkx_graphs = []
+    for graph in generated_graphs:
+        node_types, edge_types = graph
+        A = (edge_types > 0).astype(int)
+        nx_graph = nx.from_numpy_array(A)
+        networkx_graphs.append(nx_graph)
+
+    print(f"\nEvaluating {len(networkx_graphs)} generated graphs "
+          f"against {len(reference_graphs)} reference graphs "
+          f"(dataset: {dataset_name})")
+
+    metrics = {}
+
+    # --- Degree MMD ---
+    print("  Computing degree stats...")
+    metrics['degree'] = degree_stats(
+        reference_graphs, networkx_graphs, is_parallel=True, compute_emd=compute_emd)
+    print(f"    Degree MMD: {metrics['degree']:.6f}")
+
+    # --- Spectral MMD ---
+    print("  Computing spectral stats...")
+    metrics['spectre'] = spectral_stats(
+        reference_graphs, networkx_graphs, is_parallel=True, compute_emd=compute_emd)
+    print(f"    Spectral MMD: {metrics['spectre']:.6f}")
+
+    # --- Clustering MMD ---
+    print("  Computing clustering stats...")
+    metrics['clustering'] = clustering_stats(
+        reference_graphs, networkx_graphs, bins=100, is_parallel=True, compute_emd=compute_emd)
+    print(f"    Clustering MMD: {metrics['clustering']:.6f}")
+
+    # --- Orbit MMD (requires orca binary) ---
+    try:
+        print("  Computing orbit stats...")
+        metrics['orbit'] = orbit_stats_all(
+            reference_graphs, networkx_graphs, compute_emd=compute_emd)
+        print(f"    Orbit MMD: {metrics['orbit']:.6f}")
+    except Exception as e:
+        print(f"    Orbit stats skipped (orca not available): {e}")
+
+    # --- Wavelet / Spectral Filter MMD ---
+    try:
+        print("  Computing wavelet stats...")
+        metrics['wavelet'] = spectral_filter_stats(
+            reference_graphs, networkx_graphs,
+            n_filters=12, is_parallel=True, compute_emd=compute_emd)
+        print(f"    Wavelet MMD: {metrics['wavelet']:.6f}")
+    except Exception as e:
+        print(f"    Wavelet stats skipped: {e}")
+
+    # --- Validity (planar / tree / SBM accuracy) ---
+    if dataset_name == 'planar':
+        print("  Computing planar accuracy...")
+        metrics['planar_acc'] = eval_acc_planar_graph(networkx_graphs)
+        print(f"    Planar accuracy: {metrics['planar_acc']:.4f}")
+        validity_func = lambda g: nx.is_connected(g) and nx.check_planarity(g)[0]
+    elif dataset_name == 'tree':
+        print("  Computing tree accuracy...")
+        metrics['tree_acc'] = eval_acc_tree_graph(networkx_graphs)
+        print(f"    Tree accuracy: {metrics['tree_acc']:.4f}")
+        validity_func = nx.is_tree
+    elif dataset_name == 'sbm':
+        print("  SBM accuracy using official DiGress/DeFoG graph_tool MDL with Wald Test...")
+        def is_sbm_valid_official(g, p_intra=0.3, p_inter=0.005, strict=True, refinement_steps=1000):
+            import json
+            import subprocess
+            import os
+            
+            # Extract edges from networkx graph to pass to subprocess
+            edges = list(g.edges())
+            data = {
+                'edges': edges,
+                'p_intra': p_intra,
+                'p_inter': p_inter,
+                'strict': strict,
+                'refinement_steps': refinement_steps
+            }
+            
+            script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "_sbm_isolated.py")
+            try:
+                # Run the isolated script
+                result = subprocess.run(
+                    ["python", script_path],
+                    input=json.dumps(data).encode('utf-8'),
+                    capture_output=True,
+                    check=True
+                )
+                output = result.stdout.decode('utf-8').strip()
+                return output == "True"
+            except subprocess.CalledProcessError as e:
+                # If there's a serious error in the script (like missing graph-tool)
+                print(f"SBM Evaluation error: {e.stderr.decode('utf-8')}", file=sys.stderr)
+                return False
+        
+        validity_func = is_sbm_valid_official
+    elif dataset_name == 'comm20':
+        print("  Comm20 accuracy skipped (no dedicated checker)")
+        validity_func = nx.is_connected
+    else:
+        validity_func = None
+
+    # --- Uniqueness / Novelty ---
+    if train_graphs is not None and len(train_graphs) > 0:
+        print("  Computing uniqueness and novelty...")
+        frac_unique = eval_fraction_unique(networkx_graphs)
+        frac_non_iso = 1.0 - eval_fraction_isomorphic(networkx_graphs, train_graphs)
+        metrics['frac_unique'] = frac_unique
+        metrics['frac_non_iso'] = frac_non_iso
+        print(f"    Unique: {frac_unique:.4f}, Non-isomorphic: {frac_non_iso:.4f}")
+
+        if validity_func is not None:
+            valid_count = sum(1 for g in networkx_graphs if validity_func(g))
+            metrics['valid'] = valid_count / len(networkx_graphs)
+            (
+                frac_unique,
+                frac_unique_non_iso,
+                frac_unic_non_iso_valid,
+            ) = eval_fraction_unique_non_isomorphic_valid(
+                networkx_graphs,
+                train_graphs,
+                validity_func,
+            )
+            metrics['frac_unique'] = frac_unique
+            metrics['frac_unique_non_iso'] = frac_unique_non_iso
+            metrics['frac_unic_non_iso_valid'] = frac_unic_non_iso_valid
+            print(
+                f"    Valid: {metrics['valid']:.4f}, "
+                f"Unique non-iso: {frac_unique_non_iso:.4f}, "
+                f"Unique non-iso valid: {frac_unic_non_iso_valid:.4f}"
+            )
+
+    return metrics
+
+
+# ============================================================
+# Wavelet / Spectral Filter statistics (pure numpy, no pygsp)
+# ============================================================
+
+def eigh_worker(G):
+    """Compute eigenvalues and eigenvectors of normalized Laplacian."""
+    try:
+        L = nx.normalized_laplacian_matrix(G).todense()
+        eigvals, eigvecs = np.linalg.eigh(np.asarray(L))
+    except Exception:
+        n = G.number_of_nodes()
+        eigvals = np.zeros(n)
+        eigvecs = np.zeros((n, n))
+    return eigvals, eigvecs
+
+
+def compute_list_eigh(graph_list, is_parallel=True):
+    """Compute eigendecomposition for a list of graphs."""
+    eigval_list = []
+    eigvec_list = []
+    if is_parallel:
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for eigvals, eigvecs in executor.map(eigh_worker, graph_list):
+                eigval_list.append(eigvals)
+                eigvec_list.append(eigvecs)
+    else:
+        for G in graph_list:
+            eigvals, eigvecs = eigh_worker(G)
+            eigval_list.append(eigvals)
+            eigvec_list.append(eigvecs)
+    return eigval_list, eigvec_list
+
+
+def _heat_kernel_wavelets(eigvals, eigvecs, scales):
+    """Compute heat kernel wavelet responses.
+
+    Parameters
+    ----------
+    eigvals : ndarray
+        Eigenvalues ``(n,)``.
+    eigvecs : ndarray
+        Eigenvectors ``(n, n)``.
+    scales : list of float
+        Wavelet scales.
+
+    Returns
+    -------
+    ndarray
+        Wavelet responses ``(n_scales, n, n)``.
+    """
+    n = len(eigvals)
+    results = []
+    for s in scales:
+        # Heat kernel: h(s, lambda) = exp(-s * lambda)
+        h = np.exp(-s * eigvals)  # (n,)
+        # Wavelet = U @ diag(h) @ U^T
+        W = eigvecs @ np.diag(h) @ eigvecs.T  # (n, n)
+        results.append(W)
+    return np.array(results)  # (n_scales, n, n)
+
+
+def _get_spectral_filter_worker_np(eigvals, eigvecs, n_filters=12, bound=1.4):
+    """Pure-numpy spectral filter response (replaces pygsp-based version).
+
+    Uses heat kernel wavelets at logarithmically spaced scales.
+    """
+    scales = np.logspace(-2, 1, n_filters)  # 0.01 to 10
+    wavelets = _heat_kernel_wavelets(eigvals, eigvecs, scales)  # (nf, n, n)
+
+    # Compute squared norm per node per filter
+    norm_filt = np.sum(wavelets ** 2, axis=2)  # (nf, n)
+
+    # Histogram per filter
+    hist = np.array([
+        np.histogram(norm_filt[i], range=[0, bound], bins=100)[0]
+        for i in range(n_filters)
+    ])
+    return hist.flatten()
+
+
+def spectral_filter_stats(graph_ref_list, graph_pred_list,
+                          n_filters=12, is_parallel=True, compute_emd=False):
+    """Compute MMD between spectral filter wavelet responses.
+
+    Uses heat kernel wavelets as a drop-in replacement for pygsp Abspline
+    filters (no pygsp dependency required).
+
+    Parameters
+    ----------
+    graph_ref_list, graph_pred_list : list of nx.Graph
+    n_filters : int
+        Number of wavelet scales.
+    is_parallel : bool
+    compute_emd : bool
+    """
+    print("  Computing eigendecompositions for wavelet stats...")
+    eigval_ref, eigvec_ref = compute_list_eigh(graph_ref_list, is_parallel)
+    eigval_pred, eigvec_pred = compute_list_eigh(
+        [G for G in graph_pred_list if G.number_of_nodes() > 0],
+        is_parallel
+    )
+
+    bound = 1.4
+    sample_ref = []
+    sample_pred = []
+
+    prev = datetime.now()
+    for i in range(len(eigval_ref)):
+        sample_ref.append(_get_spectral_filter_worker_np(
+            eigval_ref[i], eigvec_ref[i], n_filters, bound))
+    for i in range(len(eigval_pred)):
+        sample_pred.append(_get_spectral_filter_worker_np(
+            eigval_pred[i], eigvec_pred[i], n_filters, bound))
+
+    if compute_emd:
+        mmd_dist = compute_mmd(sample_ref, sample_pred, kernel=emd)
+    else:
+        mmd_dist = compute_mmd(sample_ref, sample_pred, kernel=gaussian_tv)
+
+    elapsed = datetime.now() - prev
+    if PRINT_TIME:
+        print("Time computing wavelet mmd: ", elapsed)
+    return mmd_dist
diff --git a/examples/defog/tls_metrics.py b/examples/defog/tls_metrics.py
new file mode 100644
index 000000000..416487d82
--- /dev/null
+++ b/examples/defog/tls_metrics.py
@@ -0,0 +1,100 @@
+import numpy as np
+import tensorlayerx as tlx
+import networkx as nx
+
+def compute_tls_metrics(generated, cond_labels, train_graphs):
+    from gammagl.datasets.tls_dataset import CellGraph, PHENOTYPE_DECODER
+    
+    train_hashes = set()
+    if train_graphs is not None:
+        for g in train_graphs:
+            x_np = g.x if isinstance(g.x, np.ndarray) else tlx.convert_to_numpy(g.x)
+            edge_np = g.edge_index if isinstance(g.edge_index, np.ndarray) else tlx.convert_to_numpy(g.edge_index)
+            nx_g = nx.Graph()
+            n = x_np.shape[0]
+            nx_g.add_nodes_from(range(n))
+            if x_np.shape[-1] == 9:
+                node_types = np.argmax(x_np, axis=-1)
+                for i in range(n):
+                    nx_g.nodes[i]['phenotype'] = PHENOTYPE_DECODER[node_types[i]]
+            if edge_np.shape[1] > 0:
+                src, dst = edge_np[0].astype(int), edge_np[1].astype(int)
+                for s, d in zip(src, dst):
+                    if s < d:
+                        nx_g.add_edge(s, d)
+            train_hashes.add(nx.weisfeiler_lehman_graph_hash(nx_g, node_attr='phenotype'))
+
+    valid_graphs = []
+    tls_correct = 0
+    total_cond = 0
+    generated_hashes = []
+    
+    for idx, (atom_types, edge_types) in enumerate(generated):
+        atom_types_np = np.asarray(atom_types, dtype=np.int64)
+        edge_types_np = np.asarray(edge_types, dtype=np.int64)
+        n = atom_types_np.shape[0]
+        nx_g = nx.Graph()
+        
+        for i in range(n):
+            node_type_idx = int(atom_types_np[i])
+            phenotype = PHENOTYPE_DECODER[node_type_idx] if node_type_idx < len(PHENOTYPE_DECODER) else 'Marker'
+            nx_g.add_node(i, phenotype=phenotype)
+            
+        if edge_types_np.size > 0:
+            upper = np.triu(edge_types_np, k=1)
+            src, dst = np.nonzero(upper)
+            for s, d in zip(src, dst):
+                if upper[s, d] > 0:
+                    nx_g.add_edge(int(s), int(d))
+        
+        is_valid = True
+        try:
+            if not nx.is_connected(nx_g):
+                is_valid = False
+            elif not nx.check_planarity(nx_g)[0]:
+                is_valid = False
+        except Exception:
+            is_valid = False
+            
+        if is_valid:
+            valid_graphs.append(nx_g)
+            generated_hashes.append(nx.weisfeiler_lehman_graph_hash(nx_g, node_attr='phenotype'))
+            
+            if cond_labels is not None and idx < len(cond_labels):
+                label_tensor = cond_labels[idx]
+                try:
+                    label = int(tlx.convert_to_numpy(label_tensor).item())
+                    cell_g = CellGraph(nx_g)
+                    if label == 1 and cell_g.has_high_TLS():
+                        tls_correct += 1
+                    elif label == 0 and cell_g.has_low_TLS():
+                        tls_correct += 1
+                    total_cond += 1
+                except Exception:
+                    pass
+
+    num_generated = len(generated)
+    num_valid = len(valid_graphs)
+    valid_ratio = num_valid / num_generated if num_generated > 0 else 0.0
+    
+    unique_hashes = set(generated_hashes)
+    num_unique = len(unique_hashes)
+    unique_ratio = num_unique / num_valid if num_valid > 0 else 0.0
+    
+    novel_hashes = unique_hashes - train_hashes
+    num_novel = len(novel_hashes)
+    novel_ratio = num_novel / num_unique if num_unique > 0 else 0.0
+    
+    v_u_n = valid_ratio * unique_ratio * novel_ratio
+    
+    metrics = {
+        'valid': valid_ratio,
+        'unique': unique_ratio,
+        'novel': novel_ratio,
+        'V.U.N.': v_u_n,
+    }
+    
+    if total_cond > 0:
+        metrics['tls_validity'] = tls_correct / total_cond
+        
+    return metrics
diff --git a/examples/defog/train_metrics.py b/examples/defog/train_metrics.py
new file mode 100644
index 000000000..dd3c5d64d
--- /dev/null
+++ b/examples/defog/train_metrics.py
@@ -0,0 +1,95 @@
+import tensorlayerx as tlx
+import numpy as np
+
+class LossMetric:
+    """Generic accumulator for losses with running mean."""
+    def __init__(self):
+        self.total_loss = 0.0
+        self.total_samples = 0
+
+    def update_precomputed(self, loss_val, n):
+        if n > 0:
+            self.total_loss += float(loss_val) * int(n)
+            self.total_samples += int(n)
+        return float(loss_val)
+
+    def compute(self):
+        if self.total_samples == 0: return 0.0
+        return self.total_loss / self.total_samples
+
+    def reset(self):
+        self.total_loss = 0.0
+        self.total_samples = 0
+
+class TrainLossDiscrete(tlx.nn.Module):
+    r"""Training loss for DeFoG: weighted sum of node, edge, and global losses."""
+    def __init__(self, lambda_train=None, kld=False, name=None):
+        super().__init__(name=name)
+        self.lambda_train = lambda_train if lambda_train else [5.0, 0.0]
+        self.kld = kld
+        self.node_loss = LossMetric()
+        self.edge_loss = LossMetric()
+        self.y_loss = LossMetric()
+
+    def forward(self, pred_X, pred_E, pred_y, true_X, true_E, true_y):
+        bs, n, dx = pred_X.shape[0], pred_X.shape[1], pred_X.shape[2]
+        de = pred_E.shape[-1]
+
+        pred_X_flat = tlx.reshape(pred_X, [-1, dx])
+        true_X_flat = tlx.reshape(true_X, [-1, dx])
+        pred_E_flat = tlx.reshape(pred_E, [-1, de])
+        true_E_flat = tlx.reshape(true_E, [-1, de])
+
+        x_mask = tlx.reduce_sum(tlx.abs(true_X_flat), axis=-1) > 0
+        e_mask = tlx.reduce_sum(tlx.abs(true_E_flat), axis=-1) > 0
+
+        loss_X = self._masked_loss(self.node_loss, pred_X_flat, true_X_flat, x_mask)
+        loss_E = self._masked_loss(self.edge_loss, pred_E_flat, true_E_flat, e_mask)
+
+        if pred_y is not None and pred_y.shape[-1] > 0 and true_y is not None and true_y.shape[-1] > 0:
+            true_y_labels = tlx.cast(tlx.argmax(true_y, axis=-1), tlx.int64)
+            per_sample_loss_y = tlx.losses.softmax_cross_entropy_with_logits(pred_y, true_y_labels)
+            
+            if len(per_sample_loss_y.shape) == 0:
+                loss_y = per_sample_loss_y
+            else:
+                loss_y = tlx.reduce_mean(per_sample_loss_y)
+            self.y_loss.update_precomputed(float(tlx.convert_to_numpy(loss_y)), bs)
+        else:
+            loss_y = tlx.convert_to_tensor(0.0)
+
+        return loss_X + self.lambda_train[0] * loss_E + self.lambda_train[1] * loss_y
+
+    def _masked_loss(self, metric, preds, targets, mask):
+        mask_float = tlx.cast(mask, tlx.float32)
+        n_valid = tlx.reduce_sum(mask_float) + 1e-8
+
+        if self.kld:
+            preds_max = tlx.reduce_max(preds, axis=-1, keepdims=True)
+            exp_preds = tlx.exp(preds - preds_max)
+            log_preds = (preds - preds_max) - tlx.log(tlx.reduce_sum(exp_preds, axis=-1, keepdims=True) + 1e-10)
+            targets_safe = targets + 1e-10
+            kl_per_sample = tlx.reduce_sum(targets * (tlx.log(targets_safe) - log_preds), axis=-1)
+            loss = tlx.reduce_sum(kl_per_sample * mask_float) / n_valid
+        else:
+            true_labels = tlx.cast(tlx.argmax(targets, axis=-1), tlx.int64)
+            per_sample_loss = tlx.losses.softmax_cross_entropy_with_logits(preds, true_labels)
+            
+            # Handling scalar vs vector reduction based on backend implementation
+            if len(per_sample_loss.shape) == 0:
+                loss = per_sample_loss
+            else:
+                loss = tlx.reduce_sum(per_sample_loss * mask_float) / n_valid
+        
+        metric_val = float(tlx.convert_to_numpy(loss))
+        metric_n = int(tlx.convert_to_numpy(tlx.reduce_sum(mask_float)))
+        metric.update_precomputed(metric_val, metric_n)
+        return loss
+
+    def log_epoch_metrics(self):
+        return {'x_CE': self.node_loss.compute(), 'E_CE': self.edge_loss.compute(), 'y_CE': self.y_loss.compute()}
+
+    def reset(self):
+        self.node_loss.reset()
+        self.edge_loss.reset()
+        self.y_loss.reset()
diff --git a/gammagl/datasets/__init__.py b/gammagl/datasets/__init__.py
index 1c3351876..1ed85ccf6 100644
--- a/gammagl/datasets/__init__.py
+++ b/gammagl/datasets/__init__.py
@@ -66,3 +66,4 @@
 ]
 
 classes = __all__
+
diff --git a/gammagl/layers/attention/__init__.py b/gammagl/layers/attention/__init__.py
index e83c140a6..316eae448 100644
--- a/gammagl/layers/attention/__init__.py
+++ b/gammagl/layers/attention/__init__.py
@@ -5,6 +5,7 @@
 from .heco_encoder import Sc_encoder
 from .heco_encoder import Mp_encoder
 from .sgformer_layer import TransConvLayer, GraphConvLayer
+from .defog_layer import XEyTransformerLayer, NodeEdgeBlock, Xtoy, Etoy
 __all__ = [
     'Sc_encoder',
     'Mp_encoder',
@@ -14,6 +15,10 @@
     'GraphormerLayer',
     'TransConvLayer',
     'GraphConvLayer',
+    'XEyTransformerLayer',
+    'NodeEdgeBlock',
+    'Xtoy',
+    'Etoy',
 ]
 
 classes = __all__
diff --git a/gammagl/layers/attention/defog_layer.py b/gammagl/layers/attention/defog_layer.py
new file mode 100644
index 000000000..9bc21ef58
--- /dev/null
+++ b/gammagl/layers/attention/defog_layer.py
@@ -0,0 +1,365 @@
+import math
+import tensorlayerx as tlx
+
+
+def masked_softmax(x, mask, dim=-1):
+    r"""Softmax with masking: sets masked positions to ``-inf`` before softmax.
+
+    Parameters
+    ----------
+    x : tensor
+        Input tensor.
+    mask : tensor
+        Boolean or float mask (1 = valid, 0 = masked).
+    dim : int
+        Dimension along which to compute softmax.
+
+    Returns
+    -------
+    tensor
+        Softmax output with masked positions zeroed.
+    """
+    mask_sum = float(tlx.convert_to_numpy(tlx.reduce_sum(tlx.cast(mask, tlx.float32))))
+
+    if mask_sum == 0:
+        return x
+
+    mask_float = tlx.cast(mask, x.dtype)
+    while len(mask_float.shape) < len(x.shape):
+        mask_float = tlx.expand_dims(mask_float, axis=-1)
+
+    neg_inf = tlx.zeros_like(x) - 1e9
+    x_masked = tlx.where(mask_float > 0.5, x, neg_inf)
+    return tlx.softmax(x_masked, axis=dim)
+
+
+class Xtoy(tlx.nn.Module):
+    r"""Aggregate node features to global features via statistics.
+
+    Computes ``[mean, min, max, std]`` over the node dimension, concatenates
+    them, and applies a linear projection.
+
+    Parameters
+    ----------
+    dx : int
+        Input node feature dimension.
+    dy : int
+        Output global feature dimension.
+    name : str, optional
+        Module name.
+    """
+    def __init__(self, dx, dy, name=None):
+        super().__init__(name=name)
+        self.lin = tlx.layers.Linear(in_features=4 * dx, out_features=dy, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+
+    def forward(self, X):
+        """
+        Parameters
+        ----------
+        X : tensor
+            Node features of shape ``(bs, n, dx)``.
+
+        Returns
+        -------
+        tensor
+            Global features of shape ``(bs, dy)``.
+        """
+        m = tlx.reduce_mean(X, axis=1)       # (bs, dx)
+        mi = tlx.reduce_min(X, axis=1)       # (bs, dx)
+        ma = tlx.reduce_max(X, axis=1)       # (bs, dx)
+
+        mean_val = tlx.reduce_mean(X, axis=1, keepdims=True)
+        diff_sq = (X - mean_val) ** 2
+        std = tlx.sqrt(tlx.reduce_mean(diff_sq, axis=1) + 1e-12)
+
+        z = tlx.concat([m, mi, ma, std], axis=-1)  # (bs, 4*dx)
+        return self.lin(z)
+
+
+class Etoy(tlx.nn.Module):
+    r"""Aggregate edge features to global features via statistics.
+
+    Parameters
+    ----------
+    d : int
+        Input edge feature dimension.
+    dy : int
+        Output global feature dimension.
+    name : str, optional
+        Module name.
+    """
+    def __init__(self, d, dy, name=None):
+        super().__init__(name=name)
+        self.lin = tlx.layers.Linear(in_features=4 * d, out_features=dy, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+
+    def forward(self, E):
+        """
+        Parameters
+        ----------
+        E : tensor
+            Edge features of shape ``(bs, n, n, de)``.
+
+        Returns
+        -------
+        tensor
+            Global features of shape ``(bs, dy)``.
+        """
+        m = tlx.reduce_mean(E, axis=(1, 2))    # (bs, de)
+
+        mi = tlx.reduce_min(tlx.reduce_min(E, axis=2), axis=1)
+        ma = tlx.reduce_max(tlx.reduce_max(E, axis=2), axis=1)
+
+        mean_val = tlx.reduce_mean(E, axis=(1, 2), keepdims=True)
+        diff_sq = (E - mean_val) ** 2
+        std = tlx.sqrt(tlx.reduce_mean(diff_sq, axis=(1, 2)) + 1e-12)
+
+        z = tlx.concat([m, mi, ma, std], axis=-1)  # (bs, 4*de)
+        return self.lin(z)
+
+
+class NodeEdgeBlock(tlx.nn.Module):
+    r"""Self-attention block with FiLM conditioning from edge and global features.
+
+    This block implements multi-head attention where edge features modulate
+    attention scores via FiLM (Feature-wise Linear Modulation), and global
+    features condition both node and edge updates.
+
+    Parameters
+    ----------
+    dx : int
+        Node feature dimension.
+    de : int
+        Edge feature dimension.
+    dy : int
+        Global feature dimension.
+    n_head : int
+        Number of attention heads.
+    name : str, optional
+        Module name.
+    """
+    def __init__(self, dx, de, dy, n_head, name=None):
+        super().__init__(name=name)
+        assert dx % n_head == 0, f"dx ({dx}) must be divisible by n_head ({n_head})"
+        self.dx = dx
+        self.de = de
+        self.dy = dy
+        self.n_head = n_head
+        self.df = dx // n_head
+
+        # QKV projections
+        self.q = tlx.layers.Linear(in_features=dx, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.k = tlx.layers.Linear(in_features=dx, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.v = tlx.layers.Linear(in_features=dx, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+
+        # FiLM: E -> attention (multiply and add)
+        self.e_add = tlx.layers.Linear(in_features=de, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.e_mul = tlx.layers.Linear(in_features=de, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+
+        # FiLM: y -> E
+        self.y_e_mul = tlx.layers.Linear(in_features=dy, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.y_e_add = tlx.layers.Linear(in_features=dy, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+
+        # FiLM: y -> X
+        self.y_x_mul = tlx.layers.Linear(in_features=dy, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.y_x_add = tlx.layers.Linear(in_features=dy, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+
+        # Global feature processing
+        self.y_y = tlx.layers.Linear(in_features=dy, out_features=dy, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.x_y = Xtoy(dx, dy)
+        self.e_y = Etoy(de, dy)
+
+        # Output projections
+        self.x_out = tlx.layers.Linear(in_features=dx, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.e_out = tlx.layers.Linear(in_features=dx, out_features=de, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.y_out = tlx.nn.Sequential(
+            tlx.layers.Linear(in_features=dy, out_features=dy, W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+            tlx.layers.Linear(in_features=dy, out_features=dy, W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+        )
+
+    def forward(self, X, E, y, node_mask):
+        """
+        Parameters
+        ----------
+        X : tensor
+            Node features ``(bs, n, dx)``.
+        E : tensor
+            Edge features ``(bs, n, n, de)``.
+        y : tensor
+            Global features ``(bs, dy)``.
+        node_mask : tensor
+            Boolean mask ``(bs, n)``.
+
+        Returns
+        -------
+        tuple
+            ``(newX, newE, new_y)`` with same shapes as inputs.
+        """
+        bs, n, _ = X.shape
+        x_mask = tlx.expand_dims(tlx.cast(node_mask, X.dtype), axis=-1)  # (bs, n, 1)
+        e_mask1 = tlx.expand_dims(x_mask, axis=2)  # (bs, n, 1, 1)
+        e_mask2 = tlx.expand_dims(x_mask, axis=1)  # (bs, 1, n, 1)
+
+        # Q, K, V
+        Q = self.q(X) * x_mask  # (bs, n, dx)
+        K = self.k(X) * x_mask
+        V = self.v(X) * x_mask
+
+        # Reshape for multi-head: (bs, n, n_head, df)
+        Q = tlx.reshape(Q, [bs, n, self.n_head, self.df])
+        K = tlx.reshape(K, [bs, n, self.n_head, self.df])
+
+        # Attention scores: (bs, n, 1, n_head, df) * (bs, 1, n, n_head, df)
+        Q_exp = tlx.expand_dims(Q, axis=2)  # (bs, n, 1, n_head, df)
+        K_exp = tlx.expand_dims(K, axis=1)  # (bs, 1, n, n_head, df)
+        Y = Q_exp * K_exp / math.sqrt(self.df)  # (bs, n, n, n_head, df)
+
+        # FiLM: Edge modulation of attention
+        E1 = self.e_mul(E) * (e_mask1 * e_mask2)  # (bs, n, n, dx)
+        E1 = tlx.reshape(E1, [bs, n, n, self.n_head, self.df])
+        E2 = self.e_add(E) * (e_mask1 * e_mask2)
+        E2 = tlx.reshape(E2, [bs, n, n, self.n_head, self.df])
+        Y = Y * (E1 + 1) + E2
+
+        # New edge features
+        newE = tlx.reshape(Y, [bs, n, n, self.dx])  # flatten heads
+
+        # FiLM: y -> E
+        ye1 = tlx.reshape(self.y_e_add(y), [bs, 1, 1, self.dx])
+        ye2 = tlx.reshape(self.y_e_mul(y), [bs, 1, 1, self.dx])
+        newE = ye1 + (ye2 + 1) * newE
+        newE = self.e_out(newE) * (e_mask1 * e_mask2)  # (bs, n, n, de)
+
+        # Softmax attention
+        softmax_mask = tlx.cast(
+            tlx.expand_dims(x_mask, axis=1),  # (bs, 1, n, 1)
+            tlx.float32
+        )
+        softmax_mask = tlx.tile(
+            tlx.reshape(softmax_mask, [bs, 1, n, 1]),
+            [1, n, 1, self.n_head]
+        )  # (bs, n, n, n_head)
+        attn = masked_softmax(Y, softmax_mask, dim=2)  # (bs, n, n, n_head, df)
+
+        # Value aggregation
+        V = tlx.reshape(V, [bs, n, self.n_head, self.df])
+        V_exp = tlx.expand_dims(V, axis=1)  # (bs, 1, n, n_head, df)
+        weighted_V = attn * V_exp  # (bs, n, n, n_head, df)
+        weighted_V = tlx.reduce_sum(weighted_V, axis=2)  # (bs, n, n_head, df)
+        weighted_V = tlx.reshape(weighted_V, [bs, n, self.dx])  # (bs, n, dx)
+
+        # FiLM: y -> X
+        yx1 = tlx.reshape(self.y_x_add(y), [bs, 1, self.dx])
+        yx2 = tlx.reshape(self.y_x_mul(y), [bs, 1, self.dx])
+        newX = yx1 + (yx2 + 1) * weighted_V
+        newX = self.x_out(newX) * x_mask  # (bs, n, dx)
+
+        # Global feature update
+        y_out = self.y_y(y)
+        x_y = self.x_y(X)
+        e_y = self.e_y(E)
+        new_y = y_out + x_y + e_y
+        new_y = self.y_out(new_y)
+
+        return newX, newE, new_y
+
+
+class XEyTransformerLayer(tlx.nn.Module):
+    r"""A single transformer layer that jointly updates node, edge, and global features.
+
+    Uses ``NodeEdgeBlock`` for self-attention followed by feed-forward networks
+    (FFN) for each feature type, with residual connections and layer normalization.
+
+    Parameters
+    ----------
+    dx : int
+        Node feature dimension.
+    de : int
+        Edge feature dimension.
+    dy : int
+        Global feature dimension.
+    n_head : int
+        Number of attention heads.
+    dim_ffX : int
+        FFN hidden dimension for nodes.
+    dim_ffE : int
+        FFN hidden dimension for edges.
+    dim_ffy : int
+        FFN hidden dimension for global features.
+    dropout : float
+        Dropout rate.
+    layer_norm_eps : float
+        Epsilon for layer normalization.
+    name : str, optional
+        Module name.
+    """
+    def __init__(self, dx, de, dy, n_head, dim_ffX=2048, dim_ffE=128,
+                 dim_ffy=2048, dropout=0.1, layer_norm_eps=1e-5, name=None):
+        super().__init__(name=name)
+
+        self.self_attn = NodeEdgeBlock(dx, de, dy, n_head)
+
+        # FFN for X
+        self.linX1 = tlx.layers.Linear(in_features=dx, out_features=dim_ffX, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.linX2 = tlx.layers.Linear(in_features=dim_ffX, out_features=dx, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.normX1 = tlx.layers.LayerNorm(normalized_shape=dx, epsilon=layer_norm_eps)
+        self.normX2 = tlx.layers.LayerNorm(normalized_shape=dx, epsilon=layer_norm_eps)
+        self.dropoutX1 = tlx.layers.Dropout(p=dropout)
+        self.dropoutX2 = tlx.layers.Dropout(p=dropout)
+        self.dropoutX3 = tlx.layers.Dropout(p=dropout)
+
+        # FFN for E
+        self.linE1 = tlx.layers.Linear(in_features=de, out_features=dim_ffE, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.linE2 = tlx.layers.Linear(in_features=dim_ffE, out_features=de, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.normE1 = tlx.layers.LayerNorm(normalized_shape=de, epsilon=layer_norm_eps)
+        self.normE2 = tlx.layers.LayerNorm(normalized_shape=de, epsilon=layer_norm_eps)
+        self.dropoutE1 = tlx.layers.Dropout(p=dropout)
+        self.dropoutE2 = tlx.layers.Dropout(p=dropout)
+        self.dropoutE3 = tlx.layers.Dropout(p=dropout)
+
+        # FFN for y
+        self.lin_y1 = tlx.layers.Linear(in_features=dy, out_features=dim_ffy, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.lin_y2 = tlx.layers.Linear(in_features=dim_ffy, out_features=dy, W_init=tlx.initializers.HeUniform(a=math.sqrt(5)))
+        self.norm_y1 = tlx.layers.LayerNorm(normalized_shape=dy, epsilon=layer_norm_eps)
+        self.norm_y2 = tlx.layers.LayerNorm(normalized_shape=dy, epsilon=layer_norm_eps)
+        self.dropout_y1 = tlx.layers.Dropout(p=dropout)
+        self.dropout_y2 = tlx.layers.Dropout(p=dropout)
+        self.dropout_y3 = tlx.layers.Dropout(p=dropout)
+
+    def forward(self, X, E, y, node_mask):
+        """
+        Parameters
+        ----------
+        X : tensor
+            Node features ``(bs, n, dx)``.
+        E : tensor
+            Edge features ``(bs, n, n, de)``.
+        y : tensor
+            Global features ``(bs, dy)``.
+        node_mask : tensor
+            Boolean mask ``(bs, n)``.
+
+        Returns
+        -------
+        tuple
+            ``(X, E, y)`` with same shapes as inputs.
+        """
+        # Self-attention
+        newX, newE, new_y = self.self_attn(X, E, y, node_mask)
+
+        # Residual + LayerNorm (block 1)
+        X = self.normX1(X + self.dropoutX1(newX))
+        E = self.normE1(E + self.dropoutE1(newE))
+        y = self.norm_y1(y + self.dropout_y1(new_y))
+
+        # FFN
+        ff_X = self.linX2(self.dropoutX2(tlx.relu(self.linX1(X))))
+        ff_E = self.linE2(self.dropoutE2(tlx.relu(self.linE1(E))))
+        ff_y = self.lin_y2(self.dropout_y2(tlx.relu(self.lin_y1(y))))
+
+        # Residual + LayerNorm (block 2)
+        X = self.normX2(X + self.dropoutX3(ff_X))
+        E = self.normE2(E + self.dropoutE3(ff_E))
+        y = self.norm_y2(y + self.dropout_y3(ff_y))
+
+        return X, E, y
diff --git a/gammagl/models/__init__.py b/gammagl/models/__init__.py
index 062ee67ea..0177873a8 100644
--- a/gammagl/models/__init__.py
+++ b/gammagl/models/__init__.py
@@ -67,6 +67,7 @@
 from .sgformer import SGFormerModel
 from .adagad import PreModel, ReModel
 from .nodeid import NodeIDGNN
+from .defog import DeFoGModel
 
 __all__ = [
     'HeCo',
@@ -143,6 +144,7 @@
     'PreModel',
     'ReModel'
     , 'NodeIDGNN'
+    , 'DeFoGModel'
 ]
 
 classes = __all__
diff --git a/gammagl/models/defog.py b/gammagl/models/defog.py
new file mode 100644
index 000000000..5bc8fa57c
--- /dev/null
+++ b/gammagl/models/defog.py
@@ -0,0 +1,206 @@
+import math
+import tensorlayerx as tlx
+from gammagl.layers.attention.defog_layer import XEyTransformerLayer
+
+
+def _timestep_embedding(timesteps, dim, max_period=10000):
+    r"""Sinusoidal timestep embedding (internal helper)."""
+    import numpy as np
+    half = dim // 2
+    freqs = np.exp(-math.log(max_period) * np.arange(0, half, dtype=np.float32) / half)
+    freqs = tlx.convert_to_tensor(freqs)
+
+    ts = tlx.cast(tlx.reshape(timesteps, [-1, 1]), tlx.float32)
+    freqs = tlx.reshape(freqs, [1, -1])
+    args = ts * freqs
+
+    cos_part = tlx.cos(args)
+    sin_part = tlx.sin(args)
+    embedding = tlx.concat([cos_part, sin_part], axis=-1)
+
+    if dim % 2 == 1:
+        zeros_pad = tlx.zeros([embedding.shape[0], 1], dtype=tlx.float32)
+        embedding = tlx.concat([embedding, zeros_pad], axis=-1)
+
+    return embedding
+
+
+class DeFoGModel(tlx.nn.Module):
+    r"""Graph Transformer denoiser network for DeFoG discrete flow matching.
+
+    From the `"DeFoG: Discrete Flow Matching for Graph Generation"
+    <https://arxiv.org/abs/2410.04263>`_ paper (ICML 2025).
+
+    The model takes a noisy graph representation ``(X, E, y)`` and predicts the
+    clean graph. It uses a stack of ``XEyTransformerLayer`` blocks with FiLM
+    conditioning between node, edge, and global features. A 64-dimensional
+    sinusoidal timestep embedding is concatenated to the global features before
+    the input MLP.
+
+    Parameters
+    ----------
+    n_layers : int
+        Number of ``XEyTransformerLayer`` blocks.
+    input_dims : dict
+        Input feature dimensions with keys ``'X'``, ``'E'``, ``'y'``.
+    hidden_mlp_dims : dict
+        Hidden MLP dimensions with keys ``'X'``, ``'E'``, ``'y'``.
+    hidden_dims : dict
+        Hidden transformer dimensions with keys ``'dx'``, ``'de'``, ``'dy'``,
+        ``'n_head'``, ``'dim_ffX'``, ``'dim_ffE'``, ``'dim_ffy'``.
+    output_dims : dict
+        Output dimensions with keys ``'X'``, ``'E'``, ``'y'``.
+    name : str, optional
+        Model name.
+    """
+    def __init__(self, n_layers, input_dims, hidden_mlp_dims, hidden_dims,
+                 output_dims, name=None):
+        super().__init__(name=name)
+        self.n_layers = n_layers
+        self.out_dim_X = output_dims['X']
+        self.out_dim_E = output_dims['E']
+        self.out_dim_y = output_dims['y']
+
+        # Input MLPs
+        self.mlp_in_X = tlx.nn.Sequential(
+            tlx.layers.Linear(in_features=input_dims['X'],
+                              out_features=hidden_mlp_dims['X'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+            tlx.layers.Linear(in_features=hidden_mlp_dims['X'],
+                              out_features=hidden_dims['dx'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+        )
+        self.mlp_in_E = tlx.nn.Sequential(
+            tlx.layers.Linear(in_features=input_dims['E'],
+                              out_features=hidden_mlp_dims['E'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+            tlx.layers.Linear(in_features=hidden_mlp_dims['E'],
+                              out_features=hidden_dims['de'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+        )
+        # +64 for timestep embedding dimension
+        self.mlp_in_y = tlx.nn.Sequential(
+            tlx.layers.Linear(in_features=input_dims['y'] + 64,
+                              out_features=hidden_mlp_dims['y'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+            tlx.layers.Linear(in_features=hidden_mlp_dims['y'],
+                              out_features=hidden_dims['dy'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+        )
+
+        # Transformer layers
+        self.tf_layers = tlx.nn.ModuleList()
+        for i in range(n_layers):
+            self.tf_layers.append(
+                XEyTransformerLayer(
+                    dx=hidden_dims['dx'],
+                    de=hidden_dims['de'],
+                    dy=hidden_dims['dy'],
+                    n_head=hidden_dims['n_head'],
+                    dim_ffX=hidden_dims['dim_ffX'],
+                    dim_ffE=hidden_dims['dim_ffE'],
+                    dim_ffy=hidden_dims.get('dim_ffy', 2048),
+                )
+            )
+
+        # Output MLPs
+        self.mlp_out_X = tlx.nn.Sequential(
+            tlx.layers.Linear(in_features=hidden_dims['dx'],
+                              out_features=hidden_mlp_dims['X'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+            tlx.layers.Linear(in_features=hidden_mlp_dims['X'],
+                              out_features=output_dims['X'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+        )
+        self.mlp_out_E = tlx.nn.Sequential(
+            tlx.layers.Linear(in_features=hidden_dims['de'],
+                              out_features=hidden_mlp_dims['E'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+            tlx.layers.Linear(in_features=hidden_mlp_dims['E'],
+                              out_features=output_dims['E'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+        )
+        self.mlp_out_y = tlx.nn.Sequential(
+            tlx.layers.Linear(in_features=hidden_dims['dy'],
+                              out_features=hidden_mlp_dims['y'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+            tlx.layers.ReLU(),
+            tlx.layers.Linear(in_features=hidden_mlp_dims['y'],
+                              out_features=output_dims['y'], W_init=tlx.initializers.HeUniform(a=math.sqrt(5))),
+        )
+
+    def forward(self, X, E, y, node_mask):
+        r"""Forward pass of the Graph Transformer.
+
+        Parameters
+        ----------
+        X : tensor
+            Node features ``(bs, n, input_dims['X'])``.
+        E : tensor
+            Edge features ``(bs, n, n, input_dims['E'])``.
+        y : tensor
+            Global features ``(bs, input_dims['y'])``. The last element
+            of each row is expected to be the timestep ``t``.
+        node_mask : tensor
+            Boolean mask ``(bs, n)``.
+
+        Returns
+        -------
+        tuple
+            ``(X, E, y)`` predicted clean graph logits.
+        """
+        bs = X.shape[0]
+        n = X.shape[1]
+
+        # Diagonal mask for edges (zero out self-loops)
+        eye_n = tlx.cast(tlx.eye(n), tlx.bool)
+        diag_mask = tlx.cast(
+            ~tlx.tile(tlx.reshape(eye_n, [1, n, n, 1]), [bs, 1, 1, 1]),
+            X.dtype
+        )
+
+        # Skip connections from input
+        X_to_out = X[..., :self.out_dim_X]
+        E_to_out = E[..., :self.out_dim_E]
+        y_to_out = y[..., :self.out_dim_y]
+
+        # Input MLP for E + symmetrize
+        new_E = self.mlp_in_E(E)
+        new_E = (new_E + tlx.transpose(new_E, perm=[0, 2, 1, 3])) / 2.0
+
+        # Timestep embedding: extract last element of y as timestep
+        t = y[:, -1:]  # (bs, 1)
+        time_emb = _timestep_embedding(t, 64)  # (bs, 64)
+        y_with_t = tlx.concat([y, time_emb], axis=-1)  # (bs, dy + 64)
+
+        # Apply input MLPs and mask
+        new_X = self.mlp_in_X(X)
+        new_y = self.mlp_in_y(y_with_t)
+
+        # Mask
+        x_mask = tlx.expand_dims(tlx.cast(node_mask, new_X.dtype), axis=-1)
+        e_mask = tlx.expand_dims(x_mask, axis=2) * tlx.expand_dims(x_mask, axis=1)
+        new_X = new_X * x_mask
+        new_E = new_E * e_mask
+
+        X, E, y = new_X, new_E, new_y
+
+        # Transformer layers
+        for layer in self.tf_layers:
+            X, E, y = layer(X, E, y, node_mask)
+
+        # Output MLPs
+        X = self.mlp_out_X(X)
+        E = self.mlp_out_E(E)
+        y = self.mlp_out_y(y)
+
+        # Skip connections
+        X = X + X_to_out
+        E = (E + E_to_out) * diag_mask
+        y = y + y_to_out
+
+        # Symmetrize E
+        E = (E + tlx.transpose(E, perm=[0, 2, 1, 3])) / 2.0
+
+        # Final masking
+        X = X * x_mask
+        E = E * e_mask
+
+        return X, E, y
diff --git a/tests/layers/attention/test_defog_layer.py b/tests/layers/attention/test_defog_layer.py
new file mode 100644
index 000000000..039ff6ab3
--- /dev/null
+++ b/tests/layers/attention/test_defog_layer.py
@@ -0,0 +1,48 @@
+import os
+os.environ['TL_BACKEND'] = 'torch'
+import tensorlayerx as tlx
+from gammagl.layers.attention.defog_layer import XEyTransformerLayer
+
+def test_defog_layer():
+    dx, de, dy = 16, 8, 4
+    n_head = 4
+    layer = XEyTransformerLayer(dx, de, dy, n_head)
+    
+    bs, n = 2, 5
+    X = tlx.ones((bs, n, dx))
+    E = tlx.ones((bs, n, n, de))
+    y = tlx.ones((bs, dy))
+    node_mask = tlx.ones((bs, n))
+    
+    out_X, out_E, out_y = layer(X, E, y, node_mask)
+    
+    assert out_X.shape == (bs, n, dx)
+    assert out_E.shape == (bs, n, n, de)
+    assert out_y.shape == (bs, dy)
+
+def test_defog_layer_edge_cases():
+    dx, de, dy = 16, 8, 4
+    n_head = 4
+    layer = XEyTransformerLayer(dx, de, dy, n_head)
+    
+    # 1. Batch size 1
+    bs, n = 1, 3
+    X = tlx.ones((bs, n, dx))
+    E = tlx.ones((bs, n, n, de))
+    y = tlx.ones((bs, dy))
+    node_mask = tlx.ones((bs, n))
+    out_X, out_E, out_y = layer(X, E, y, node_mask)
+    assert out_X.shape == (bs, n, dx)
+    
+    # 2. Empty graph (n=1)
+    bs, n = 2, 1
+    X = tlx.ones((bs, n, dx))
+    E = tlx.ones((bs, n, n, de))
+    y = tlx.ones((bs, dy))
+    node_mask = tlx.ones((bs, n))
+    out_X, out_E, out_y = layer(X, E, y, node_mask)
+    assert out_X.shape == (bs, n, dx)
+
+if __name__ == '__main__':
+    test_defog_layer()
+    test_defog_layer_edge_cases()
diff --git a/tests/models/test_defog.py b/tests/models/test_defog.py
new file mode 100644
index 000000000..95be7490a
--- /dev/null
+++ b/tests/models/test_defog.py
@@ -0,0 +1,80 @@
+import os
+os.environ['TL_BACKEND'] = 'torch'
+import tensorlayerx as tlx
+from gammagl.models.defog import DeFoGModel
+import torch
+
+def test_defog_model_shape_and_mask():
+    input_dims = {'X': 5, 'E': 4, 'y': 2}
+    hidden_mlp_dims = {'X': 16, 'E': 8, 'y': 16}
+    hidden_dims = {
+        'dx': 16, 'de': 8, 'dy': 16,
+        'n_head': 2, 'dim_ffX': 32, 'dim_ffE': 16, 'dim_ffy': 32
+    }
+    output_dims = {'X': 5, 'E': 4, 'y': 0}
+    
+    model = DeFoGModel(
+        n_layers=1,
+        input_dims=input_dims,
+        hidden_mlp_dims=hidden_mlp_dims,
+        hidden_dims=hidden_dims,
+        output_dims=output_dims
+    )
+    
+    bs, n = 2, 5
+    X = tlx.ones((bs, n, input_dims['X']))
+    E = tlx.ones((bs, n, n, input_dims['E']))
+    # Test symmetric input E
+    E = (E + tlx.transpose(E, perm=(0, 2, 1, 3))) / 2.0
+    
+    y = tlx.ones((bs, input_dims['y']))
+    
+    # Test node_mask (one node masked out in the second graph)
+    node_mask = tlx.ones((bs, n))
+    node_mask_tensor = tlx.convert_to_tensor(node_mask)
+    if isinstance(node_mask_tensor, torch.Tensor):
+        node_mask_tensor[1, 4] = 0.0
+    
+    out_X, out_E, out_y = model(X, E, y, node_mask_tensor)
+    
+    assert out_X.shape == (bs, n, output_dims['X'])
+    assert out_E.shape == (bs, n, n, output_dims['E'])
+    assert out_y.shape == (bs, 0)
+    
+    # Test symmetric output E
+    out_E_transpose = tlx.transpose(out_E, perm=(0, 2, 1, 3))
+    # We allow some precision error
+    diff = tlx.abs(out_E - out_E_transpose)
+    if isinstance(diff, torch.Tensor):
+        assert torch.max(diff).item() < 1e-4
+
+def test_defog_model_edge_cases():
+    input_dims = {'X': 5, 'E': 4, 'y': 2}
+    hidden_mlp_dims = {'X': 16, 'E': 8, 'y': 16}
+    hidden_dims = {'dx': 16, 'de': 8, 'dy': 16, 'n_head': 2, 'dim_ffX': 32, 'dim_ffE': 16, 'dim_ffy': 32}
+    output_dims = {'X': 5, 'E': 4, 'y': 2}
+    
+    model = DeFoGModel(1, input_dims, hidden_mlp_dims, hidden_dims, output_dims)
+    
+    # 1. Batch size 1
+    bs, n = 1, 3
+    X = tlx.ones((bs, n, input_dims['X']))
+    E = tlx.ones((bs, n, n, input_dims['E']))
+    y = tlx.ones((bs, input_dims['y']))
+    node_mask = tlx.ones((bs, n))
+    out_X, out_E, out_y = model(X, E, y, node_mask)
+    assert out_X.shape == (1, 3, 5)
+    
+    # 2. Empty graph (n=1, technically a single node graph, n=0 may crash torch layers)
+    bs, n = 2, 1
+    X = tlx.ones((bs, n, input_dims['X']))
+    E = tlx.ones((bs, n, n, input_dims['E']))
+    y = tlx.ones((bs, input_dims['y']))
+    node_mask = tlx.ones((bs, n))
+    out_X, out_E, out_y = model(X, E, y, node_mask)
+    assert out_X.shape == (2, 1, 5)
+
+if __name__ == '__main__':
+    test_defog_model_shape_and_mask()
+    test_defog_model_edge_cases()
+    print("DeFoGModel advanced tests passed!")
diff --git a/tests/models/test_defog_backend.py b/tests/models/test_defog_backend.py
new file mode 100644
index 000000000..d8194eeb2
--- /dev/null
+++ b/tests/models/test_defog_backend.py
@@ -0,0 +1,38 @@
+import os
+import subprocess
+import sys
+
+def test_defog_backend_imports():
+    """
+    Ensure that DeFoG core components can be parsed and imported 
+    without crashing under non-torch backends (e.g. tensorflow).
+    This proves there are no stray `import torch` or PyG hard dependencies
+    in the shared GammaGL namespace.
+    """
+    script = """
+import tensorlayerx as tlx
+from gammagl.models.defog import DeFoGModel
+from gammagl.layers.attention.defog_layer import XEyTransformerLayer
+print("Import successful on backend:", tlx.BACKEND)
+"""
+    
+    env = os.environ.copy()
+    # Try with tensorflow backend
+    env['TL_BACKEND'] = 'tensorflow'
+    
+    cmd = [sys.executable, "-c", script]
+    result = subprocess.run(cmd, env=env, capture_output=True, text=True)
+    
+    # If the user doesn't have tensorflow installed, it will fail with ModuleNotFoundError: No module named 'tensorflow'
+    # We should only assert success if tensorflow actually loads or just consider it passed if it didn't fail due to torch
+    if result.returncode != 0:
+        if "No module named 'tensorflow'" in result.stderr or "No module named 'tensorlayerx'" in result.stderr:
+            # Skip if TF/TLX is missing in the local environment
+            return
+        elif "tensorflow" in result.stderr and "dll" in result.stderr.lower():
+             return
+        assert False, f"Import failed on tensorflow backend: {result.stderr}"
+
+if __name__ == '__main__':
+    test_defog_backend_imports()
+    print("Backend import test passed!")
diff --git a/tests/models/test_defog_smoke.py b/tests/models/test_defog_smoke.py
new file mode 100644
index 000000000..a2883ce10
--- /dev/null
+++ b/tests/models/test_defog_smoke.py
@@ -0,0 +1,55 @@
+import os
+import subprocess
+import sys
+
+def test_defog_smoke():
+    """
+    Smoke test to verify that the DeFoG training loop runs without errors
+    on a tiny synthetic dataset. This ensures that the core components
+    (model, loss, flow matching, dataloader) work properly on CPU and do not
+    crash due to missing heavy dependencies (like RDKit or Graph-Tool).
+    """
+    # Assuming the test is run from the repository root
+    script_path = os.path.join("examples", "defog", "defog_trainer.py")
+    if not os.path.exists(script_path):
+        # Fallback if tests are run from a different working directory
+        repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+        script_path = os.path.join(repo_root, "examples", "defog", "defog_trainer.py")
+
+    assert os.path.exists(script_path), f"Cannot find trainer script at {script_path}"
+
+    cmd = [
+        sys.executable,
+        script_path,
+        "--dataset", "synthetic",
+        "--num_graphs", "4",
+        "--min_nodes", "4",
+        "--max_nodes", "8",
+        "--batch_size", "2",
+        "--n_epochs", "1",
+        "--n_layers", "2",
+        "--hidden_mlp_X", "16",
+        "--hidden_mlp_E", "8",
+        "--hidden_mlp_y", "16",
+        "--dx", "16",
+        "--de", "8",
+        "--dy", "16",
+        "--dim_ffX", "32",
+        "--dim_ffy", "32",
+        "--check_val_every_n_epochs", "1",
+        "--gpu", "-1",
+        "--data_root", "./_review_data",
+        "--save_dir", "./_review_outputs"
+    ]
+
+    env = os.environ.copy()
+    env['TL_BACKEND'] = 'torch'
+
+    print("Running DeFoG smoke test...")
+    result = subprocess.run(cmd, env=env, capture_output=True, text=True)
+    
+    if result.returncode != 0:
+        print("Stdout:\n", result.stdout)
+        print("Stderr:\n", result.stderr)
+        
+    assert result.returncode == 0, f"DeFoG smoke test failed with return code {result.returncode}"