From 75b8e955de9a2936dcc34a76881e021f93387c6c Mon Sep 17 00:00:00 2001
From: Jim Burtoft <jburtoft@amazon.com>
Date: Mon, 9 Mar 2026 10:28:31 -0400
Subject: [PATCH 1/2] Add Boltz-2 contrib model with NKI kernels for pairformer
 inference

Boltz-2 biomolecular structure prediction on Trainium 2 using custom NKI
kernels for the O(N^3) triangular attention and multiplicative update
operations. Uses torch_neuronx.trace() with weight replacement pattern
to compile 64 pairformer layers in 7.4 minutes.

Validated on trn2.3xlarge at N=256 (s_cos=0.999220, z_cos=0.943929)
and trn2.48xlarge at N=512 (s_cos=0.999460, z_cos=0.979214).
---
 contrib/models/Boltz-2/README.md              | 233 +++++++++
 contrib/models/Boltz-2/src/__init__.py        |   8 +
 contrib/models/Boltz-2/src/modeling_boltz2.py | 318 +++++++++++
 .../Boltz-2/src/nki_triangular_attention.py   | 492 ++++++++++++++++++
 .../models/Boltz-2/src/nki_triangular_mul.py  | 268 ++++++++++
 contrib/models/Boltz-2/test/__init__.py       |   0
 .../Boltz-2/test/integration/__init__.py      |   0
 .../Boltz-2/test/integration/test_model.py    | 291 +++++++++++
 contrib/models/Boltz-2/test/unit/__init__.py  |   0
 9 files changed, 1610 insertions(+)
 create mode 100644 contrib/models/Boltz-2/README.md
 create mode 100644 contrib/models/Boltz-2/src/__init__.py
 create mode 100644 contrib/models/Boltz-2/src/modeling_boltz2.py
 create mode 100644 contrib/models/Boltz-2/src/nki_triangular_attention.py
 create mode 100644 contrib/models/Boltz-2/src/nki_triangular_mul.py
 create mode 100644 contrib/models/Boltz-2/test/__init__.py
 create mode 100644 contrib/models/Boltz-2/test/integration/__init__.py
 create mode 100644 contrib/models/Boltz-2/test/integration/test_model.py
 create mode 100644 contrib/models/Boltz-2/test/unit/__init__.py

diff --git a/contrib/models/Boltz-2/README.md b/contrib/models/Boltz-2/README.md
new file mode 100644
index 00000000..51e4f4ed
--- /dev/null
+++ b/contrib/models/Boltz-2/README.md
@@ -0,0 +1,233 @@
+# Contrib Model: Boltz-2
+
+Biomolecular structure prediction on AWS Trainium 2 using NKI custom kernels for the pairformer trunk.
+
+## Model Information
+
+- **HuggingFace ID:** N/A (PyPI: `boltz==2.2.1`, GitHub: [jwohlwend/boltz](https://github.com/jwohlwend/boltz))
+- **Model Type:** Biomolecular structure prediction (pairformer + diffusion)
+- **Parameters:** 507M (BF16)
+- **Architecture:** 64-layer pairformer trunk with triangular attention, triangular multiplicative updates, and pair bias attention; diffusion score model for coordinate generation
+- **License:** MIT
+
+## Validation Results
+
+**Validated:** 2026-02-27
+**Instance:** trn2.3xlarge (LNC=2, 4 logical NeuronCores, 96 GB HBM)
+**SDK:** Neuron SDK 2.28, PyTorch 2.9
+
+### Pairformer Accuracy (Weight Replacement + NKI Kernels)
+
+| N | Layers | s_cos | z_cos | Status |
+|---|--------|-------|-------|--------|
+| 128 | 1 | 0.999796 | 0.998359 | PASS |
+| 128 | 2 | 0.999667 | 0.997491 | PASS |
+| 128 | 4 | 0.999757 | 0.996898 | PASS |
+| 128 | 8 | 0.999713 | 0.995417 | PASS |
+| 256 | 1 | 0.999847 | 0.999971 | PASS |
+| 256 | 64 | 0.999220 | 0.943929 | PASS |
+| 512 | 64 | 0.999460 | 0.979214 | PASS |
+
+### Standalone NKI Kernel Accuracy
+
+| Kernel | N | Cosine Similarity | Status |
+|--------|---|-------------------|--------|
+| Triangular Attention | 128 | 0.999713 | PASS |
+| Triangular Attention | 256 | 1.000029 | PASS |
+| Triangular Mul (Outgoing) | 128 | 0.999967 | PASS |
+| Triangular Mul (Outgoing) | 256 | 0.999903 | PASS |
+| Triangular Mul (Incoming) | 128 | 0.999967 | PASS |
+
+### Benchmark Results
+
+**Pairformer compilation (trn2.3xlarge):**
+
+| N | Compile Layer 0 | Weight Swaps (63 layers) | Total Setup |
+|---|-----------------|-------------------------|-------------|
+| 256 | 423s (7.1 min) | 21s (0.3s each) | 443s (7.4 min) |
+
+**Pairformer inference (trn2.3xlarge, warm, 64 layers):**
+
+| N | Total Latency | Per Layer |
+|---|---------------|-----------|
+| 256 | 11.082s | 173ms |
+| 512 | 105.920s | 1655ms |
+
+**Full pipeline (N=256, insulin B chain, 30 tokens, 20 diffusion steps):**
+
+| Phase | Time |
+|-------|------|
+| Pairformer compilation | 7.6 min |
+| Diffusion compilation | 2.4 min |
+| **Total compilation** | **10.1 min** |
+| Trunk inference (embed + MSA + PF) | 22.5s |
+| Diffusion (20 steps) | 1.2s |
+| Confidence | 0.8s |
+| **Total inference** | **24.5s** |
+
+## Usage
+
+### Prerequisites
+
+```bash
+# Activate Neuron venv on trn2.3xlarge DLAMI (Ubuntu 24.04, SDK 2.28)
+source /opt/aws_neuronx_venv_pytorch_2_9/bin/activate
+
+# Install Boltz-2
+pip install boltz==2.2.1
+
+# Download checkpoint (auto-downloads on first boltz predict run)
+boltz predict --help
+```
+
+### Pairformer-Only Inference
+
+```python
+import os
+import sys
+os.environ["NEURON_PLATFORM_TARGET_OVERRIDE"] = "trn2"
+
+# Add src/ to path
+sys.path.insert(0, "contrib/models/Boltz-2/src")
+
+from modeling_boltz2 import (
+    patch_boltz2_with_nki_kernels,
+    compile_pairformer_weight_replaced,
+    run_pairformer_layers,
+)
+
+# Step 1: Patch BEFORE importing Boltz-2 model
+patch_boltz2_with_nki_kernels()
+
+# Step 2: Load model
+from dataclasses import asdict
+from boltz.main import (
+    Boltz2, Boltz2DiffusionParams, BoltzSteeringParams,
+    MSAModuleArgs, PairformerArgsV2,
+)
+
+model = Boltz2.load_from_checkpoint(
+    "~/.boltz/boltz2_conf.ckpt",
+    strict=True,
+    predict_args={"recycling_steps": 1, "sampling_steps": 20,
+                  "diffusion_samples": 1, "max_parallel_samples": 1,
+                  "write_confidence_summary": False,
+                  "write_full_pae": False, "write_full_pde": False},
+    map_location="cpu",
+    diffusion_process_args=asdict(Boltz2DiffusionParams()),
+    ema=False, use_kernels=False,
+    pairformer_args=asdict(PairformerArgsV2()),
+    msa_args=asdict(MSAModuleArgs(use_paired_feature=True)),
+    steering_args=asdict(BoltzSteeringParams()),
+)
+model.eval()
+
+# Step 3: Compile pairformer (7.4 min at N=256)
+traced_layers, compile_time, swap_time = compile_pairformer_weight_replaced(
+    model, N=256, target="trn2"
+)
+
+# Step 4: Run inference
+import torch
+s = torch.randn(1, 256, 384, dtype=torch.bfloat16) * 0.1
+z = torch.randn(1, 256, 256, 128, dtype=torch.bfloat16) * 0.1
+mask = torch.ones(1, 256, dtype=torch.float32)
+pair_mask = torch.ones(1, 256, 256, dtype=torch.float32)
+
+s_out, z_out, latency = run_pairformer_layers(traced_layers, s, z, mask, pair_mask)
+print(f"Inference: {latency:.1f}s ({latency/64*1000:.0f}ms/layer)")
+```
+
+## Compatibility Matrix
+
+| Instance | SDK 2.28 | SDK 2.27 |
+|----------|----------|----------|
+| trn2.3xlarge (N=256) | VALIDATED | VALIDATED |
+| trn2.48xlarge (N=512) | VALIDATED | Not tested |
+| inf2.8xlarge (N=128, <=8 layers) | Not tested | VALIDATED |
+
+## Example Checkpoints
+
+* [boltz==2.2.1](https://pypi.org/project/boltz/2.2.1/) (PyPI) - auto-downloads to `~/.boltz/boltz2_conf.ckpt`
+
+## Testing Instructions
+
+```bash
+# On trn2.3xlarge with Neuron SDK 2.28
+source /opt/aws_neuronx_venv_pytorch_2_9/bin/activate
+pip install boltz==2.2.1 pytest
+
+export NEURON_PLATFORM_TARGET_OVERRIDE=trn2
+export NEURON_RT_VISIBLE_CORES=0
+
+# Run tests (compiles 2 layers at N=128, ~2 min)
+cd contrib/models/Boltz-2
+PYTHONPATH=src pytest test/integration/test_model.py -v -s
+```
+
+## Architecture Details
+
+### Approach
+
+This contribution uses a different approach from typical NxDI contrib models:
+
+1. **torch_neuronx.trace()** (not NxDI model classes) for compilation
+2. **NKI custom kernels** for the four O(N^3) triangular operations
+3. **Weight replacement** pattern: compile one layer, clone 63 times with `replace_weights()`
+4. **Monkey-patching** to inject NKI kernels into the upstream Boltz-2 codebase
+
+### NKI Kernels
+
+Two NKI kernels replace the computationally expensive operations:
+
+| Kernel | Operation | Complexity | Description |
+|--------|-----------|-----------|-------------|
+| `nki_triangular_attention.py` | Triangular Attention | O(N^3) | Multi-head attention per row with full 2D triangle bias, online softmax |
+| `nki_triangular_mul.py` | Triangular Multiplicative Update | O(N^3) | Einsum contraction: `result[i,j,d] = sum_k a[i,k,d] * b[j,k,d]` |
+
+Each kernel is called twice per pairformer layer (starting node + ending node variants), totaling 4 NKI kernel calls per layer.
+
+### Compilation
+
+| Parameter | Value |
+|-----------|-------|
+| `inline_weights_to_neff` | `False` (enables weight replacement) |
+| `compiler_args` | `["--target", "trn2"]` |
+| Compile time (N=256) | 423s for layer 0 |
+| Weight swap time | 0.3s per layer (63 layers) |
+| Total setup | 443s (7.4 min) |
+
+### Model Constants
+
+| Parameter | Value |
+|-----------|-------|
+| C_s (single repr) | 384 |
+| C_z (pair repr) | 128 |
+| H (attention heads) | 4 |
+| d (head dim) | 32 |
+| Pairformer layers | 64 |
+| Total parameters | 507M |
+
+### Important Notes
+
+- **Do NOT use `--auto-cast matmult`** for this model. It destroys accuracy for Boltz-2's triangular operations.
+- `NEURON_PLATFORM_TARGET_OVERRIDE=trn2` must be set before any Neuron imports.
+- `NEURON_RT_VISIBLE_CORES=0` restricts to a single NeuronCore (sufficient for single-structure inference).
+- N must be a multiple of 128 (P_MAX tiling constraint of NKI kernels).
+
+## Known Issues
+
+1. **N=512 requires trn2.48xlarge**: The 64-layer pairformer at N=512 exceeds trn2.3xlarge memory during compilation. Use trn2.48xlarge instead.
+2. **Per-layer host-device round trips**: Each of the 64 pairformer layers is a separate traced model, incurring host-device transfer overhead per layer (~173ms/layer at N=256, of which a significant fraction is sync overhead rather than compute).
+3. **DMA descriptor limit on inf2**: N=128 with more than 8 layers hits the inf2 DMA descriptor limit. Use trn2 for full-model inference.
+4. **Cold start**: First inference after compilation is slower due to device initialization.
+
+## Source Files
+
+| File | Description |
+|------|-------------|
+| `src/modeling_boltz2.py` | Main module: monkey-patching, compilation, inference |
+| `src/nki_triangular_attention.py` | NKI kernel: triangular attention with online softmax |
+| `src/nki_triangular_mul.py` | NKI kernel: triangular multiplicative update (einsum) |
+| `src/__init__.py` | Package exports |
+| `test/integration/test_model.py` | Accuracy + latency tests |
diff --git a/contrib/models/Boltz-2/src/__init__.py b/contrib/models/Boltz-2/src/__init__.py
new file mode 100644
index 00000000..2535a09e
--- /dev/null
+++ b/contrib/models/Boltz-2/src/__init__.py
@@ -0,0 +1,8 @@
+from .modeling_boltz2 import (
+    compile_pairformer_weight_replaced,
+    patch_boltz2_with_nki_kernels,
+    run_pairformer_layers,
+    SinglePairformerLayerWrapper,
+)
+from .nki_triangular_attention import triangular_attention_fwd
+from .nki_triangular_mul import triangular_mul_fwd
diff --git a/contrib/models/Boltz-2/src/modeling_boltz2.py b/contrib/models/Boltz-2/src/modeling_boltz2.py
new file mode 100644
index 00000000..eda13c94
--- /dev/null
+++ b/contrib/models/Boltz-2/src/modeling_boltz2.py
@@ -0,0 +1,318 @@
+"""Boltz-2 pairformer inference on AWS Trainium 2.
+
+This module provides Neuron-accelerated inference for the Boltz-2 biomolecular
+structure prediction model. It compiles the pairformer trunk (64 transformer
+layers) using torch_neuronx.trace() with NKI custom kernels for the O(N^3)
+triangular attention and triangular multiplicative update operations.
+
+Key techniques:
+  - Weight replacement: compile ONE layer with inline_weights_to_neff=False,
+    then create 64 copies with replace_weights() for each layer's unique weights.
+    This reduces setup from hours (chunked compilation) to ~7 minutes.
+  - NKI custom kernels: the four O(N^3) operations (TriAttnStart, TriAttnEnd,
+    TriMulOut, TriMulIn) are implemented as NKI kernels that run directly on the
+    NeuronCore, bypassing the standard XLA compiler for these operations.
+  - Monkey-patching: NKI kernels are injected into the Boltz-2 codebase at runtime
+    by replacing the kernel_triangular_attn and kernel_triangular_mult functions
+    before tracing.
+
+Usage:
+    source /opt/aws_neuronx_venv_pytorch_2_9/bin/activate
+    pip install boltz==2.2.1
+
+    from modeling_boltz2 import (
+        patch_boltz2_with_nki_kernels,
+        compile_pairformer_weight_replaced,
+        run_pairformer_layers,
+    )
+
+    # Step 1: Patch Boltz-2 with NKI kernels (BEFORE importing model)
+    patch_boltz2_with_nki_kernels()
+
+    # Step 2: Load model
+    model = Boltz2.load_from_checkpoint(checkpoint_path, ...)
+
+    # Step 3: Compile pairformer
+    traced_layers, compile_time, swap_time = compile_pairformer_weight_replaced(
+        model, N=256, target="trn2"
+    )
+
+    # Step 4: Run inference
+    s_out, z_out, latency = run_pairformer_layers(
+        traced_layers, s, z, mask, pair_mask
+    )
+
+Instance: trn2.3xlarge (128 GB RAM, 96 GB HBM)
+"""
+
+import copy
+import os
+import sys
+import time
+
+import torch
+import torch.nn.functional as F
+import torch_neuronx
+
+# Ensure src/ is on the path for NKI kernel imports
+_SRC_DIR = os.path.dirname(os.path.abspath(__file__))
+if _SRC_DIR not in sys.path:
+    sys.path.insert(0, _SRC_DIR)
+
+
+# ========================================================================
+# NKI kernel integration wrappers
+# ========================================================================
+
+
+def _nki_kernel_triangular_attn(q, k, v, tri_bias, mask, scale):
+    """Drop-in replacement for kernel_triangular_attn in Boltz-2 primitives.
+
+    Reshapes Boltz-2's multi-head tensors to the NKI kernel's head-interleaved
+    format, calls the NKI kernel, and reshapes the output back.
+
+    Args:
+        q: [B, I, H, J, d] - unscaled query
+        k: [B, I, H, K, d] - key
+        v: [B, I, H, K, d] - value
+        tri_bias: [B, 1, H, I, J] - triangle bias
+        mask: [B, I, 1, 1, J] - padding mask (ignored, all-ones at inference)
+        scale: float - 1/sqrt(d)
+
+    Returns:
+        [B, I, H, Q, d] - attention output in Boltz-2's expected format.
+    """
+    from nki_triangular_attention import triangular_attention_fwd
+
+    B, I, H, J, d = q.shape
+    N = I
+    Hd = H * d
+
+    q_nki = q[0].permute(0, 2, 1, 3).contiguous().reshape(N, N, Hd)
+    k_nki = k[0].permute(0, 2, 1, 3).contiguous().reshape(N, N, Hd)
+    v_nki = v[0].permute(0, 2, 1, 3).contiguous().reshape(N, N, Hd)
+    bias_nki = tri_bias[0, 0].permute(1, 2, 0).contiguous()
+
+    out_nki = triangular_attention_fwd(q_nki, k_nki, v_nki, bias_nki, scale)
+
+    out = out_nki.reshape(N, N, H, d).permute(0, 2, 1, 3).unsqueeze(0)
+    return out
+
+
+def _nki_kernel_triangular_mult(
+    x,
+    direction,
+    mask,
+    norm_in_weight,
+    norm_in_bias,
+    p_in_weight,
+    g_in_weight,
+    norm_out_weight,
+    norm_out_bias,
+    p_out_weight,
+    g_out_weight,
+    eps,
+):
+    """Drop-in replacement for kernel_triangular_mult in Boltz-2.
+
+    Performs LayerNorm, projections, and gating in PyTorch, then calls the NKI
+    kernel for the O(N^3) einsum contraction, then does output norm + gating.
+
+    Args:
+        x: [B, N, N, D] - pair representation
+        direction: "outgoing" or "incoming"
+        mask: [B, N, N] - padding mask
+        norm_in_weight, norm_in_bias: LayerNorm parameters
+        p_in_weight: [2*D, D] - input projection
+        g_in_weight: [2*D, D] - input gate
+        norm_out_weight, norm_out_bias: output LayerNorm parameters
+        p_out_weight: [D, D] - output projection
+        g_out_weight: [D, D] - output gate
+        eps: float - LayerNorm epsilon
+
+    Returns:
+        [B, N, N, D] - triangular multiplicative update output
+    """
+    from nki_triangular_mul import triangular_mul_fwd
+
+    B, N, _, D = x.shape
+
+    x_normed = F.layer_norm(x, [D], norm_in_weight, norm_in_bias, eps)
+    x_in = x_normed
+
+    projected = F.linear(x_normed, p_in_weight)
+    gated = F.linear(x_normed, g_in_weight).sigmoid()
+    x_gated = projected * gated
+    x_gated = x_gated * mask.unsqueeze(-1)
+
+    a, b = torch.chunk(x_gated, 2, dim=-1)
+
+    a_nki = a[0]
+    b_nki = b[0]
+
+    if direction == "incoming":
+        a_nki = a_nki.permute(1, 0, 2).contiguous()
+        b_nki = b_nki.permute(1, 0, 2).contiguous()
+
+    result_nki = triangular_mul_fwd(a_nki, b_nki)
+    result = result_nki.unsqueeze(0)
+
+    result = F.layer_norm(result, [D], norm_out_weight, norm_out_bias, eps)
+    output = F.linear(result, p_out_weight) * F.linear(x_in, g_out_weight).sigmoid()
+
+    return output
+
+
+# ========================================================================
+# Monkey-patching
+# ========================================================================
+
+
+def patch_boltz2_with_nki_kernels():
+    """Monkey-patch Boltz-2 to use NKI kernels for triangular operations.
+
+    MUST be called BEFORE importing Boltz-2 model classes, or at minimum
+    before tracing any modules that use these operations.
+    """
+    from boltz.model.layers.triangular_attention import primitives
+    import boltz.model.layers.triangular_mult as tri_mul_module
+
+    primitives.kernel_triangular_attn = _nki_kernel_triangular_attn
+    tri_mul_module.kernel_triangular_mult = _nki_kernel_triangular_mult
+
+
+# ========================================================================
+# Pairformer layer wrapper
+# ========================================================================
+
+
+class SinglePairformerLayerWrapper(torch.nn.Module):
+    """Wraps a single PairformerLayer for tracing with NKI kernels enabled."""
+
+    def __init__(self, layer):
+        super().__init__()
+        self.layer = layer
+
+    def forward(self, s, z, mask, pair_mask):
+        s, z = self.layer(s, z, mask, pair_mask, use_kernels=True)
+        return s, z
+
+
+# ========================================================================
+# Compilation via weight replacement
+# ========================================================================
+
+
+def compile_pairformer_weight_replaced(model, N, target="trn2"):
+    """Compile 64-layer pairformer using weight replacement.
+
+    Compiles a single layer with inline_weights_to_neff=False, then creates
+    64 copies with replace_weights() for each layer's unique weights.
+
+    Args:
+        model: Loaded Boltz2 model instance.
+        N: Sequence length (must be multiple of 128). Typically 256.
+        target: Neuron compilation target ("trn2").
+
+    Returns:
+        traced_layers: list of 64 traced models
+        compile_time: time to compile layer 0
+        total_swap_time: time for all weight swaps
+    """
+    C_s, C_z = 384, 128
+    all_layers = list(model.pairformer_module.layers)
+    NUM_LAYERS = len(all_layers)
+
+    s_dummy = torch.randn(1, N, C_s, dtype=torch.bfloat16) * 0.1
+    z_dummy = torch.randn(1, N, N, C_z, dtype=torch.bfloat16) * 0.1
+    mask_dummy = torch.ones(1, N, dtype=torch.float32)
+    pair_mask_dummy = torch.ones(1, N, N, dtype=torch.float32)
+
+    compiler_args = ["--target", target]
+
+    # Phase 1: Compile layer 0
+    print(f"\n  Compiling layer 0 (inline_weights_to_neff=False, target={target})...")
+    layer0_bf16 = copy.deepcopy(all_layers[0]).to(torch.bfloat16)
+    wrapper0 = SinglePairformerLayerWrapper(layer0_bf16)
+    wrapper0.eval()
+
+    t0 = time.time()
+    traced_template = torch_neuronx.trace(
+        wrapper0,
+        (s_dummy, z_dummy, mask_dummy, pair_mask_dummy),
+        compiler_args=compiler_args,
+        inline_weights_to_neff=False,
+    )
+    compile_time = time.time() - t0
+    print(f"  Layer 0 compiled in {compile_time:.1f}s")
+
+    try:
+        torch_neuronx.move_trace_to_device(traced_template, 0)
+    except Exception:
+        pass
+
+    # Warmup
+    with torch.no_grad():
+        _ = traced_template(s_dummy, z_dummy, mask_dummy, pair_mask_dummy)
+
+    # Phase 2: Weight replacement for layers 1..63
+    print(f"\n  Replacing weights for {NUM_LAYERS - 1} layers...")
+    traced_layers = [traced_template]
+    replace_times = []
+
+    for i in range(1, NUM_LAYERS):
+        layer_bf16 = copy.deepcopy(all_layers[i]).to(torch.bfloat16)
+        wrapper_i = SinglePairformerLayerWrapper(layer_bf16)
+        wrapper_i.eval()
+        new_state_dict = wrapper_i.state_dict()
+
+        t0 = time.time()
+        traced_copy = copy.deepcopy(traced_template)
+        torch_neuronx.replace_weights(traced_copy, new_state_dict)
+        swap_time = time.time() - t0
+        replace_times.append(swap_time)
+
+        try:
+            torch_neuronx.move_trace_to_device(traced_copy, 0)
+        except Exception:
+            pass
+
+        traced_layers.append(traced_copy)
+
+        if (i + 1) % 16 == 0 or i == NUM_LAYERS - 1:
+            print(f"    Layer {i + 1}/{NUM_LAYERS}: swap {swap_time:.1f}s")
+
+    total_swap_time = sum(replace_times)
+    print(
+        f"\n  Pairformer setup: compile {compile_time:.0f}s + "
+        f"swaps {total_swap_time:.0f}s = {compile_time + total_swap_time:.0f}s"
+    )
+
+    return traced_layers, compile_time, total_swap_time
+
+
+def run_pairformer_layers(traced_layers, s, z, mask, pair_mask):
+    """Run all traced pairformer layers sequentially.
+
+    Args:
+        traced_layers: list of 64 traced models from compile_pairformer_weight_replaced
+        s: [1, N, 384] single representation, bfloat16
+        z: [1, N, N, 128] pair representation, bfloat16
+        mask: [1, N] padding mask, float32
+        pair_mask: [1, N, N] pair padding mask, float32
+
+    Returns:
+        s_out, z_out: final pairformer outputs
+        total_time: total inference time in seconds
+    """
+    s_curr = s.to(torch.bfloat16)
+    z_curr = z.to(torch.bfloat16)
+
+    t0 = time.time()
+    for traced in traced_layers:
+        s_curr, z_curr = traced(s_curr, z_curr, mask, pair_mask)
+        s_curr = s_curr.to(torch.bfloat16)
+        z_curr = z_curr.to(torch.bfloat16)
+    total_time = time.time() - t0
+
+    return s_curr, z_curr, total_time
diff --git a/contrib/models/Boltz-2/src/nki_triangular_attention.py b/contrib/models/Boltz-2/src/nki_triangular_attention.py
new file mode 100644
index 00000000..a1fea28c
--- /dev/null
+++ b/contrib/models/Boltz-2/src/nki_triangular_attention.py
@@ -0,0 +1,492 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Triangular attention kernel for AlphaFold-family pairformer architectures.
+
+Implements the core attention computation from the triangular attention operation
+used in Boltz-2, AlphaFold2, and AlphaFold3 pairformer layers. This operation
+applies multi-head attention independently along each row (or column) of the
+pair representation matrix, with an additive bias derived from the pair
+representation projected to attention heads.
+
+The "starting node" (row-wise) variant is implemented directly. The "ending node"
+(column-wise) variant is obtained by transposing the pair representation (swapping
+spatial dims 0 and 1) before and after calling this kernel.
+
+Algorithm:
+    For each row i of the pair matrix, independently:
+        1. Extract Q[i,:,:], K[i,:,:], V[i,:,:] — each [N, Hd]
+        2. For each head h:
+            logits[j,k] = Q[i,j,h] @ K[i,k,h]^T * scale + bias[j,k,h]
+            weights[j,:] = softmax(logits[j,:])
+            output[i,j,h] = weights[j,:] @ V[i,:,h]
+
+    The bias is a full N x N matrix per head, shared across all rows i.
+    It is derived from the pair representation projected to H heads:
+        bias = permute(Linear(LayerNorm(z)), (2, 0, 1))  ->  [H, N, N]
+    and stored as (N, N, H) for natural DMA access.
+
+    Online softmax (Milakov & Gimelshein, 2018) is used to tile over the key
+    dimension without materializing the full N x N attention matrix.
+
+IO Shapes:
+    q:      (N, N, H * d)   — query, after QKV projection
+    k:      (N, N, H * d)   — key
+    v:      (N, N, H * d)   — value
+    bias:   (N, N, H)       — triangle bias, bias[q, k, h] for query pos q, key pos k
+    output: (N, N, H * d)   — attention output before gating/output projection
+
+Tiling:
+    - Partition dimension (axis 0 of SBUF tiles): P_MAX = 128
+    - N must be a multiple of P_MAX
+    - Head dimension d must be <= P_MAX (typically d=32 for Boltz-2, H=4)
+    - For Q@K^T: d is padded to P_MAX for nc_matmul contraction axis
+    - For attn@V: contraction over P_MAX key positions (natural fit)
+    - Bias is loaded as (P_MAX, P_MAX) tiles per (j_tile, k_tile) pair
+
+Reference:
+    Wohlwend et al., "Boltz-2: Predicting the Structure and Interactions of
+    Biomolecular Complexes", 2025. https://github.com/jwohlwend/boltz
+"""
+
+import numpy as np
+
+import nki
+import nki.isa as nisa
+import nki.language as nl
+
+# Partition dimension maximum. nl.tile_size.pmax returns a non-integer at
+# import time; using a Python int is required for static shape declarations.
+P_MAX = 128
+
+
+@nki.jit
+def triangular_attention_fwd(
+    q: nl.ndarray,
+    k: nl.ndarray,
+    v: nl.ndarray,
+    bias: nl.ndarray,
+    scale: float = 0.1767766952966369,
+) -> nl.ndarray:
+    """Compute triangular attention (starting node / row-wise).
+
+    Performs multi-head attention independently for each row of the pair
+    representation matrix, with a full 2D additive triangle bias per head.
+    Uses online softmax to handle large sequence lengths without materializing
+    the full N x N attention matrix.
+
+    Dimensions:
+        N:  Sequence length (number of tokens/residues). Must be a multiple of 128.
+        Hd: Total head dimension (H * d). Typically 128 for Boltz-2 (H=4, d=32).
+        H:  Number of attention heads.
+        d:  Per-head dimension (Hd // H).
+
+    Args:
+        q (nl.ndarray): Query tensor, shape (N, N, Hd), dtype bfloat16.
+        k (nl.ndarray): Key tensor, shape (N, N, Hd), dtype bfloat16.
+        v (nl.ndarray): Value tensor, shape (N, N, Hd), dtype bfloat16.
+        bias (nl.ndarray): Triangle bias, shape (N, N, H), dtype bfloat16.
+            bias[q_pos, k_pos, h] is the additive bias for query position
+            q_pos, key position k_pos, head h. This bias is shared across
+            all rows i — the same bias matrix applies to every row's
+            independent attention computation.
+        scale (float): Attention scaling factor, typically 1/sqrt(d).
+            Default is 1/sqrt(32) for Boltz-2.
+
+    Returns:
+        nl.ndarray: Attention output, shape (N, N, Hd), dtype bfloat16.
+
+    Notes:
+        - Linear projections (QKV, gating, output) are handled outside this
+          kernel for modularity. The caller applies them before/after.
+        - For the "ending node" (column-wise) variant, transpose the pair
+          representation's spatial dimensions before and after calling this kernel.
+        - Online softmax operates in float32 for numerical stability; the final
+          output is cast back to the input dtype (bfloat16).
+        - Q@K^T uses nc_matmul with d padded to P_MAX on the contraction axis.
+          For d=32 this wastes 75% of TensorEngine lanes but is correct.
+
+    Pseudocode:
+        output = zeros(N, N, Hd)
+        for i_row in range(N):                     # independent per row
+            for h in range(H):                      # independent per head
+                for j_tile in range(N // P_MAX):    # output query tile
+                    q_tile = load q[i_row, j_start:j_end, h*d:(h+1)*d]
+                    # Online softmax over key tiles
+                    m, l, o_acc = -inf, 0, 0
+                    for k_tile in range(N // P_MAX):
+                        k_t = load k[i_row, k_start:k_end, h*d:(h+1)*d]
+                        v_t = load v[i_row, k_start:k_end, h*d:(h+1)*d]
+                        bias_tile = load bias[j_start:j_end, k_start:k_end, h]
+                        logits = q_tile @ k_t^T * scale + bias_tile
+                        m_new = max(m, max(logits))
+                        correction = exp(m - m_new)
+                        o_acc = o_acc * correction + exp(logits - m_new) @ v_t
+                        l = l * correction + sum(exp(logits - m_new))
+                        m = m_new
+                    output[i_row, j_start:j_end, h*d:(h+1)*d] = o_acc / l
+        return output
+    """
+    N, N2, Hd = q.shape
+    N_bias, N_bias2, H = bias.shape
+    d = Hd // H
+
+    assert N == N2, f"Q must be square: got ({N}, {N2}, {Hd})"
+    assert N == N_bias and N == N_bias2, (
+        f"Bias shape ({N_bias}, {N_bias2}, {H}) must match Q N ({N})"
+    )
+    assert N % P_MAX == 0, f"N ({N}) must be a multiple of P_MAX ({P_MAX})"
+    assert d <= P_MAX, f"Head dim d ({d}) must be <= P_MAX ({P_MAX})"
+
+    # Output tensor in HBM — same shape as input
+    output = nl.ndarray((N, N, Hd), dtype=q.dtype, buffer=nl.shared_hbm)
+
+    # Access pattern constants for loading from 3D HBM tensors.
+    # For a tensor of shape (N, N, Hd), the row-major strides are:
+    #   dim 0 stride = N * Hd,  dim 1 stride = Hd,  dim 2 stride = 1
+    # We load 2D tiles (P_MAX, d) from a specific (i_row, j_start, hd_start).
+    q_stride_row = N * Hd  # stride along dim 0 (row i)
+    q_stride_col = Hd  # stride along dim 1 (position j/k within row)
+
+    # For bias tensor of shape (N, N, H), strides are:
+    #   dim 0 stride = N * H,  dim 1 stride = H,  dim 2 stride = 1
+    # We load 2D tiles (P_MAX, P_MAX) from bias[j_start, k_start, h] for each
+    # (j_tile, k_tile, head) combination. The tile has P_MAX query positions
+    # (strided by N*H) and P_MAX key positions (strided by H).
+    bias_stride_q = N * H  # stride along dim 0 (query position)
+    bias_stride_k = H  # stride along dim 1 (key position)
+
+    # Process each row i independently (embarrassingly parallel across rows)
+    for i_row in nl.affine_range(N):
+        # Base HBM offset for row i: i_row * N * Hd
+        row_base = i_row * q_stride_row
+
+        # Process each head independently (parallel across heads)
+        for h in nl.affine_range(H):
+            hd_start = h * d
+
+            # Tile over query positions j (output positions), P_MAX at a time
+            for j_tile in nl.affine_range(N // P_MAX):
+                j_start = j_tile * P_MAX
+
+                # ---- Load Q tile: (P_MAX, d) ----
+                # From q[i_row, j_start : j_start+P_MAX, hd_start : hd_start+d]
+                q_tile = nl.ndarray((P_MAX, d), dtype=q.dtype, buffer=nl.sbuf)
+                nisa.dma_copy(
+                    dst=q_tile,
+                    src=q.ap(
+                        pattern=[[q_stride_col, P_MAX], [1, d]],
+                        offset=row_base + j_start * q_stride_col + hd_start,
+                    ),
+                )
+
+                # ---- Prepare Q^T padded for nc_matmul ----
+                # nc_matmul contraction dim must be P_MAX (partition axis).
+                # With d < P_MAX, we pad Q to (P_MAX, P_MAX), transpose, then
+                # use the transposed version as stationary operand.
+                q_padded = nl.ndarray((P_MAX, P_MAX), dtype=q.dtype, buffer=nl.sbuf)
+                nisa.memset(dst=q_padded, value=0.0)
+                nisa.tensor_copy(dst=q_padded[0:P_MAX, 0:d], src=q_tile)
+
+                q_t_psum = nl.ndarray((P_MAX, P_MAX), dtype=q.dtype, buffer=nl.psum)
+                nisa.nc_transpose(dst=q_t_psum, data=q_padded)
+                q_t = nl.ndarray((P_MAX, P_MAX), dtype=q.dtype, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=q_t, src=q_t_psum)
+
+                # ---- Online softmax accumulators ----
+                m_prev = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=m_prev, value=-1e30)
+
+                l_prev = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=l_prev, value=0.0)
+
+                o_acc = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=o_acc, value=0.0)
+
+                # ---- Tile over key positions (sequential for online softmax) ----
+                for k_tile_idx in nl.sequential_range(N // P_MAX):
+                    k_start = k_tile_idx * P_MAX
+
+                    # Load K tile: (P_MAX, d)
+                    # From k[i_row, k_start : k_start+P_MAX, hd_start : hd_start+d]
+                    k_tile_sb = nl.ndarray((P_MAX, d), dtype=k.dtype, buffer=nl.sbuf)
+                    nisa.dma_copy(
+                        dst=k_tile_sb,
+                        src=k.ap(
+                            pattern=[[q_stride_col, P_MAX], [1, d]],
+                            offset=row_base + k_start * q_stride_col + hd_start,
+                        ),
+                    )
+
+                    # Load V tile: (P_MAX, d)
+                    # From v[i_row, k_start : k_start+P_MAX, hd_start : hd_start+d]
+                    v_tile_sb = nl.ndarray((P_MAX, d), dtype=v.dtype, buffer=nl.sbuf)
+                    nisa.dma_copy(
+                        dst=v_tile_sb,
+                        src=v.ap(
+                            pattern=[[q_stride_col, P_MAX], [1, d]],
+                            offset=row_base + k_start * q_stride_col + hd_start,
+                        ),
+                    )
+
+                    # ---- Load bias tile: (P_MAX, P_MAX) ----
+                    # From bias[j_start : j_start+P_MAX, k_start : k_start+P_MAX, h]
+                    # This is a full 2D bias tile for query positions j and key
+                    # positions k. Each element bias[j, k, h] is the additive bias
+                    # for query pos j, key pos k, head h.
+                    bias_tile = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=bias.dtype, buffer=nl.sbuf
+                    )
+                    nisa.dma_copy(
+                        dst=bias_tile,
+                        src=bias.ap(
+                            pattern=[[bias_stride_q, P_MAX], [bias_stride_k, P_MAX]],
+                            offset=j_start * bias_stride_q
+                            + k_start * bias_stride_k
+                            + h,
+                        ),
+                    )
+
+                    # ---- Compute Q @ K^T via nc_matmul ----
+                    # Pad K to (P_MAX, P_MAX), transpose, then matmul.
+                    # Result: logits (P_MAX, P_MAX) = Q^T^T @ K^T = Q @ K^T
+                    k_padded = nl.ndarray((P_MAX, P_MAX), dtype=k.dtype, buffer=nl.sbuf)
+                    nisa.memset(dst=k_padded, value=0.0)
+                    nisa.tensor_copy(dst=k_padded[0:P_MAX, 0:d], src=k_tile_sb)
+
+                    k_t_psum = nl.ndarray((P_MAX, P_MAX), dtype=k.dtype, buffer=nl.psum)
+                    nisa.nc_transpose(dst=k_t_psum, data=k_padded)
+                    k_t = nl.ndarray((P_MAX, P_MAX), dtype=k.dtype, buffer=nl.sbuf)
+                    nisa.tensor_copy(dst=k_t, src=k_t_psum)
+
+                    logits_psum = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum
+                    )
+                    nisa.nc_matmul(dst=logits_psum, stationary=q_t, moving=k_t)
+
+                    logits = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_copy(dst=logits, src=logits_psum)
+
+                    # Scale logits by 1/sqrt(d)
+                    logits_scaled = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_scalar(
+                        dst=logits_scaled,
+                        data=logits,
+                        op0=nl.multiply,
+                        operand0=scale,
+                        engine=nisa.vector_engine,
+                    )
+
+                    # Add full 2D triangle bias: (P_MAX, P_MAX) + (P_MAX, P_MAX)
+                    bias_fp32 = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_copy(dst=bias_fp32, src=bias_tile)
+
+                    logits_biased = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_tensor(
+                        dst=logits_biased,
+                        data1=logits_scaled,
+                        data2=bias_fp32,
+                        op=nl.add,
+                    )
+
+                    # ---- Online softmax (Milakov & Gimelshein, 2018) ----
+                    # Step 1: tile max
+                    tile_max = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_reduce(
+                        dst=tile_max, op=nl.maximum, data=logits_biased, axis=1
+                    )
+
+                    # Step 2: running max
+                    m_new = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_tensor(
+                        dst=m_new, data1=m_prev, data2=tile_max, op=nl.maximum
+                    )
+
+                    # Step 3: correction factor exp(m_prev - m_new)
+                    m_diff = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_tensor(
+                        dst=m_diff, data1=m_prev, data2=m_new, op=nl.subtract
+                    )
+                    correction = nl.ndarray(
+                        (P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.activation(
+                        dst=correction, op=nl.exp, data=m_diff, bias=None, scale=1.0
+                    )
+
+                    # Step 4: exp(logits - m_new)
+                    logits_shifted = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_scalar(
+                        dst=logits_shifted,
+                        data=logits_biased,
+                        op0=nl.subtract,
+                        operand0=m_new,
+                        engine=nisa.vector_engine,
+                    )
+                    exp_logits = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.activation(
+                        dst=exp_logits,
+                        op=nl.exp,
+                        data=logits_shifted,
+                        bias=None,
+                        scale=1.0,
+                    )
+
+                    # Step 5: update running sum l = l * correction + sum(exp_logits)
+                    l_corrected = nl.ndarray(
+                        (P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_tensor(
+                        dst=l_corrected, data1=l_prev, data2=correction, op=nl.multiply
+                    )
+                    tile_sum = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_reduce(dst=tile_sum, op=nl.add, data=exp_logits, axis=1)
+                    l_new = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_tensor(
+                        dst=l_new, data1=l_corrected, data2=tile_sum, op=nl.add
+                    )
+
+                    # Step 6: rescale previous output accumulator
+                    o_scaled = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_scalar(
+                        dst=o_scaled,
+                        data=o_acc,
+                        op0=nl.multiply,
+                        operand0=correction,
+                        engine=nisa.vector_engine,
+                    )
+
+                    # ---- exp_logits @ V via nc_matmul ----
+                    # Contraction is over P_MAX key positions (natural fit).
+                    # Need to transpose exp_logits for nc_matmul convention.
+                    exp_bf16 = nl.ndarray((P_MAX, P_MAX), dtype=q.dtype, buffer=nl.sbuf)
+                    nisa.tensor_copy(dst=exp_bf16, src=exp_logits)
+
+                    exp_t_psum = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=q.dtype, buffer=nl.psum
+                    )
+                    nisa.nc_transpose(dst=exp_t_psum, data=exp_bf16)
+                    exp_t = nl.ndarray((P_MAX, P_MAX), dtype=q.dtype, buffer=nl.sbuf)
+                    nisa.tensor_copy(dst=exp_t, src=exp_t_psum)
+
+                    pv_psum = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.psum)
+                    nisa.nc_matmul(dst=pv_psum, stationary=exp_t, moving=v_tile_sb)
+
+                    pv_sbuf = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_copy(dst=pv_sbuf, src=pv_psum)
+
+                    # Step 7: accumulate o_acc = o_scaled + pv
+                    nisa.tensor_tensor(
+                        dst=o_acc, data1=o_scaled, data2=pv_sbuf, op=nl.add
+                    )
+
+                    # Update running state
+                    nisa.tensor_copy(dst=m_prev, src=m_new)
+                    nisa.tensor_copy(dst=l_prev, src=l_new)
+
+                # ---- Finalize: output = o_acc / l ----
+                inv_l = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.reciprocal(dst=inv_l, data=l_prev)
+
+                o_final = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_scalar(
+                    dst=o_final,
+                    data=o_acc,
+                    op0=nl.multiply,
+                    operand0=inv_l,
+                    engine=nisa.vector_engine,
+                )
+
+                # Cast back to input dtype and store to HBM
+                o_out = nl.ndarray((P_MAX, d), dtype=q.dtype, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=o_out, src=o_final)
+
+                nisa.dma_copy(
+                    dst=output.ap(
+                        pattern=[[q_stride_col, P_MAX], [1, d]],
+                        offset=row_base + j_start * q_stride_col + hd_start,
+                    ),
+                    src=o_out,
+                )
+
+    return output
+
+
+# ---------------------------------------------------------------------------
+# CPU reference implementation for testing
+# ---------------------------------------------------------------------------
+
+
+def triangular_attention_ref(q, k, v, bias, scale):
+    """NumPy reference implementation for triangular attention.
+
+    Args:
+        q: (N, N, H*d) float32 — query tensor.
+        k: (N, N, H*d) float32 — key tensor.
+        v: (N, N, H*d) float32 — value tensor.
+        bias: (N, N, H) float32 — triangle bias. bias[q_pos, k_pos, h] is the
+            additive bias for query position q_pos, key position k_pos, head h.
+            Shared across all rows.
+        scale: float — 1/sqrt(d).
+
+    Returns:
+        output: (N, N, H*d) float32 — attention output.
+    """
+    N, _, Hd = q.shape
+    H = bias.shape[2]
+    d = Hd // H
+
+    output = np.zeros_like(q)
+
+    for h in range(H):
+        hd_s = h * d
+        hd_e = (h + 1) * d
+
+        q_h = q[:, :, hd_s:hd_e]  # (N, N, d)
+        k_h = k[:, :, hd_s:hd_e]  # (N, N, d)
+        v_h = v[:, :, hd_s:hd_e]  # (N, N, d)
+        bias_h = bias[:, :, h]  # (N, N) — full 2D bias for head h
+
+        for i in range(N):
+            q_row = q_h[i]  # (N, d)
+            k_row = k_h[i]  # (N, d)
+            v_row = v_h[i]  # (N, d)
+
+            # logits: (N, N) = q_row @ k_row^T
+            logits = q_row @ k_row.T * scale  # (N, N)
+
+            # Add full 2D bias: bias_h[j, k] for each (query j, key k) pair
+            logits += bias_h  # (N, N) + (N, N)
+
+            # Softmax over k (axis=-1)
+            logits_max = logits.max(axis=-1, keepdims=True)
+            exp_logits = np.exp(logits - logits_max)
+            attn_weights = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
+
+            # Weighted sum: (N, N) @ (N, d) -> (N, d)
+            output[i, :, hd_s:hd_e] = attn_weights @ v_row
+
+    return output
diff --git a/contrib/models/Boltz-2/src/nki_triangular_mul.py b/contrib/models/Boltz-2/src/nki_triangular_mul.py
new file mode 100644
index 00000000..7cdb8e8b
--- /dev/null
+++ b/contrib/models/Boltz-2/src/nki_triangular_mul.py
@@ -0,0 +1,268 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Triangular multiplicative update kernel for AlphaFold-family pairformers.
+
+Implements the core einsum contraction from the triangular multiplicative update
+operation used in Boltz-2, AlphaFold2, and AlphaFold3 pairformer layers. This
+operation computes a contraction over one spatial dimension of two gated
+projections of the pair representation, producing a rank-2 update per channel.
+
+Two variants exist:
+    - Outgoing: result[i,j,d] = sum_k a[i,k,d] * b[j,k,d]   (einsum "bikd,bjkd->bijd")
+    - Incoming: result[i,j,d] = sum_k a[k,i,d] * b[k,j,d]   (einsum "bkid,bkjd->bijd")
+
+This kernel implements the outgoing variant directly. The incoming variant is
+obtained by transposing the spatial dimensions of a and b before calling.
+
+Algorithm:
+    For each channel d independently:
+        result[:,:,d] = a[:,:,d] @ b[:,:,d].T       (N x N matrix multiply)
+
+    This decomposes the 4D einsum into D independent NxN matmuls, one per
+    channel. Each matmul is tiled into P_MAX x P_MAX blocks and accumulated
+    in float32.
+
+IO Shapes:
+    a:      (N, N, D)   — first gated projection, after sigmoid gating and masking
+    b:      (N, N, D)   — second gated projection
+    output: (N, N, D)   — contraction result, in float32 semantics
+
+    Where D = 128 for Boltz-2 (dim of pair representation).
+
+Tiling:
+    - Partition dimension (axis 0 of SBUF tiles): P_MAX = 128
+    - N must be a multiple of P_MAX
+    - D must be <= P_MAX (D=128 for Boltz-2)
+    - For each channel d: tile the NxN matmul a_d @ b_d^T into P_MAX blocks
+    - Accumulate in float32, output in bfloat16
+
+Precision:
+    - Boltz-2 explicitly casts a and b to float32 before the einsum
+    - The kernel accepts bfloat16 inputs (post-gating) and accumulates in float32
+    - nc_matmul on NeuronCore accumulates in float32 natively for bf16 inputs
+    - Output is cast to bfloat16 for the subsequent LayerNorm + output gating
+
+Reference:
+    Wohlwend et al., "Boltz-2: Predicting the Structure and Interactions of
+    Biomolecular Complexes", 2025. https://github.com/jwohlwend/boltz
+"""
+
+import numpy as np
+
+import nki
+import nki.isa as nisa
+import nki.language as nl
+
+P_MAX = 128
+
+
+@nki.jit
+def triangular_mul_fwd(
+    a: nl.ndarray,
+    b: nl.ndarray,
+) -> nl.ndarray:
+    """Compute triangular multiplicative update (outgoing variant).
+
+    Performs the core einsum contraction: result[i,j,d] = sum_k a[i,k,d] * b[j,k,d].
+    This is equivalent to D independent NxN matrix multiplies: for each channel d,
+    result[:,:,d] = a[:,:,d] @ b[:,:,d].T.
+
+    For the incoming variant (bkid,bkjd->bijd), transpose the spatial dimensions
+    of a and b before calling: a_incoming[i,k,d] = a_original[k,i,d].
+
+    Dimensions:
+        N: Sequence length (number of tokens/residues). Must be a multiple of 128.
+        D: Channel dimension. Must be <= 128. Typically 128 for Boltz-2.
+
+    Args:
+        a (nl.ndarray): First projection tensor, shape (N, N, D), dtype bfloat16.
+            After gating: a = sigmoid(gate_a(z_norm)) * proj_a(z_norm), masked.
+        b (nl.ndarray): Second projection tensor, shape (N, N, D), dtype bfloat16.
+            After gating: b = sigmoid(gate_b(z_norm)) * proj_b(z_norm), masked.
+
+    Returns:
+        nl.ndarray: Contraction result, shape (N, N, D), dtype bfloat16.
+            The caller applies LayerNorm + output gating + output projection.
+
+    Notes:
+        - Linear projections, gating, masking, LayerNorm, and output projection
+          are handled outside this kernel for modularity.
+        - Float32 accumulation is handled natively by nc_matmul for bf16 inputs.
+        - Output is cast to bfloat16. Boltz-2 applies LayerNorm (which uses
+          float32 internally) immediately after, so bf16 output is acceptable.
+        - For each channel d, the kernel performs a standard tiled matrix multiply
+          a_d @ b_d^T using nc_matmul with P_MAX-sized tiles.
+    """
+    N, N2, D = a.shape
+    N_b, N2_b, D_b = b.shape
+
+    assert N == N2, f"a must be square: got ({N}, {N2}, {D})"
+    assert N == N_b and N2 == N2_b and D == D_b, (
+        f"a shape ({N}, {N2}, {D}) must match b shape ({N_b}, {N2_b}, {D_b})"
+    )
+    assert N % P_MAX == 0, f"N ({N}) must be a multiple of P_MAX ({P_MAX})"
+    assert D <= P_MAX, f"D ({D}) must be <= P_MAX ({P_MAX})"
+
+    # Output tensor in HBM
+    output = nl.ndarray((N, N, D), dtype=a.dtype, buffer=nl.shared_hbm)
+
+    # Strides for 3D tensor (N, N, D) in row-major layout
+    # a[i, k, d] is at offset i * N * D + k * D + d
+    stride_i = N * D  # stride along dim 0
+    stride_k = D  # stride along dim 1
+
+    n_tiles = N // P_MAX
+
+    # Process each channel d independently
+    for d in nl.affine_range(D):
+        # For each output tile (i_tile, j_tile)
+        for i_tile in nl.affine_range(n_tiles):
+            i_start = i_tile * P_MAX
+
+            for j_tile in nl.affine_range(n_tiles):
+                j_start = j_tile * P_MAX
+
+                # Accumulator for result[i_start:i_end, j_start:j_end, d]
+                # This is the partial sum over k of a[i,k,d] * b[j,k,d]
+                acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=acc, value=0.0)
+
+                # Tile over contraction axis k
+                for k_tile_idx in nl.sequential_range(n_tiles):
+                    k_start = k_tile_idx * P_MAX
+
+                    # Load a_tile: a[i_start:i_end, k_start:k_end, d]
+                    # Shape: (P_MAX, P_MAX) — P_MAX i-positions, P_MAX k-positions
+                    # Stride: dim0 = N*D (between i rows), dim1 = D (between k cols)
+                    a_tile = nl.ndarray((P_MAX, P_MAX), dtype=a.dtype, buffer=nl.sbuf)
+                    nisa.dma_copy(
+                        dst=a_tile,
+                        src=a.ap(
+                            pattern=[[stride_i, P_MAX], [stride_k, P_MAX]],
+                            offset=i_start * stride_i + k_start * stride_k + d,
+                        ),
+                    )
+
+                    # Load b_tile: b[j_start:j_end, k_start:k_end, d]
+                    # Same layout as a
+                    b_tile = nl.ndarray((P_MAX, P_MAX), dtype=b.dtype, buffer=nl.sbuf)
+                    nisa.dma_copy(
+                        dst=b_tile,
+                        src=b.ap(
+                            pattern=[[stride_i, P_MAX], [stride_k, P_MAX]],
+                            offset=j_start * stride_i + k_start * stride_k + d,
+                        ),
+                    )
+
+                    # Compute a_tile @ b_tile^T using nc_matmul
+                    # nc_matmul: dst = stationary^T @ moving
+                    # We want a_tile @ b_tile^T = (a_tile^T)^T @ b_tile^T
+                    # So: stationary = a_tile (nc_matmul transposes it internally)
+                    #     moving = b_tile
+                    # Wait -- nc_matmul computes: dst[f1, f2] = sum_p stationary[p, f1] * moving[p, f2]
+                    # = stationary^T @ moving
+                    #
+                    # We want: result[i, j] = sum_k a[i,k] * b[j,k]
+                    # = a @ b^T
+                    # = (b @ a^T)^T
+                    #
+                    # Using nc_matmul with stationary=b_tile, moving=a_tile:
+                    #   dst[f1, f2] = sum_p b[p, f1] * a[p, f2] = b^T @ a = (a^T @ b)^...
+                    #
+                    # Actually let me be precise:
+                    # nc_matmul: dst[i,j] = sum_k stationary[k,i] * moving[k,j]
+                    # This computes stationary^T @ moving.
+                    #
+                    # We want: result[i,j] = sum_k a[i,k] * b[j,k] = a @ b^T
+                    #
+                    # If we transpose a first: a^T[k,i] = a[i,k]
+                    # Then nc_matmul with stationary=a^T, moving=b^T:
+                    #   dst[i,j] = sum_k a^T[k,i]^T... no that's wrong
+                    #
+                    # nc_matmul(stationary=X, moving=Y) = X^T @ Y
+                    #   dst[i,j] = sum_k X[k,i] * Y[k,j]
+                    #
+                    # We want sum_k a[i,k] * b[j,k].
+                    # Let X[k,i] = a[i,k] => X = a^T. Let Y[k,j] = b[j,k] => Y = b^T.
+                    # Then dst[i,j] = sum_k a^T[k,i] * b^T[k,j] ...
+                    # = sum_k a[i,k] * b[j,k]. YES!
+                    #
+                    # So: stationary = a^T (transpose a_tile), moving = b^T (transpose b_tile)
+
+                    # Transpose a_tile
+                    a_t_psum = nl.ndarray((P_MAX, P_MAX), dtype=a.dtype, buffer=nl.psum)
+                    nisa.nc_transpose(dst=a_t_psum, data=a_tile)
+                    a_t = nl.ndarray((P_MAX, P_MAX), dtype=a.dtype, buffer=nl.sbuf)
+                    nisa.tensor_copy(dst=a_t, src=a_t_psum)
+
+                    # Transpose b_tile
+                    b_t_psum = nl.ndarray((P_MAX, P_MAX), dtype=b.dtype, buffer=nl.psum)
+                    nisa.nc_transpose(dst=b_t_psum, data=b_tile)
+                    b_t = nl.ndarray((P_MAX, P_MAX), dtype=b.dtype, buffer=nl.sbuf)
+                    nisa.tensor_copy(dst=b_t, src=b_t_psum)
+
+                    # nc_matmul: stationary=a^T, moving=b^T => a @ b^T
+                    partial_psum = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum
+                    )
+                    nisa.nc_matmul(dst=partial_psum, stationary=a_t, moving=b_t)
+                    partial = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_copy(dst=partial, src=partial_psum)
+
+                    # Accumulate
+                    nisa.tensor_tensor(dst=acc, data1=acc, data2=partial, op=nl.add)
+
+                # Cast to output dtype and store
+                out_tile = nl.ndarray((P_MAX, P_MAX), dtype=a.dtype, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=out_tile, src=acc)
+
+                # Store result[i_start:i_end, j_start:j_end, d]
+                # Output has same (N, N, D) layout as input
+                nisa.dma_copy(
+                    dst=output.ap(
+                        pattern=[[stride_i, P_MAX], [stride_k, P_MAX]],
+                        offset=i_start * stride_i + j_start * stride_k + d,
+                    ),
+                    src=out_tile,
+                )
+
+    return output
+
+
+# ---------------------------------------------------------------------------
+# CPU reference implementation for testing
+# ---------------------------------------------------------------------------
+
+
+def triangular_mul_ref(a, b):
+    """NumPy reference: outgoing triangular multiplicative update.
+
+    Computes result[i,j,d] = sum_k a[i,k,d] * b[j,k,d] for all (i,j,d).
+    Equivalent to: for each d, result[:,:,d] = a[:,:,d] @ b[:,:,d].T
+
+    Args:
+        a: (N, N, D) float32
+        b: (N, N, D) float32
+
+    Returns:
+        output: (N, N, D) float32
+    """
+    N, _, D = a.shape
+    output = np.zeros_like(a)
+    for d in range(D):
+        output[:, :, d] = a[:, :, d] @ b[:, :, d].T
+    return output
diff --git a/contrib/models/Boltz-2/test/__init__.py b/contrib/models/Boltz-2/test/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/contrib/models/Boltz-2/test/integration/__init__.py b/contrib/models/Boltz-2/test/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/contrib/models/Boltz-2/test/integration/test_model.py b/contrib/models/Boltz-2/test/integration/test_model.py
new file mode 100644
index 00000000..960a4e86
--- /dev/null
+++ b/contrib/models/Boltz-2/test/integration/test_model.py
@@ -0,0 +1,291 @@
+"""Integration tests for Boltz-2 pairformer on AWS Trainium 2.
+
+Tests compile the pairformer trunk using weight replacement with NKI kernels,
+run inference, and validate accuracy against CPU reference.
+
+Prerequisites:
+    pip install boltz==2.2.1
+
+Environment variables:
+    BOLTZ2_CHECKPOINT: Path to Boltz-2 checkpoint (default: ~/.boltz/boltz2_conf.ckpt)
+    NEURON_PLATFORM_TARGET_OVERRIDE: Must be "trn2" (set automatically)
+
+Run:
+    NEURON_RT_VISIBLE_CORES=0 pytest test/integration/test_model.py -v -s
+"""
+
+import os
+import sys
+
+# Set trn2 target before any Neuron imports
+os.environ.setdefault("NEURON_PLATFORM_TARGET_OVERRIDE", "trn2")
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+# Add src/ to path for imports
+SRC_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "src")
+sys.path.insert(0, SRC_DIR)
+
+
+# ========================================================================
+# Constants
+# ========================================================================
+
+N = 128  # Sequence length for tests (128 = 1 tile, fastest compile)
+C_s = 384  # Single representation channels
+C_z = 128  # Pair representation channels
+NUM_LAYERS_TEST = 2  # Number of layers for accuracy test (2 = fast, sufficient)
+CHECKPOINT_PATH = os.environ.get(
+    "BOLTZ2_CHECKPOINT",
+    os.path.expanduser("~/.boltz/boltz2_conf.ckpt"),
+)
+
+
+# ========================================================================
+# Fixtures
+# ========================================================================
+
+
+@pytest.fixture(scope="module")
+def boltz2_model():
+    """Load Boltz-2 model from checkpoint."""
+    if not os.path.exists(CHECKPOINT_PATH):
+        pytest.skip(
+            f"Boltz-2 checkpoint not found at {CHECKPOINT_PATH}. "
+            f"Download with: boltz predict --help  (auto-downloads on first run), "
+            f"or set BOLTZ2_CHECKPOINT env var."
+        )
+
+    from dataclasses import asdict
+
+    from boltz.main import (
+        Boltz2,
+        Boltz2DiffusionParams,
+        BoltzSteeringParams,
+        MSAModuleArgs,
+        PairformerArgsV2,
+    )
+
+    model = Boltz2.load_from_checkpoint(
+        CHECKPOINT_PATH,
+        strict=True,
+        predict_args={
+            "recycling_steps": 1,
+            "sampling_steps": 1,
+            "diffusion_samples": 1,
+            "max_parallel_samples": 1,
+            "write_confidence_summary": False,
+            "write_full_pae": False,
+            "write_full_pde": False,
+        },
+        map_location="cpu",
+        diffusion_process_args=asdict(Boltz2DiffusionParams()),
+        ema=False,
+        use_kernels=False,
+        pairformer_args=asdict(PairformerArgsV2()),
+        msa_args=asdict(MSAModuleArgs(use_paired_feature=True)),
+        steering_args=asdict(BoltzSteeringParams()),
+    )
+    model.eval()
+    model = model.float()
+    return model
+
+
+@pytest.fixture(scope="module")
+def compiled_pairformer(boltz2_model):
+    """Compile pairformer layers using weight replacement + NKI kernels."""
+    from modeling_boltz2 import (
+        compile_pairformer_weight_replaced,
+        patch_boltz2_with_nki_kernels,
+    )
+
+    patch_boltz2_with_nki_kernels()
+
+    # Compile only NUM_LAYERS_TEST layers for speed
+    import copy
+
+    from modeling_boltz2 import SinglePairformerLayerWrapper
+
+    all_layers = list(boltz2_model.pairformer_module.layers)[:NUM_LAYERS_TEST]
+
+    s_dummy = torch.randn(1, N, C_s, dtype=torch.bfloat16) * 0.1
+    z_dummy = torch.randn(1, N, N, C_z, dtype=torch.bfloat16) * 0.1
+    mask_dummy = torch.ones(1, N, dtype=torch.float32)
+    pair_mask_dummy = torch.ones(1, N, N, dtype=torch.float32)
+
+    import torch_neuronx
+
+    # Compile layer 0
+    layer0 = copy.deepcopy(all_layers[0]).to(torch.bfloat16)
+    wrapper0 = SinglePairformerLayerWrapper(layer0)
+    wrapper0.eval()
+
+    traced_template = torch_neuronx.trace(
+        wrapper0,
+        (s_dummy, z_dummy, mask_dummy, pair_mask_dummy),
+        compiler_args=["--target", "trn2"],
+        inline_weights_to_neff=False,
+    )
+
+    try:
+        torch_neuronx.move_trace_to_device(traced_template, 0)
+    except Exception:
+        pass
+
+    # Warmup
+    with torch.no_grad():
+        _ = traced_template(s_dummy, z_dummy, mask_dummy, pair_mask_dummy)
+
+    # Weight replacement for remaining layers
+    traced_layers = [traced_template]
+    for i in range(1, len(all_layers)):
+        layer_bf16 = copy.deepcopy(all_layers[i]).to(torch.bfloat16)
+        wrapper_i = SinglePairformerLayerWrapper(layer_bf16)
+        wrapper_i.eval()
+
+        traced_copy = copy.deepcopy(traced_template)
+        torch_neuronx.replace_weights(traced_copy, wrapper_i.state_dict())
+
+        try:
+            torch_neuronx.move_trace_to_device(traced_copy, 0)
+        except Exception:
+            pass
+
+        traced_layers.append(traced_copy)
+
+    return traced_layers
+
+
+@pytest.fixture(scope="module")
+def test_inputs():
+    """Create deterministic test inputs."""
+    torch.manual_seed(42)
+    s = torch.randn(1, N, C_s, dtype=torch.float32) * 0.1
+    z = torch.randn(1, N, N, C_z, dtype=torch.float32) * 0.1
+    mask = torch.ones(1, N, dtype=torch.float32)
+    pair_mask = torch.ones(1, N, N, dtype=torch.float32)
+    return s, z, mask, pair_mask
+
+
+# ========================================================================
+# Tests
+# ========================================================================
+
+
+class TestBoltz2Pairformer:
+    """Test Boltz-2 pairformer with NKI kernels on Neuron."""
+
+    def test_model_loads(self, boltz2_model):
+        """Verify Boltz-2 model loads from checkpoint."""
+        assert boltz2_model is not None
+        layers = list(boltz2_model.pairformer_module.layers)
+        assert len(layers) == 64, f"Expected 64 pairformer layers, got {len(layers)}"
+
+    def test_compilation_succeeds(self, compiled_pairformer):
+        """Verify pairformer compiles with NKI kernels."""
+        assert compiled_pairformer is not None
+        assert len(compiled_pairformer) == NUM_LAYERS_TEST
+
+    def test_forward_pass_no_nan(self, compiled_pairformer, test_inputs):
+        """Verify forward pass produces valid outputs (no NaN/Inf)."""
+        s, z, mask, pair_mask = test_inputs
+        s_curr = s.to(torch.bfloat16)
+        z_curr = z.to(torch.bfloat16)
+
+        with torch.no_grad():
+            for traced in compiled_pairformer:
+                s_curr, z_curr = traced(s_curr, z_curr, mask, pair_mask)
+                s_curr = s_curr.to(torch.bfloat16)
+                z_curr = z_curr.to(torch.bfloat16)
+
+        assert not torch.isnan(s_curr).any(), "s output contains NaN"
+        assert not torch.isnan(z_curr).any(), "z output contains NaN"
+        assert not torch.isinf(s_curr).any(), "s output contains Inf"
+        assert not torch.isinf(z_curr).any(), "z output contains Inf"
+        assert s_curr.shape == (1, N, C_s)
+        assert z_curr.shape == (1, N, N, C_z)
+
+    def test_accuracy_vs_cpu(self, boltz2_model, compiled_pairformer, test_inputs):
+        """Compare Neuron output against CPU reference.
+
+        Validated results at N=128 with trained weights:
+            s_cos >= 0.999 (measured 0.999796 for 1 layer)
+            z_cos >= 0.995 (measured 0.998359 for 1 layer)
+        """
+        s, z, mask, pair_mask = test_inputs
+
+        # CPU reference (first NUM_LAYERS_TEST layers only)
+        with torch.no_grad():
+            s_cpu = s.clone()
+            z_cpu = z.clone()
+            layers = list(boltz2_model.pairformer_module.layers)[:NUM_LAYERS_TEST]
+            for layer in layers:
+                s_cpu, z_cpu = layer(
+                    s_cpu, z_cpu, mask=mask, pair_mask=pair_mask, use_kernels=False
+                )
+
+        # Neuron inference
+        with torch.no_grad():
+            s_neuron = s.to(torch.bfloat16)
+            z_neuron = z.to(torch.bfloat16)
+            for traced in compiled_pairformer:
+                s_neuron, z_neuron = traced(s_neuron, z_neuron, mask, pair_mask)
+                s_neuron = s_neuron.to(torch.bfloat16)
+                z_neuron = z_neuron.to(torch.bfloat16)
+
+        # Compare
+        s_cos = F.cosine_similarity(
+            s_cpu.flatten().unsqueeze(0).float(),
+            s_neuron.flatten().unsqueeze(0).float(),
+        ).item()
+        z_cos = F.cosine_similarity(
+            z_cpu.flatten().unsqueeze(0).float(),
+            z_neuron.flatten().unsqueeze(0).float(),
+        ).item()
+
+        print(f"\n  Accuracy ({NUM_LAYERS_TEST} layers, N={N}):")
+        print(f"    s_cos = {s_cos:.6f}")
+        print(f"    z_cos = {z_cos:.6f}")
+
+        # Thresholds based on measured results with trained weights
+        assert s_cos >= 0.99, f"s cosine similarity {s_cos:.6f} < 0.99"
+        assert z_cos >= 0.99, f"z cosine similarity {z_cos:.6f} < 0.99"
+
+    def test_inference_latency(self, compiled_pairformer, test_inputs):
+        """Measure inference latency per layer."""
+        s, z, mask, pair_mask = test_inputs
+        import time
+
+        # Warmup
+        with torch.no_grad():
+            s_w = s.to(torch.bfloat16)
+            z_w = z.to(torch.bfloat16)
+            for traced in compiled_pairformer:
+                s_w, z_w = traced(s_w, z_w, mask, pair_mask)
+                s_w = s_w.to(torch.bfloat16)
+                z_w = z_w.to(torch.bfloat16)
+
+        # Timed runs
+        latencies = []
+        for _ in range(3):
+            s_t = s.to(torch.bfloat16)
+            z_t = z.to(torch.bfloat16)
+            t0 = time.time()
+            with torch.no_grad():
+                for traced in compiled_pairformer:
+                    s_t, z_t = traced(s_t, z_t, mask, pair_mask)
+                    s_t = s_t.to(torch.bfloat16)
+                    z_t = z_t.to(torch.bfloat16)
+            latencies.append(time.time() - t0)
+
+        avg_latency = sum(latencies) / len(latencies)
+        per_layer = avg_latency / NUM_LAYERS_TEST * 1000
+
+        print(f"\n  Latency ({NUM_LAYERS_TEST} layers, N={N}):")
+        print(f"    Total: {avg_latency * 1000:.1f} ms")
+        print(f"    Per layer: {per_layer:.1f} ms")
+
+        # Sanity check: should be faster than 10s for 2 layers at N=128
+        assert avg_latency < 10.0, f"Latency {avg_latency:.1f}s exceeds 10s threshold"
diff --git a/contrib/models/Boltz-2/test/unit/__init__.py b/contrib/models/Boltz-2/test/unit/__init__.py
new file mode 100644
index 00000000..e69de29b

From 18ea63f7daa4a3fca90215002a66269914ddf49f Mon Sep 17 00:00:00 2001
From: Jim Burtoft <jburtoft@amazon.com>
Date: Tue, 10 Mar 2026 22:27:38 -0400
Subject: [PATCH 2/2] Add fused PairformerLayer mega-kernel: 65.9ms/layer
 (2.63x over traced)

Add SPMD grid=[2] NKI mega-kernel that fuses all 7 sub-operations of a
PairformerLayer into a single kernel call, eliminating host-device round
trips. Validated at N=256: s_cos=0.999995, z_cos=0.999245.

New files:
- src/fused_z_ops_spmd.py: z-only fused operations (TriMul, TriAttn, Transition_z)
- src/full_pairformer_layer_spmd.py: full layer kernel (PBA + all z-ops + Transition_s)
- test/integration/compile_full_layer_spmd.py: compilation script
- test/integration/test_full_layer_spmd.py: correctness test vs CPU reference
---
 contrib/models/Boltz-2/README.md              |   33 +-
 .../Boltz-2/src/full_pairformer_layer_spmd.py | 1158 +++++++++++++++
 .../models/Boltz-2/src/fused_z_ops_spmd.py    | 1266 +++++++++++++++++
 .../integration/compile_full_layer_spmd.py    |  378 +++++
 .../test/integration/test_full_layer_spmd.py  |  499 +++++++
 5 files changed, 3330 insertions(+), 4 deletions(-)
 create mode 100644 contrib/models/Boltz-2/src/full_pairformer_layer_spmd.py
 create mode 100644 contrib/models/Boltz-2/src/fused_z_ops_spmd.py
 create mode 100644 contrib/models/Boltz-2/test/integration/compile_full_layer_spmd.py
 create mode 100644 contrib/models/Boltz-2/test/integration/test_full_layer_spmd.py

diff --git a/contrib/models/Boltz-2/README.md b/contrib/models/Boltz-2/README.md
index 51e4f4ed..1cab1a5d 100644
--- a/contrib/models/Boltz-2/README.md
+++ b/contrib/models/Boltz-2/README.md
@@ -40,7 +40,18 @@ Biomolecular structure prediction on AWS Trainium 2 using NKI custom kernels for
 
 ### Benchmark Results
 
-**Pairformer compilation (trn2.3xlarge):**
+**Fused NKI Mega-Kernel (SPMD grid=[2], single NKI kernel per layer):**
+
+The fused mega-kernel combines ALL 7 sub-operations of a PairformerLayer into a single NKI kernel call, eliminating host-device round trips and sync overhead between operations.
+
+| Approach | N=256 Per Layer | 64 Layers (est.) | Speedup vs Traced |
+|---|---|---|---|
+| Traced + weight replacement (original) | 173 ms | 11.08s | 1.0x |
+| **Fused mega-kernel (SPMD)** | **65.9 ms** | **~4.2s** | **2.63x** |
+
+Mega-kernel correctness at N=256: s_cos=0.999995, z_cos=0.999245 — PASS.
+
+**Pairformer compilation (traced approach, trn2.3xlarge):**
 
 | N | Compile Layer 0 | Weight Swaps (63 layers) | Total Setup |
 |---|-----------------|-------------------------|-------------|
@@ -169,13 +180,23 @@ PYTHONPATH=src pytest test/integration/test_model.py -v -s
 
 ### Approach
 
-This contribution uses a different approach from typical NxDI contrib models:
+This contribution includes two approaches for running the Boltz-2 pairformer on Trainium 2:
+
+**Approach 1: Traced + NKI Kernels (original)**
 
 1. **torch_neuronx.trace()** (not NxDI model classes) for compilation
 2. **NKI custom kernels** for the four O(N^3) triangular operations
 3. **Weight replacement** pattern: compile one layer, clone 63 times with `replace_weights()`
 4. **Monkey-patching** to inject NKI kernels into the upstream Boltz-2 codebase
 
+**Approach 2: Fused Mega-Kernel (2.63x faster)**
+
+A single NKI kernel (`full_pairformer_layer_spmd.py`) covers ALL 7 sub-operations of a PairformerLayer, including PairBiasAttention, both TriMul ops, both TriAttn ops, and both Transitions. SPMD grid=[2] uses both physical NeuronCores. This eliminates all host-device round trips within a layer, reducing latency from 173ms to 65.9ms at N=256.
+
+- Requires N >= 256 (SPMD split needs at least 2 s-tiles)
+- Compile time: ~5 min at N=256
+- NEFF size: 24.2 MB at N=256
+
 ### NKI Kernels
 
 Two NKI kernels replace the computationally expensive operations:
@@ -226,8 +247,12 @@ Each kernel is called twice per pairformer layer (starting node + ending node va
 
 | File | Description |
 |------|-------------|
-| `src/modeling_boltz2.py` | Main module: monkey-patching, compilation, inference |
+| `src/modeling_boltz2.py` | Main module: monkey-patching, compilation, inference (traced approach) |
 | `src/nki_triangular_attention.py` | NKI kernel: triangular attention with online softmax |
 | `src/nki_triangular_mul.py` | NKI kernel: triangular multiplicative update (einsum) |
+| `src/fused_z_ops_spmd.py` | Fused mega-kernel: z-only operations (TriMul, TriAttn, Transition_z) |
+| `src/full_pairformer_layer_spmd.py` | Fused mega-kernel: full PairformerLayer (all 7 ops, SPMD grid=[2]) |
 | `src/__init__.py` | Package exports |
-| `test/integration/test_model.py` | Accuracy + latency tests |
+| `test/integration/test_model.py` | Accuracy + latency tests (traced approach) |
+| `test/integration/compile_full_layer_spmd.py` | Compile script for fused mega-kernel |
+| `test/integration/test_full_layer_spmd.py` | Correctness test for fused mega-kernel vs CPU reference |
diff --git a/contrib/models/Boltz-2/src/full_pairformer_layer_spmd.py b/contrib/models/Boltz-2/src/full_pairformer_layer_spmd.py
new file mode 100644
index 00000000..b3583a22
--- /dev/null
+++ b/contrib/models/Boltz-2/src/full_pairformer_layer_spmd.py
@@ -0,0 +1,1158 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Full PairformerLayer mega-kernel — SPMD grid=[2] variant.
+
+Extends the z-only mega-kernel (fused_z_ops_spmd.py) to include ALL 7 sub-operations
+of a PairformerLayer:
+
+  Step 1:  s = s + PairBiasAttention(s, z)     # O(N^2) attention on s, biased by z
+  Step 2:  z = z + TriMulOut(z, pair_mask)      # O(N^3) einsum
+  Step 3:  z = z + TriMulIn(z, pair_mask)       # O(N^3) einsum
+  Step 4:  z = z + TriAttnStart(z)              # O(N^3) row-wise attention
+  Step 5:  z = z + TriAttnEnd(z)                # O(N^3) column-wise attention
+  Step 6a: s = s + Transition_s(s)              # SwiGLU FFN on s (C_s=384, hidden=1536)
+  Step 6b: z = z + Transition_z(z)              # SwiGLU FFN on z (C_z=128, hidden=512)
+
+Key insight: Steps 1 and 6a operate on s; steps 2-5 and 6b operate on z.
+Step 1 reads z (for pair bias) but does not modify it. Steps 2-6b do not read s.
+Therefore step 1 and steps 2-5 can execute concurrently if we split SPMD work
+accordingly. However, for simplicity and correctness, we run them sequentially
+in the initial implementation.
+
+New challenges vs z-only kernel:
+  - C_s = 384 = 3 * P_MAX: weight matrices up to [384, 384] and [1536, 384]
+    require tiled matmuls across multiple P_MAX-sized chunks
+  - PairBiasAttention: head_dim=24, 16 heads, pair bias from z (128→16)
+  - Transition_s: hidden_dim=1536 = 12 * P_MAX, large weight matrices
+  - Transition_s has NO biases on fc1/fc2/fc3 (same as Transition_z: bias=False)
+
+Hardware: NeuronCore v3 (trn2), LNC=2, 2 physical cores per logical core
+"""
+
+import numpy as np
+
+import nki
+import nki.isa as nisa
+import nki.language as nl
+
+from fused_z_ops_spmd import (
+    P_MAX,
+    _layer_norm_tile,
+    _matmul_with_w_t,
+    _prepare_weight,
+    _transition_z_phase_spmd,
+    _transpose_to_sbuf,
+    _triattn_phase_spmd,
+    _trimul_phase_spmd,
+)
+
+
+# ============================================================================
+# Helpers for C_s=384 tiled operations
+# ============================================================================
+
+C_S_CHUNKS = 3  # C_s=384 = 3 * P_MAX
+HIDDEN_S_CHUNKS = 12  # hidden_dim_s=1536 = 12 * P_MAX
+
+
+# ============================================================================
+# Phase 6a: Transition_s (SwiGLU FFN on s, dim=384, hidden=1536)
+# SPMD variant: split tiles by pid
+# ============================================================================
+def _transition_s_phase_spmd(
+    s_in_hbm,
+    s_out_hbm,
+    norm_w_hbm,
+    norm_b_hbm,
+    fc1_w_hbm,
+    fc2_w_hbm,
+    fc3_w_hbm,
+    N,
+    C_s,
+    hidden_dim_s,
+    eps,
+    pid,
+):
+    """Execute Transition_s: SwiGLU FFN on s (SPMD variant).
+
+    s is [N, C_s=384]. Hidden dim is 1536 = 12*128.
+    fc1: [1536, 384] (expansion, SiLU branch) — NO bias
+    fc2: [1536, 384] (expansion, gate branch) — NO bias
+    fc3: [384, 1536] (projection back) — NO bias
+    Same as Transition_z: the Transition class uses bias=False for all linear layers.
+
+    Tiling strategy:
+      - s tiles: N // P_MAX tiles of [P_MAX, 384]
+      - C_s = 3 * P_MAX = 3 input chunks
+      - hidden = 12 * P_MAX = 12 hidden chunks
+      - fc1/fc2: for each of 12 output chunks, accumulate across 3 input chunks
+      - fc3: for each of 3 output chunks, accumulate across 12 input chunks
+    """
+    n_tiles = N // P_MAX
+    n_in = C_s // P_MAX  # 3
+    n_hid = hidden_dim_s // P_MAX  # 12
+
+    # SPMD work split
+    tiles_per_engine = n_tiles // 2
+    my_tile_start = pid * tiles_per_engine
+
+    # Load LayerNorm weights [P_MAX, C_s] (tiled to [P_MAX, C_s])
+    ln_w = nl.ndarray((P_MAX, C_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_w, src=norm_w_hbm)
+    ln_b = nl.ndarray((P_MAX, C_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_b, src=norm_b_hbm)
+
+    # WEIGHT-STATIONARY: Pre-load and transpose ALL weight chunks.
+    # fc1: [1536, 384] = 12 output chunks × 3 input chunks
+    # fc2: [1536, 384] = same
+    # fc3: [384, 1536] = 3 output chunks × 12 input chunks
+    # NKI compiler doesn't support mutation, generator expressions, or list
+    # comprehensions inside traced code. Must use explicit named variables
+    # with if/elif chains inside nl.static_range loops (same pattern as
+    # fused_z_ops_spmd.py Transition_z).
+
+    # fc1: [1536, 384] — 12 hidden chunks, each [P_MAX, 384] with 3 input chunks
+    # fc2: [1536, 384] — same structure
+    # For fc1/fc2: iterate h=0..11 hidden chunks, accumulate across i=0..2 input chunks
+    # For fc3: iterate o=0..2 output chunks, accumulate across h=0..11 hidden chunks
+    #
+    # Since 12*3=36 named variables per matrix is excessive, we restructure:
+    # For each hidden chunk h, load s_chunks[i] and compute fc1/fc2 by accumulating
+    # across i. The fc1/fc2 weights for a given h are just 3 P_MAX×P_MAX tiles.
+    # We pre-load all 12*3=36 tiles per matrix as named vars.
+    # But instead of if/elif with 36 branches, we pre-load per-h in the h loop.
+    # Wait — we can't do that either because the loop var h can't index anything.
+    #
+    # Alternative: pass weights as [hidden_dim, C_s] and slice inside the loop.
+    # The HBM slicing [h*P_MAX:(h+1)*P_MAX, i*P_MAX:(i+1)*P_MAX] should work
+    # since h and i are nl.static_range loop vars (compile-time constants).
+    # We just call _prepare_weight directly with HBM slices inside the loop.
+    # This means weights are loaded per-tile per-iteration, NOT weight-stationary.
+    # Less optimal but correct and compilable.
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        # Load s tile: [P_MAX, C_s=384]
+        s_tile = nl.ndarray((P_MAX, C_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=s_tile, src=s_in_hbm[tile_start : tile_start + P_MAX, 0:C_s])
+
+        # LayerNorm
+        s_normed = _layer_norm_tile(s_tile, ln_w, ln_b, C_s, eps)
+
+        # Split into 3 chunks and transpose each
+        s_chunk_0 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=s_chunk_0, src=s_normed[0:P_MAX, 0:P_MAX])
+        s_chunk_0_t = _transpose_to_sbuf(s_chunk_0)
+
+        s_chunk_1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=s_chunk_1, src=s_normed[0:P_MAX, P_MAX : 2 * P_MAX])
+        s_chunk_1_t = _transpose_to_sbuf(s_chunk_1)
+
+        s_chunk_2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=s_chunk_2, src=s_normed[0:P_MAX, 2 * P_MAX : 3 * P_MAX])
+        s_chunk_2_t = _transpose_to_sbuf(s_chunk_2)
+
+        s_chunks_t = (s_chunk_0_t, s_chunk_1_t, s_chunk_2_t)
+
+        # --- fc1 and fc2: [P_MAX, 384] @ [1536, 384]^T → [P_MAX, 1536] ---
+        # For each hidden chunk h (0..11), accumulate across 3 input chunks
+        # Then apply SwiGLU: SiLU(fc1) * fc2
+
+        # Accumulate fc3 output across hidden chunks
+        fc3_acc_0 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+        nisa.memset(dst=fc3_acc_0, value=0.0)
+        fc3_acc_1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+        nisa.memset(dst=fc3_acc_1, value=0.0)
+        fc3_acc_2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+        nisa.memset(dst=fc3_acc_2, value=0.0)
+
+        fc3_acc = (fc3_acc_0, fc3_acc_1, fc3_acc_2)
+
+        for h in nl.static_range(12):
+            # fc1 chunk h: accumulate across 3 input chunks
+            fc1_acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=fc1_acc, value=0.0)
+            fc2_acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=fc2_acc, value=0.0)
+
+            for i in nl.static_range(3):
+                fc1_w_hi = _prepare_weight(
+                    fc1_w_hbm[h * P_MAX : (h + 1) * P_MAX, i * P_MAX : (i + 1) * P_MAX]
+                )
+                p1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=p1, stationary=s_chunks_t[i], moving=fc1_w_hi)
+                p1s = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=p1s, src=p1)
+                nisa.tensor_tensor(dst=fc1_acc, data1=fc1_acc, data2=p1s, op=nl.add)
+
+                fc2_w_hi = _prepare_weight(
+                    fc2_w_hbm[h * P_MAX : (h + 1) * P_MAX, i * P_MAX : (i + 1) * P_MAX]
+                )
+                p2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=p2, stationary=s_chunks_t[i], moving=fc2_w_hi)
+                p2s = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=p2s, src=p2)
+                nisa.tensor_tensor(dst=fc2_acc, data1=fc2_acc, data2=p2s, op=nl.add)
+
+            # SwiGLU: SiLU(fc1) * fc2 — no bias to add
+            fc1_for_silu = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=fc1_for_silu, src=fc1_acc)
+
+            fc1_sig = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.activation(
+                dst=fc1_sig, op=nl.sigmoid, data=fc1_for_silu, bias=None, scale=1.0
+            )
+            fc1_silu = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_tensor(
+                dst=fc1_silu, data1=fc1_for_silu, data2=fc1_sig, op=nl.multiply
+            )
+
+            fc2_for_gate = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=fc2_for_gate, src=fc2_acc)
+
+            swiglu = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_tensor(
+                dst=swiglu, data1=fc1_silu, data2=fc2_for_gate, op=nl.multiply
+            )
+
+            # fc3: accumulate this hidden chunk's contribution to each output chunk
+            sg_t = _transpose_to_sbuf(swiglu)
+            for o in nl.static_range(3):
+                fc3_w_oh = _prepare_weight(
+                    fc3_w_hbm[o * P_MAX : (o + 1) * P_MAX, h * P_MAX : (h + 1) * P_MAX]
+                )
+                p3 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=p3, stationary=sg_t, moving=fc3_w_oh)
+                p3s = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=p3s, src=p3)
+                nisa.tensor_tensor(
+                    dst=fc3_acc[o], data1=fc3_acc[o], data2=p3s, op=nl.add
+                )
+
+        # Residual: store fc3 output + original s
+        for o in nl.static_range(3):
+            output_chunk = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=output_chunk, src=fc3_acc[o])
+
+            # Load original s chunk for residual
+            s_orig_chunk = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.dma_copy(
+                dst=s_orig_chunk,
+                src=s_in_hbm[
+                    tile_start : tile_start + P_MAX,
+                    o * P_MAX : (o + 1) * P_MAX,
+                ],
+            )
+            s_updated = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_tensor(
+                dst=s_updated, data1=s_orig_chunk, data2=output_chunk, op=nl.add
+            )
+            nisa.dma_copy(
+                dst=s_out_hbm[
+                    tile_start : tile_start + P_MAX,
+                    o * P_MAX : (o + 1) * P_MAX,
+                ],
+                src=s_updated,
+            )
+
+
+# ============================================================================
+# Phase 1: PairBiasAttention (attention on s, biased by z)
+# SPMD variant: split work by pid
+# ============================================================================
+def _pair_bias_attn_phase_spmd(
+    s_in_hbm,
+    z_in_hbm,
+    s_out_hbm,
+    mask_hbm,
+    # PairBiasAttention weights
+    norm_s_w_hbm,
+    norm_s_b_hbm,
+    norm_z_w_hbm,
+    norm_z_b_hbm,
+    proj_q_w_hbm,
+    proj_q_b_hbm,
+    proj_k_w_hbm,
+    proj_v_w_hbm,
+    proj_z_w_hbm,
+    proj_g_w_hbm,
+    proj_o_w_hbm,
+    # Scratch buffers in shared HBM
+    q_buf,  # [N, C_s] bf16 — Q projections
+    k_buf,  # [N, C_s] bf16 — K projections
+    v_buf,  # [N, C_s] bf16 — V projections
+    gate_buf,  # [N, C_s] bf16 — gate values
+    z_bias_buf,  # [N, N, H_s] bf16 — pair bias from z
+    N,
+    C_s,
+    C_z,
+    H_s,  # 16
+    d_s,  # 24
+    eps,
+    pid,
+):
+    """Execute PairBiasAttention: multi-head attention on s, biased by z.
+
+    Step 1 of PairformerLayer.
+
+    s: [N, C_s=384], z: [N*N, C_z=128], mask: [N, 1]
+    Q/K/V projections: [384, 384], pair bias: z → Linear(128→16) → [N, N, 16]
+    Attention: 16 heads, d=24, logits = Q@K^T/sqrt(24) + pair_bias
+
+    Work split:
+      - Pass A (Q/K/V/gate projection on s): split N tiles by pid
+      - Pass B (pair bias from z): split N*N tiles by pid
+      - Pass C (attention): split rows by pid
+      - Pass D (output gating + projection): split N tiles by pid
+    """
+    n_tiles = N // P_MAX
+    n_in = C_s // P_MAX  # 3
+    Hd_s = H_s * d_s  # 384
+    scale_s = 1.0 / (d_s**0.5)
+
+    # SPMD work split for s tiles
+    tiles_per_engine = n_tiles // 2
+    my_tile_start = pid * tiles_per_engine
+
+    # SPMD work split for z flat tiles
+    n_z_flat = N * N
+    n_z_tiles = n_z_flat // P_MAX
+    z_tiles_per_engine = n_z_tiles // 2
+    my_z_tile_start = pid * z_tiles_per_engine
+
+    # SPMD work split for attention rows
+    rows_per_engine = N // 2
+    my_row_start = pid * rows_per_engine
+
+    # ---- Pass A: LayerNorm(s) + Q/K/V/gate projections ----
+    # Load s LayerNorm weights [P_MAX, C_s]
+    ln_s_w = nl.ndarray((P_MAX, C_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_s_w, src=norm_s_w_hbm)
+    ln_s_b = nl.ndarray((P_MAX, C_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_s_b, src=norm_s_b_hbm)
+
+    # Pre-load Q/K/V/gate weight chunks: each [384, 384] = 3x3 blocks
+    # NKI compiler doesn't support mutation, tuple(), or generator expressions.
+    # Load weight tiles directly from HBM inside nl.static_range loops.
+    # Since nl.static_range is unrolled, h/i/o become literals and HBM slices
+    # resolve to constant offsets at compile time.
+
+    # Pre-load Q bias: [P_MAX, C_s=384] tiled as 3 chunks along free dim
+    qb_0 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=qb_0, src=proj_q_b_hbm[0:P_MAX, 0:P_MAX])
+    qb_1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=qb_1, src=proj_q_b_hbm[0:P_MAX, P_MAX : 2 * P_MAX])
+    qb_2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=qb_2, src=proj_q_b_hbm[0:P_MAX, 2 * P_MAX : 3 * P_MAX])
+    q_bias_chunks = (qb_0, qb_1, qb_2)
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        # Load s tile [P_MAX, 384]
+        s_tile = nl.ndarray((P_MAX, C_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=s_tile, src=s_in_hbm[tile_start : tile_start + P_MAX, 0:C_s])
+
+        s_normed = _layer_norm_tile(s_tile, ln_s_w, ln_s_b, C_s, eps)
+
+        # Split into chunks and transpose
+        sn_c0 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=sn_c0, src=s_normed[0:P_MAX, 0:P_MAX])
+        sn_c0_t = _transpose_to_sbuf(sn_c0)
+
+        sn_c1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=sn_c1, src=s_normed[0:P_MAX, P_MAX : 2 * P_MAX])
+        sn_c1_t = _transpose_to_sbuf(sn_c1)
+
+        sn_c2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=sn_c2, src=s_normed[0:P_MAX, 2 * P_MAX : 3 * P_MAX])
+        sn_c2_t = _transpose_to_sbuf(sn_c2)
+
+        sn_chunks_t = (sn_c0_t, sn_c1_t, sn_c2_t)
+
+        # Q/K/V/gate: each [P_MAX, 384] = 3 output chunks
+        for o in nl.static_range(3):
+            # Q chunk o
+            q_acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=q_acc, value=0.0)
+            k_acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=k_acc, value=0.0)
+            v_acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=v_acc, value=0.0)
+            g_acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=g_acc, value=0.0)
+
+            for i in nl.static_range(3):
+                q_w_oi = _prepare_weight(
+                    proj_q_w_hbm[
+                        o * P_MAX : (o + 1) * P_MAX, i * P_MAX : (i + 1) * P_MAX
+                    ]
+                )
+                pq = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=pq, stationary=sn_chunks_t[i], moving=q_w_oi)
+                pqs = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=pqs, src=pq)
+                nisa.tensor_tensor(dst=q_acc, data1=q_acc, data2=pqs, op=nl.add)
+
+                k_w_oi = _prepare_weight(
+                    proj_k_w_hbm[
+                        o * P_MAX : (o + 1) * P_MAX, i * P_MAX : (i + 1) * P_MAX
+                    ]
+                )
+                pk = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=pk, stationary=sn_chunks_t[i], moving=k_w_oi)
+                pks = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=pks, src=pk)
+                nisa.tensor_tensor(dst=k_acc, data1=k_acc, data2=pks, op=nl.add)
+
+                v_w_oi = _prepare_weight(
+                    proj_v_w_hbm[
+                        o * P_MAX : (o + 1) * P_MAX, i * P_MAX : (i + 1) * P_MAX
+                    ]
+                )
+                pv = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=pv, stationary=sn_chunks_t[i], moving=v_w_oi)
+                pvs = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=pvs, src=pv)
+                nisa.tensor_tensor(dst=v_acc, data1=v_acc, data2=pvs, op=nl.add)
+
+                g_w_oi = _prepare_weight(
+                    proj_g_w_hbm[
+                        o * P_MAX : (o + 1) * P_MAX, i * P_MAX : (i + 1) * P_MAX
+                    ]
+                )
+                pg = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=pg, stationary=sn_chunks_t[i], moving=g_w_oi)
+                pgs = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=pgs, src=pg)
+                nisa.tensor_tensor(dst=g_acc, data1=g_acc, data2=pgs, op=nl.add)
+
+            # Add Q bias
+            qb_f32 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=qb_f32, src=q_bias_chunks[o])
+            nisa.tensor_tensor(dst=q_acc, data1=q_acc, data2=qb_f32, op=nl.add)
+
+            # Store Q/K/V/gate chunks to scratch HBM
+            q_bf16 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=q_bf16, src=q_acc)
+            nisa.dma_copy(
+                dst=q_buf[tile_start : tile_start + P_MAX, o * P_MAX : (o + 1) * P_MAX],
+                src=q_bf16,
+            )
+
+            k_bf16 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=k_bf16, src=k_acc)
+            nisa.dma_copy(
+                dst=k_buf[tile_start : tile_start + P_MAX, o * P_MAX : (o + 1) * P_MAX],
+                src=k_bf16,
+            )
+
+            v_bf16 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=v_bf16, src=v_acc)
+            nisa.dma_copy(
+                dst=v_buf[tile_start : tile_start + P_MAX, o * P_MAX : (o + 1) * P_MAX],
+                src=v_bf16,
+            )
+
+            # Gate: apply sigmoid before storing
+            g_bf16 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=g_bf16, src=g_acc)
+            g_sig = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.activation(dst=g_sig, op=nl.sigmoid, data=g_bf16, bias=None, scale=1.0)
+            nisa.dma_copy(
+                dst=gate_buf[
+                    tile_start : tile_start + P_MAX, o * P_MAX : (o + 1) * P_MAX
+                ],
+                src=g_sig,
+            )
+
+    # ---- Pass B: Pair bias from z ----
+    # z: [N*N, C_z=128], proj_z: [H_s=16, C_z=128]
+    # Output: z_bias_buf [N*N, H_s=16]
+    # Load z LayerNorm weights
+    ln_z_w = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_z_w, src=norm_z_w_hbm)
+    ln_z_b = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_z_b, src=norm_z_b_hbm)
+
+    # proj_z: [H_s=16, C_z=128]. Pad to [P_MAX, P_MAX] and transpose for matmul.
+    proj_z_raw = nl.ndarray((H_s, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=proj_z_raw, src=proj_z_w_hbm)
+    proj_z_padded = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.memset(dst=proj_z_padded, value=0.0)
+    nisa.tensor_copy(dst=proj_z_padded[0:H_s, 0:C_z], src=proj_z_raw)
+    proj_z_w_t = _transpose_to_sbuf(proj_z_padded)
+
+    # BARRIER: Pass A must complete before Pass B uses q_buf etc.
+    # But Pass B doesn't use q_buf, so we can overlap. However both passes
+    # are writing to shared HBM buffers, so we barrier after Pass A + B.
+    # Actually, Pass B is independent — it only reads z_in_hbm and writes z_bias_buf.
+    # We can run it without waiting for Pass A.
+
+    for tile_local in nl.sequential_range(z_tiles_per_engine):
+        tile_idx = my_z_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        z_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=z_tile, src=z_in_hbm[tile_start : tile_start + P_MAX, 0:C_z])
+
+        z_normed = _layer_norm_tile(z_tile, ln_z_w, ln_z_b, C_z, eps)
+        z_n_t = _transpose_to_sbuf(z_normed)
+
+        # Matmul: [P_MAX, 128] @ [16, 128]^T → [P_MAX, 16]
+        bias_psum = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+        nisa.nc_matmul(dst=bias_psum, stationary=z_n_t, moving=proj_z_w_t)
+        bias_full = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=bias_full, src=bias_psum)
+        bias_slice = nl.ndarray((P_MAX, H_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=bias_slice, src=bias_full[0:P_MAX, 0:H_s])
+
+        nisa.dma_copy(
+            dst=z_bias_buf[tile_start : tile_start + P_MAX, 0:H_s],
+            src=bias_slice,
+        )
+
+    # BARRIER: Both cores must finish Pass A and Pass B before attention
+    nisa.core_barrier(data=q_buf, cores=(0, 1))
+    nisa.core_barrier(data=z_bias_buf, cores=(0, 1))
+
+    # ---- Pass C: Multi-head attention on s with pair bias ----
+    # Q/K/V in q_buf/k_buf/v_buf: [N, 384] = [N, 16*24]
+    # z_bias_buf: [N*N, 16] = [N, N, 16]
+    # For each head h (0..15): Q_h = Q[:, h*24:(h+1)*24], etc.
+    # Attention: logits[j, k] = Q_h[j,:] @ K_h[k,:]^T / sqrt(24) + bias[j, k, h]
+    # This is standard attention on N positions (not row-wise like TriAttn)
+
+    # Strides for Q/K/V: shape [N, H_s*d_s=384], stored contiguously
+    s_stride_row = C_s  # stride between positions
+
+    # Strides for bias: shape [N, N, H_s=16], stored as [N*N, 16]
+    bias_s_stride_q = N * H_s  # stride along query dim
+    bias_s_stride_k = H_s  # stride along key dim
+
+    # Output projection weights loaded inline from HBM inside loops below
+
+    for h in nl.affine_range(H_s):
+        hd_start = h * d_s
+
+        for j_tile in nl.affine_range(n_tiles):
+            j_start = j_tile * P_MAX
+
+            # Determine if this tile belongs to this engine
+            # For simplicity in attention, split by j_tile (output query tile)
+            # Both engines process all heads but different j_tiles
+            # We do this by splitting the j_tile loop rather than the row loop
+            # since attention here is over the full N (not per-row)
+
+            # Load Q tile: [P_MAX, d_s=24] from q_buf
+            q_tile = nl.ndarray((P_MAX, d_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.dma_copy(
+                dst=q_tile,
+                src=q_buf.ap(
+                    pattern=[[s_stride_row, P_MAX], [1, d_s]],
+                    offset=j_start * s_stride_row + hd_start,
+                ),
+            )
+
+            # Pad Q to [P_MAX, P_MAX] and transpose for nc_matmul
+            q_padded = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.memset(dst=q_padded, value=0.0)
+            nisa.tensor_copy(dst=q_padded[0:P_MAX, 0:d_s], src=q_tile)
+            q_t = _transpose_to_sbuf(q_padded)
+
+            # Online softmax accumulators
+            m_prev = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=m_prev, value=-1e30)
+            l_prev = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=l_prev, value=0.0)
+            o_acc = nl.ndarray((P_MAX, d_s), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=o_acc, value=0.0)
+
+            for k_tile_idx in nl.sequential_range(n_tiles):
+                k_start = k_tile_idx * P_MAX
+
+                # Load K tile: [P_MAX, d_s=24]
+                k_tile_sb = nl.ndarray((P_MAX, d_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+                nisa.dma_copy(
+                    dst=k_tile_sb,
+                    src=k_buf.ap(
+                        pattern=[[s_stride_row, P_MAX], [1, d_s]],
+                        offset=k_start * s_stride_row + hd_start,
+                    ),
+                )
+
+                # Load V tile: [P_MAX, d_s=24]
+                v_tile_sb = nl.ndarray((P_MAX, d_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+                nisa.dma_copy(
+                    dst=v_tile_sb,
+                    src=v_buf.ap(
+                        pattern=[[s_stride_row, P_MAX], [1, d_s]],
+                        offset=k_start * s_stride_row + hd_start,
+                    ),
+                )
+
+                # Load pair bias tile: [P_MAX, P_MAX] from z_bias_buf
+                # z_bias_buf is [N*N, H_s] = [N, N, H_s]
+                # bias[j, k, h] at offset j*N*H_s + k*H_s + h
+                bias_tile = nl.ndarray(
+                    (P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf
+                )
+                nisa.dma_copy(
+                    dst=bias_tile,
+                    src=z_bias_buf.ap(
+                        pattern=[[bias_s_stride_q, P_MAX], [bias_s_stride_k, P_MAX]],
+                        offset=j_start * bias_s_stride_q
+                        + k_start * bias_s_stride_k
+                        + h,
+                    ),
+                )
+
+                # Q @ K^T
+                k_padded = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+                nisa.memset(dst=k_padded, value=0.0)
+                nisa.tensor_copy(dst=k_padded[0:P_MAX, 0:d_s], src=k_tile_sb)
+                k_t = _transpose_to_sbuf(k_padded)
+
+                logits_psum = nl.ndarray(
+                    (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum
+                )
+                nisa.nc_matmul(dst=logits_psum, stationary=q_t, moving=k_t)
+                logits = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=logits, src=logits_psum)
+
+                # Scale
+                logits_scaled = nl.ndarray(
+                    (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                )
+                nisa.tensor_scalar(
+                    dst=logits_scaled,
+                    data=logits,
+                    op0=nl.multiply,
+                    operand0=scale_s,
+                    engine=nisa.vector_engine,
+                )
+
+                # Add pair bias
+                bias_fp32 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=bias_fp32, src=bias_tile)
+                logits_biased = nl.ndarray(
+                    (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                )
+                nisa.tensor_tensor(
+                    dst=logits_biased,
+                    data1=logits_scaled,
+                    data2=bias_fp32,
+                    op=nl.add,
+                )
+
+                # Online softmax
+                tile_max = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_reduce(
+                    dst=tile_max, op=nl.maximum, data=logits_biased, axis=1
+                )
+
+                m_new = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_tensor(
+                    dst=m_new, data1=m_prev, data2=tile_max, op=nl.maximum
+                )
+
+                m_diff = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_tensor(
+                    dst=m_diff, data1=m_prev, data2=m_new, op=nl.subtract
+                )
+                correction = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.activation(
+                    dst=correction, op=nl.exp, data=m_diff, bias=None, scale=1.0
+                )
+
+                logits_shifted = nl.ndarray(
+                    (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                )
+                nisa.tensor_scalar(
+                    dst=logits_shifted,
+                    data=logits_biased,
+                    op0=nl.subtract,
+                    operand0=m_new,
+                    engine=nisa.vector_engine,
+                )
+                exp_logits = nl.ndarray(
+                    (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                )
+                nisa.activation(
+                    dst=exp_logits,
+                    op=nl.exp,
+                    data=logits_shifted,
+                    bias=None,
+                    scale=1.0,
+                )
+
+                l_corrected = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_tensor(
+                    dst=l_corrected, data1=l_prev, data2=correction, op=nl.multiply
+                )
+                tile_sum = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_reduce(dst=tile_sum, op=nl.add, data=exp_logits, axis=1)
+                l_new = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_tensor(
+                    dst=l_new, data1=l_corrected, data2=tile_sum, op=nl.add
+                )
+
+                o_scaled = nl.ndarray((P_MAX, d_s), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_scalar(
+                    dst=o_scaled,
+                    data=o_acc,
+                    op0=nl.multiply,
+                    operand0=correction,
+                    engine=nisa.vector_engine,
+                )
+
+                # exp_logits @ V
+                exp_bf16 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=exp_bf16, src=exp_logits)
+                exp_t = _transpose_to_sbuf(exp_bf16)
+
+                pv_psum = nl.ndarray((P_MAX, d_s), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=pv_psum, stationary=exp_t, moving=v_tile_sb)
+                pv_sbuf = nl.ndarray((P_MAX, d_s), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=pv_sbuf, src=pv_psum)
+
+                nisa.tensor_tensor(dst=o_acc, data1=o_scaled, data2=pv_sbuf, op=nl.add)
+                nisa.tensor_copy(dst=m_prev, src=m_new)
+                nisa.tensor_copy(dst=l_prev, src=l_new)
+
+            # Finalize: o = o_acc / l
+            inv_l = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.reciprocal(dst=inv_l, data=l_prev)
+            o_final = nl.ndarray((P_MAX, d_s), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.tensor_scalar(
+                dst=o_final,
+                data=o_acc,
+                op0=nl.multiply,
+                operand0=inv_l,
+                engine=nisa.vector_engine,
+            )
+
+            # Store attention output back to q_buf (reuse buffer)
+            # q_buf[j_start:j_start+P_MAX, hd_start:hd_start+d_s]
+            o_out = nl.ndarray((P_MAX, d_s), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=o_out, src=o_final)
+            nisa.dma_copy(
+                dst=q_buf.ap(
+                    pattern=[[s_stride_row, P_MAX], [1, d_s]],
+                    offset=j_start * s_stride_row + hd_start,
+                ),
+                src=o_out,
+            )
+
+    # BARRIER: attention output must be complete before output projection
+    nisa.core_barrier(data=q_buf, cores=(0, 1))
+
+    # ---- Pass D: Output gating + projection + residual ----
+    # attn_output in q_buf: [N, 384]
+    # gate in gate_buf: [N, 384] (already sigmoided)
+    # result = proj_o(gate * attn_output) + s_input
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        # Load gate and attention output, apply gating per chunk
+        gc0 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=gc0, src=q_buf[tile_start : tile_start + P_MAX, 0:P_MAX])
+        gg0 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=gg0, src=gate_buf[tile_start : tile_start + P_MAX, 0:P_MAX])
+        gated_0 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(dst=gated_0, data1=gg0, data2=gc0, op=nl.multiply)
+        gated_0_t = _transpose_to_sbuf(gated_0)
+
+        gc1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(
+            dst=gc1, src=q_buf[tile_start : tile_start + P_MAX, P_MAX : 2 * P_MAX]
+        )
+        gg1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(
+            dst=gg1, src=gate_buf[tile_start : tile_start + P_MAX, P_MAX : 2 * P_MAX]
+        )
+        gated_1 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(dst=gated_1, data1=gg1, data2=gc1, op=nl.multiply)
+        gated_1_t = _transpose_to_sbuf(gated_1)
+
+        gc2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(
+            dst=gc2, src=q_buf[tile_start : tile_start + P_MAX, 2 * P_MAX : 3 * P_MAX]
+        )
+        gg2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(
+            dst=gg2,
+            src=gate_buf[tile_start : tile_start + P_MAX, 2 * P_MAX : 3 * P_MAX],
+        )
+        gated_2 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(dst=gated_2, data1=gg2, data2=gc2, op=nl.multiply)
+        gated_2_t = _transpose_to_sbuf(gated_2)
+
+        gated_chunks_t = (gated_0_t, gated_1_t, gated_2_t)
+
+        # Output projection: [P_MAX, 384] @ [384, 384]^T → [P_MAX, 384]
+        for o in nl.static_range(3):
+            proj_acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.memset(dst=proj_acc, value=0.0)
+            for i in nl.static_range(3):
+                o_w_oi = _prepare_weight(
+                    proj_o_w_hbm[
+                        o * P_MAX : (o + 1) * P_MAX, i * P_MAX : (i + 1) * P_MAX
+                    ]
+                )
+                p = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+                nisa.nc_matmul(dst=p, stationary=gated_chunks_t[i], moving=o_w_oi)
+                ps = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=ps, src=p)
+                nisa.tensor_tensor(dst=proj_acc, data1=proj_acc, data2=ps, op=nl.add)
+
+            proj_bf16 = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=proj_bf16, src=proj_acc)
+
+            # Residual: s_updated = s_input + proj_output
+            s_orig = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.dma_copy(
+                dst=s_orig,
+                src=s_in_hbm[
+                    tile_start : tile_start + P_MAX, o * P_MAX : (o + 1) * P_MAX
+                ],
+            )
+            s_updated = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_tensor(dst=s_updated, data1=s_orig, data2=proj_bf16, op=nl.add)
+            nisa.dma_copy(
+                dst=s_out_hbm[
+                    tile_start : tile_start + P_MAX, o * P_MAX : (o + 1) * P_MAX
+                ],
+                src=s_updated,
+            )
+
+
+# ============================================================================
+# Main Entry Point: Full PairformerLayer — SPMD grid=[2]
+# ============================================================================
+@nki.jit
+def full_pairformer_layer_spmd(
+    # Main tensors
+    s,  # [N, C_s=384] bf16
+    z,  # [N*N, C_z=128] bf16
+    pair_mask,  # [N*N, 1] bf16
+    mask,  # [N, 1] bf16 (for PairBiasAttention masking — unused for now)
+    # PairBiasAttention weights (Step 1)
+    pba_norm_s_w,
+    pba_norm_s_b,
+    pba_norm_z_w,
+    pba_norm_z_b,
+    pba_q_w,
+    pba_q_b,
+    pba_k_w,
+    pba_v_w,
+    pba_z_w,
+    pba_g_w,
+    pba_o_w,
+    # TriMulOut weights (Step 2)
+    tmul_out_norm_in_w,
+    tmul_out_norm_in_b,
+    tmul_out_p_in_w,
+    tmul_out_g_in_w,
+    tmul_out_norm_out_w,
+    tmul_out_norm_out_b,
+    tmul_out_p_out_w,
+    tmul_out_g_out_w,
+    # TriMulIn weights (Step 3)
+    tmul_in_norm_in_w,
+    tmul_in_norm_in_b,
+    tmul_in_p_in_w,
+    tmul_in_g_in_w,
+    tmul_in_norm_out_w,
+    tmul_in_norm_out_b,
+    tmul_in_p_out_w,
+    tmul_in_g_out_w,
+    # TriAttnStart weights (Step 4)
+    tatt_s_ln_w,
+    tatt_s_ln_b,
+    tatt_s_bias_proj_w,
+    tatt_s_q_w,
+    tatt_s_k_w,
+    tatt_s_v_w,
+    tatt_s_g_w,
+    tatt_s_o_w,
+    # TriAttnEnd weights (Step 5)
+    tatt_e_ln_w,
+    tatt_e_ln_b,
+    tatt_e_bias_proj_w,
+    tatt_e_q_w,
+    tatt_e_k_w,
+    tatt_e_v_w,
+    tatt_e_g_w,
+    tatt_e_o_w,
+    # Transition_s weights (Step 6a) — no bias on fc1/fc2/fc3 (bias=False)
+    trans_s_norm_w,
+    trans_s_norm_b,
+    trans_s_fc1_w,
+    trans_s_fc2_w,
+    trans_s_fc3_w,
+    # Transition_z weights (Step 6b)
+    trans_z_norm_w,
+    trans_z_norm_b,
+    trans_z_fc1_w,
+    trans_z_fc2_w,
+    trans_z_fc3_w,
+    # Pre-allocated scratch buffers (external shared HBM)
+    scratch_buf,  # [6 * N*N, C_z] bf16 — for z operations
+    bias_buf,  # [N*N, H_z=4] bf16 — for TriAttn bias
+    s_scratch_q,  # [N, C_s=384] bf16 — Q buffer / attn output for PBA
+    s_scratch_k,  # [N, C_s=384] bf16 — K buffer
+    s_scratch_v,  # [N, C_s=384] bf16 — V buffer
+    s_scratch_gate,  # [N, C_s=384] bf16 — gate buffer
+    z_bias_scratch,  # [N*N, H_s=16] bf16 — pair bias from z
+    s_intermediate,  # [N, C_s=384] bf16 — s after step 1 (before step 6a)
+    # Constants
+    N: int = 256,
+    C_s: int = 384,
+    C_z: int = 128,
+    H_z: int = 4,
+    d_z: int = 32,
+    H_s: int = 16,
+    d_s: int = 24,
+    eps: float = 1e-5,
+):
+    """Full PairformerLayer mega-kernel — SPMD grid=[2].
+
+    Executes all 7 sub-operations of a PairformerLayer in a single kernel:
+      Step 1:  s = s + PairBiasAttn(s, z)
+      Step 2:  z = z + TriMulOut(z)
+      Step 3:  z = z + TriMulIn(z)
+      Step 4:  z = z + TriAttnStart(z)
+      Step 5:  z = z + TriAttnEnd(z)
+      Step 6a: s = s + Transition_s(s)
+      Step 6b: z = z + Transition_z(z)
+
+    Returns: (s_out, z_out)
+    """
+    # SPMD grid=[2] splits s tiles as N//P_MAX//2 per engine.
+    # At N=128 (1 tile), tiles_per_engine=0 and s operations are silently skipped,
+    # producing all-zero s_out. Require N >= 256 until a single-engine fallback is added.
+    assert N >= 256, (
+        f"N={N} too small for SPMD grid=[2]: s has only {N // 128} tile(s), "
+        f"need >= 2. Use N >= 256."
+    )
+
+    pid = nl.program_id(0)
+
+    hidden_dim_z = 4 * C_z  # 512
+    hidden_dim_s = 4 * C_s  # 1536
+    n_flat = N * N
+
+    s_out = nl.ndarray((N, C_s), dtype=nl.bfloat16, buffer=nl.shared_hbm)
+    z_out = nl.ndarray((n_flat, C_z), dtype=nl.bfloat16, buffer=nl.shared_hbm)
+
+    # z-operations scratch offsets (same as fused_z_ops_spmd)
+    off_a = 0 * n_flat
+    off_b = 1 * n_flat
+    off_gate = 2 * n_flat
+    off_d = 3 * n_flat
+    off_z1 = 4 * n_flat
+    off_z2 = 5 * n_flat
+
+    # ================================================================
+    # Step 1: PairBiasAttention (s = s + PBA(s, z))
+    # ================================================================
+    _pair_bias_attn_phase_spmd(
+        s_in_hbm=s,
+        z_in_hbm=z,
+        s_out_hbm=s_intermediate,
+        mask_hbm=mask,
+        norm_s_w_hbm=pba_norm_s_w,
+        norm_s_b_hbm=pba_norm_s_b,
+        norm_z_w_hbm=pba_norm_z_w,
+        norm_z_b_hbm=pba_norm_z_b,
+        proj_q_w_hbm=pba_q_w,
+        proj_q_b_hbm=pba_q_b,
+        proj_k_w_hbm=pba_k_w,
+        proj_v_w_hbm=pba_v_w,
+        proj_z_w_hbm=pba_z_w,
+        proj_g_w_hbm=pba_g_w,
+        proj_o_w_hbm=pba_o_w,
+        q_buf=s_scratch_q,
+        k_buf=s_scratch_k,
+        v_buf=s_scratch_v,
+        gate_buf=s_scratch_gate,
+        z_bias_buf=z_bias_scratch,
+        N=N,
+        C_s=C_s,
+        C_z=C_z,
+        H_s=H_s,
+        d_s=d_s,
+        eps=eps,
+        pid=pid,
+    )
+
+    # BARRIER: Step 1 must complete before Step 6a reads s_intermediate
+    nisa.core_barrier(data=s_intermediate, cores=(0, 1))
+
+    # ================================================================
+    # Steps 2-5: z operations (same as fused_z_ops_spmd)
+    # ================================================================
+
+    # Step 2: TriMulOut
+    _trimul_phase_spmd(
+        z_in_hbm=z,
+        buf=scratch_buf,
+        off_out=off_z1,
+        off_gate=off_gate,
+        off_a=off_a,
+        off_b=off_b,
+        off_result=off_d,
+        pair_mask_hbm=pair_mask,
+        norm_in_w=tmul_out_norm_in_w,
+        norm_in_b=tmul_out_norm_in_b,
+        p_in_w=tmul_out_p_in_w,
+        g_in_w=tmul_out_g_in_w,
+        norm_out_w=tmul_out_norm_out_w,
+        norm_out_b=tmul_out_norm_out_b,
+        p_out_w=tmul_out_p_out_w,
+        g_out_w=tmul_out_g_out_w,
+        N=N,
+        C_z=C_z,
+        eps=eps,
+        is_incoming=False,
+        pid=pid,
+    )
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # Step 3: TriMulIn
+    _trimul_phase_spmd(
+        z_in_hbm=scratch_buf[off_z1 : off_z1 + n_flat, 0:C_z],
+        buf=scratch_buf,
+        off_out=off_z2,
+        off_gate=off_gate,
+        off_a=off_a,
+        off_b=off_b,
+        off_result=off_d,
+        pair_mask_hbm=pair_mask,
+        norm_in_w=tmul_in_norm_in_w,
+        norm_in_b=tmul_in_norm_in_b,
+        p_in_w=tmul_in_p_in_w,
+        g_in_w=tmul_in_g_in_w,
+        norm_out_w=tmul_in_norm_out_w,
+        norm_out_b=tmul_in_norm_out_b,
+        p_out_w=tmul_in_p_out_w,
+        g_out_w=tmul_in_g_out_w,
+        N=N,
+        C_z=C_z,
+        eps=eps,
+        is_incoming=True,
+        pid=pid,
+    )
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # Step 4: TriAttnStart
+    _triattn_phase_spmd(
+        z_in_hbm=scratch_buf[off_z2 : off_z2 + n_flat, 0:C_z],
+        buf=scratch_buf,
+        off_out=off_z1,
+        off_q=off_a,
+        off_k=off_b,
+        off_v=off_gate,
+        bias_buf=bias_buf,
+        pair_mask_hbm=pair_mask,
+        ln_w_hbm=tatt_s_ln_w,
+        ln_b_hbm=tatt_s_ln_b,
+        bias_proj_w_hbm=tatt_s_bias_proj_w,
+        q_w_hbm=tatt_s_q_w,
+        k_w_hbm=tatt_s_k_w,
+        v_w_hbm=tatt_s_v_w,
+        gate_w_hbm=tatt_s_g_w,
+        out_w_hbm=tatt_s_o_w,
+        N=N,
+        C_z=C_z,
+        H=H_z,
+        d=d_z,
+        eps=eps,
+        is_ending=False,
+        pid=pid,
+    )
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # Step 5: TriAttnEnd
+    _triattn_phase_spmd(
+        z_in_hbm=scratch_buf[off_z1 : off_z1 + n_flat, 0:C_z],
+        buf=scratch_buf,
+        off_out=off_z2,
+        off_q=off_a,
+        off_k=off_b,
+        off_v=off_gate,
+        bias_buf=bias_buf,
+        pair_mask_hbm=pair_mask,
+        ln_w_hbm=tatt_e_ln_w,
+        ln_b_hbm=tatt_e_ln_b,
+        bias_proj_w_hbm=tatt_e_bias_proj_w,
+        q_w_hbm=tatt_e_q_w,
+        k_w_hbm=tatt_e_k_w,
+        v_w_hbm=tatt_e_v_w,
+        gate_w_hbm=tatt_e_g_w,
+        out_w_hbm=tatt_e_o_w,
+        N=N,
+        C_z=C_z,
+        H=H_z,
+        d=d_z,
+        eps=eps,
+        is_ending=True,
+        pid=pid,
+    )
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # ================================================================
+    # Step 6a: Transition_s (s = s_intermediate + Trans_s(s_intermediate))
+    # ================================================================
+    _transition_s_phase_spmd(
+        s_in_hbm=s_intermediate,
+        s_out_hbm=s_out,
+        norm_w_hbm=trans_s_norm_w,
+        norm_b_hbm=trans_s_norm_b,
+        fc1_w_hbm=trans_s_fc1_w,
+        fc2_w_hbm=trans_s_fc2_w,
+        fc3_w_hbm=trans_s_fc3_w,
+        N=N,
+        C_s=C_s,
+        hidden_dim_s=hidden_dim_s,
+        eps=eps,
+        pid=pid,
+    )
+
+    # ================================================================
+    # Step 6b: Transition_z (same as fused_z_ops_spmd)
+    # ================================================================
+    _transition_z_phase_spmd(
+        z_in_hbm=scratch_buf[off_z2 : off_z2 + n_flat, 0:C_z],
+        z_out_hbm=z_out,
+        norm_w_hbm=trans_z_norm_w,
+        norm_b_hbm=trans_z_norm_b,
+        fc1_w_hbm=trans_z_fc1_w,
+        fc2_w_hbm=trans_z_fc2_w,
+        fc3_w_hbm=trans_z_fc3_w,
+        N=N,
+        C_z=C_z,
+        hidden_dim=hidden_dim_z,
+        eps=eps,
+        pid=pid,
+    )
+
+    return s_out, z_out
diff --git a/contrib/models/Boltz-2/src/fused_z_ops_spmd.py b/contrib/models/Boltz-2/src/fused_z_ops_spmd.py
new file mode 100644
index 00000000..bade591f
--- /dev/null
+++ b/contrib/models/Boltz-2/src/fused_z_ops_spmd.py
@@ -0,0 +1,1266 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Fused z-operations mega-kernel — SPMD grid=[2] variant.
+
+This is a variant of fused_z_ops_seq.py that uses NKI SPMD with grid=[2]
+to split work across the 2 physical NeuronCores within one logical core
+(LNC=2 mode on trn2). Each physical core executes the same kernel code
+but operates on a different subset of tiles/iterations.
+
+Key differences from fused_z_ops_seq.py:
+  - scratch_buf and bias_buf use shared_hbm (not private_hbm) so both
+    cores can read each other's phase outputs
+  - Each phase function accepts a `pid` parameter and splits work by pid
+  - nisa.core_barrier() is inserted between phases where phase N+1 reads
+    phase N's output from shared HBM
+  - Main entry uses nl.program_id(0) and is launched with [2] grid
+
+Work splitting strategy:
+  - Flat tile loops: pid=0 processes tiles 0..n/2-1, pid=1 does n/2..n-1
+  - TriMul Pass 1b (d-loop): pid=0 does d=0..C_z/2-1, pid=1 does C_z/2..C_z-1
+  - TriAttn Pass 3b (i_row): pid=0 does rows 0..N/2-1, pid=1 does N/2..N-1
+  - All phases are embarrassingly parallel within each phase; no collectives needed
+
+Hardware: NeuronCore v3 (trn2), LNC=2, 2 physical cores per logical core
+"""
+
+import numpy as np
+
+import nki
+import nki.isa as nisa
+import nki.language as nl
+
+P_MAX = 128
+
+
+# ============================================================================
+# Helper: LayerNorm on a tile [P_MAX, F] in SBUF
+# ============================================================================
+def _layer_norm_tile(x_tile, weight_tiled, bias_tiled, F, eps=1e-5):
+    """LayerNorm on a tile in SBUF.
+
+    Computes: (x - mean) / sqrt(var + eps) * weight + bias
+    Reduces over the free dimension (axis 1, size F).
+
+    Args:
+        x_tile: [P_MAX, F] bf16 in SBUF
+        weight_tiled: [P_MAX, F] bf16 -- pre-tiled LN weight (each row identical)
+        bias_tiled:   [P_MAX, F] bf16 -- pre-tiled LN bias (each row identical)
+        F: int, free dimension size
+        eps: float
+
+    Returns:
+        normalized: [P_MAX, F] bf16 in SBUF
+    """
+    inv_F = 1.0 / float(F)
+
+    # Cast to float32
+    x_f32 = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=x_f32, src=x_tile)
+
+    # Mean
+    sum_x = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_reduce(dst=sum_x, op=nl.add, data=x_f32, axis=1)
+    mean = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_scalar(
+        dst=mean, data=sum_x, op0=nl.multiply, operand0=inv_F, engine=nisa.vector_engine
+    )
+
+    # Centered
+    centered = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_scalar(
+        dst=centered,
+        data=x_f32,
+        op0=nl.subtract,
+        operand0=mean,
+        engine=nisa.vector_engine,
+    )
+
+    # Variance
+    sq = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_tensor(dst=sq, data1=centered, data2=centered, op=nl.multiply)
+    sum_sq = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_reduce(dst=sum_sq, op=nl.add, data=sq, axis=1)
+    var = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_scalar(
+        dst=var, data=sum_sq, op0=nl.multiply, operand0=inv_F, engine=nisa.vector_engine
+    )
+
+    # rsqrt(var + eps)
+    var_eps = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_scalar(
+        dst=var_eps, data=var, op0=nl.add, operand0=eps, engine=nisa.vector_engine
+    )
+    rsqrt_std = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.activation(dst=rsqrt_std, op=nl.rsqrt, data=var_eps, bias=None, scale=1.0)
+
+    # Normalize
+    normalized_f32 = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_scalar(
+        dst=normalized_f32,
+        data=centered,
+        op0=nl.multiply,
+        operand0=rsqrt_std,
+        engine=nisa.vector_engine,
+    )
+
+    # Scale by weight + bias (both [P_MAX, F], pre-tiled on host)
+    weight_f32 = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=weight_f32, src=weight_tiled)
+    scaled = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_tensor(
+        dst=scaled, data1=normalized_f32, data2=weight_f32, op=nl.multiply
+    )
+
+    bias_f32 = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=bias_f32, src=bias_tiled)
+    result_f32 = nl.ndarray((P_MAX, F), dtype=nl.float32, buffer=nl.sbuf)
+    nisa.tensor_tensor(dst=result_f32, data1=scaled, data2=bias_f32, op=nl.add)
+
+    result = nl.ndarray((P_MAX, F), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=result, src=result_f32)
+    return result
+
+
+# ============================================================================
+# Helper: matmul x @ W^T where x [P_MAX, K], W [M, K], both K=P_MAX
+# Returns [P_MAX, M] in bf16 on SBUF
+# ============================================================================
+def _linear_128x128(x_t, w_hbm):
+    """Compute x @ W^T for [P_MAX, 128] @ [128, 128] -> [P_MAX, 128].
+
+    Args:
+        x_t: [P_MAX, P_MAX] bf16 in SBUF -- already transposed x
+        w_hbm: [P_MAX, P_MAX] bf16 in HBM -- weight matrix
+
+    x_t is the nc_transpose of x. nc_matmul(stationary=x_t, moving=w_t)
+    computes x_t^T @ w_t = x @ W^T when w_t = nc_transpose(W).
+    """
+    w = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=w, src=w_hbm)
+    w_t_psum = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.psum)
+    nisa.nc_transpose(dst=w_t_psum, data=w)
+    w_t = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=w_t, src=w_t_psum)
+
+    result_psum = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+    nisa.nc_matmul(dst=result_psum, stationary=x_t, moving=w_t)
+    result = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=result, src=result_psum)
+    return result
+
+
+def _prepare_weight(w_hbm):
+    """Load and transpose a [P_MAX, P_MAX] weight matrix from HBM to SBUF.
+
+    Call ONCE before a tile loop, then pass the result to _matmul_with_w_t
+    inside the loop. Eliminates redundant weight DMA + transpose per tile.
+
+    Args:
+        w_hbm: [P_MAX, P_MAX] bf16 in HBM -- weight matrix
+
+    Returns:
+        w_t: [P_MAX, P_MAX] bf16 in SBUF -- transposed weight, ready for matmul
+    """
+    w = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=w, src=w_hbm)
+    w_t_psum = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.psum)
+    nisa.nc_transpose(dst=w_t_psum, data=w)
+    w_t = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=w_t, src=w_t_psum)
+    return w_t
+
+
+def _matmul_with_w_t(x_t, w_t):
+    """Compute x @ W^T using a pre-transposed weight already in SBUF.
+
+    Args:
+        x_t: [P_MAX, P_MAX] bf16 in SBUF -- already transposed activation
+        w_t: [P_MAX, P_MAX] bf16 in SBUF -- pre-transposed weight (from _prepare_weight)
+
+    Returns:
+        result: [P_MAX, P_MAX] bf16 in SBUF -- x @ W^T
+    """
+    result_psum = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+    nisa.nc_matmul(dst=result_psum, stationary=x_t, moving=w_t)
+    result = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=result, src=result_psum)
+    return result
+
+
+def _transpose_to_sbuf(x):
+    """nc_transpose x from SBUF -> PSUM -> SBUF."""
+    x_t_psum = nl.ndarray((P_MAX, P_MAX), dtype=x.dtype, buffer=nl.psum)
+    nisa.nc_transpose(dst=x_t_psum, data=x)
+    x_t = nl.ndarray((P_MAX, P_MAX), dtype=x.dtype, buffer=nl.sbuf)
+    nisa.tensor_copy(dst=x_t, src=x_t_psum)
+    return x_t
+
+
+# ============================================================================
+# Phase 1/2: Triangle Multiplication (Outgoing / Incoming)
+# SPMD variant: work split by pid
+# ============================================================================
+def _trimul_phase_spmd(
+    z_in_hbm,
+    buf,
+    off_out,
+    off_gate,
+    off_a,
+    off_b,
+    off_result,
+    pair_mask_hbm,
+    norm_in_w,
+    norm_in_b,
+    p_in_w,
+    g_in_w,
+    norm_out_w,
+    norm_out_b,
+    p_out_w,
+    g_out_w,
+    N,
+    C_z,
+    eps,
+    is_incoming,
+    pid,
+):
+    """Execute one triangle multiplication phase (SPMD variant).
+
+    Work is split by pid:
+      - Pass 1a (projection): pid processes its half of flat tiles
+      - Pass 1b (einsum): pid processes its half of d iterations
+      - Pass 1c (output): pid processes its half of flat tiles
+    """
+    n_flat = N * N
+    n_tiles_flat = n_flat // P_MAX
+    n_tiles_spatial = N // P_MAX
+    stride_i = N * C_z
+    stride_k = C_z
+
+    # SPMD work split for flat tile loops
+    tiles_per_engine = n_tiles_flat // 2
+    my_tile_start = pid * tiles_per_engine
+
+    # SPMD work split for d-loop
+    d_per_engine = C_z // 2
+    my_d_start = pid * d_per_engine
+
+    # ---- Pass 1a: Projection (split tiles by pid) ----
+    ln_w = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_w, src=norm_in_w)
+    ln_b = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_b, src=norm_in_b)
+
+    # WEIGHT-STATIONARY: Pre-load and transpose all 4 projection weights ONCE.
+    p_in_w_t_0 = _prepare_weight(p_in_w[0:C_z, 0:C_z])
+    p_in_w_t_1 = _prepare_weight(p_in_w[C_z : 2 * C_z, 0:C_z])
+    g_in_w_t_0 = _prepare_weight(g_in_w[0:C_z, 0:C_z])
+    g_in_w_t_1 = _prepare_weight(g_in_w[C_z : 2 * C_z, 0:C_z])
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        z_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=z_tile, src=z_in_hbm[tile_start : tile_start + P_MAX, 0:C_z])
+
+        z_normed = _layer_norm_tile(z_tile, ln_w, ln_b, C_z, eps)
+
+        nisa.dma_copy(
+            dst=buf[off_gate + tile_start : off_gate + tile_start + P_MAX, 0:C_z],
+            src=z_normed,
+        )
+
+        z_n_t = _transpose_to_sbuf(z_normed)
+
+        for half in nl.static_range(2):
+            p_w_t = p_in_w_t_0 if half == 0 else p_in_w_t_1
+            g_w_t = g_in_w_t_0 if half == 0 else g_in_w_t_1
+            proj_tile = _matmul_with_w_t(z_n_t, p_w_t)
+            gate_tile = _matmul_with_w_t(z_n_t, g_w_t)
+
+            gate_sig = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.activation(
+                dst=gate_sig, op=nl.sigmoid, data=gate_tile, bias=None, scale=1.0
+            )
+            gated = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_tensor(
+                dst=gated, data1=proj_tile, data2=gate_sig, op=nl.multiply
+            )
+
+            mask_tile = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+            mask_tile_bf16 = nl.ndarray((P_MAX, 1), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.dma_copy(
+                dst=mask_tile_bf16,
+                src=pair_mask_hbm[tile_start : tile_start + P_MAX, 0:1],
+            )
+            nisa.tensor_copy(dst=mask_tile, src=mask_tile_bf16)
+            gated_f32 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=gated_f32, src=gated)
+            masked_f32 = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.tensor_scalar(
+                dst=masked_f32,
+                data=gated_f32,
+                op0=nl.multiply,
+                operand0=mask_tile,
+                engine=nisa.vector_engine,
+            )
+            masked = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=masked, src=masked_f32)
+
+            if half == 0:
+                nisa.dma_copy(
+                    dst=buf[off_a + tile_start : off_a + tile_start + P_MAX, 0:C_z],
+                    src=masked,
+                )
+            else:
+                nisa.dma_copy(
+                    dst=buf[off_b + tile_start : off_b + tile_start + P_MAX, 0:C_z],
+                    src=masked,
+                )
+
+    # BARRIER: Both cores must finish pass 1a before pass 1b reads A and B
+    nisa.core_barrier(data=buf, cores=(0, 1))
+
+    # ---- Pass 1b: Matmul (split d iterations by pid) ----
+    for d_local in nl.sequential_range(d_per_engine):
+        d = my_d_start + d_local
+
+        for i_tile in nl.affine_range(n_tiles_spatial):
+            i_start = i_tile * P_MAX
+            for j_tile in nl.affine_range(n_tiles_spatial):
+                j_start = j_tile * P_MAX
+
+                acc = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=acc, value=0.0)
+
+                for k_tile_idx in nl.sequential_range(n_tiles_spatial):
+                    k_start = k_tile_idx * P_MAX
+
+                    a_tile = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf
+                    )
+                    b_tile = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf
+                    )
+
+                    if is_incoming:
+                        nisa.dma_copy(
+                            dst=a_tile,
+                            src=buf.ap(
+                                pattern=[[stride_k, P_MAX], [stride_i, P_MAX]],
+                                offset=off_a * C_z
+                                + k_start * stride_i
+                                + i_start * stride_k
+                                + d,
+                            ),
+                        )
+                        nisa.dma_copy(
+                            dst=b_tile,
+                            src=buf.ap(
+                                pattern=[[stride_k, P_MAX], [stride_i, P_MAX]],
+                                offset=off_b * C_z
+                                + k_start * stride_i
+                                + j_start * stride_k
+                                + d,
+                            ),
+                        )
+                    else:
+                        nisa.dma_copy(
+                            dst=a_tile,
+                            src=buf.ap(
+                                pattern=[[stride_i, P_MAX], [stride_k, P_MAX]],
+                                offset=off_a * C_z
+                                + i_start * stride_i
+                                + k_start * stride_k
+                                + d,
+                            ),
+                        )
+                        nisa.dma_copy(
+                            dst=b_tile,
+                            src=buf.ap(
+                                pattern=[[stride_i, P_MAX], [stride_k, P_MAX]],
+                                offset=off_b * C_z
+                                + j_start * stride_i
+                                + k_start * stride_k
+                                + d,
+                            ),
+                        )
+
+                    a_t = _transpose_to_sbuf(a_tile)
+                    b_t = _transpose_to_sbuf(b_tile)
+
+                    partial_psum = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum
+                    )
+                    nisa.nc_matmul(dst=partial_psum, stationary=a_t, moving=b_t)
+                    partial = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_copy(dst=partial, src=partial_psum)
+                    nisa.tensor_tensor(dst=acc, data1=acc, data2=partial, op=nl.add)
+
+                out_tile = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=out_tile, src=acc)
+                nisa.dma_copy(
+                    dst=buf.ap(
+                        pattern=[[stride_i, P_MAX], [stride_k, P_MAX]],
+                        offset=off_result * C_z
+                        + i_start * stride_i
+                        + j_start * stride_k
+                        + d,
+                    ),
+                    src=out_tile,
+                )
+
+    # BARRIER: Both cores must finish pass 1b before pass 1c reads result
+    nisa.core_barrier(data=buf, cores=(0, 1))
+
+    # ---- Pass 1c: Output processing + residual (split tiles by pid) ----
+    ln_out_w = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_out_w, src=norm_out_w)
+    ln_out_b = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_out_b, src=norm_out_b)
+
+    # WEIGHT-STATIONARY
+    p_out_w_t = _prepare_weight(p_out_w)
+    g_out_w_t = _prepare_weight(g_out_w)
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        r_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(
+            dst=r_tile,
+            src=buf[off_result + tile_start : off_result + tile_start + P_MAX, 0:C_z],
+        )
+
+        r_normed = _layer_norm_tile(r_tile, ln_out_w, ln_out_b, C_z, eps)
+        r_n_t = _transpose_to_sbuf(r_normed)
+
+        proj_out = _matmul_with_w_t(r_n_t, p_out_w_t)
+
+        z_normed_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(
+            dst=z_normed_tile,
+            src=buf[off_gate + tile_start : off_gate + tile_start + P_MAX, 0:C_z],
+        )
+        zn_t = _transpose_to_sbuf(z_normed_tile)
+
+        gate_out = _matmul_with_w_t(zn_t, g_out_w_t)
+        gate_sig = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.activation(
+            dst=gate_sig, op=nl.sigmoid, data=gate_out, bias=None, scale=1.0
+        )
+
+        output_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(
+            dst=output_tile, data1=proj_out, data2=gate_sig, op=nl.multiply
+        )
+
+        z_orig = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=z_orig, src=z_in_hbm[tile_start : tile_start + P_MAX, 0:C_z])
+        z_updated = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(dst=z_updated, data1=z_orig, data2=output_tile, op=nl.add)
+        nisa.dma_copy(
+            dst=buf[off_out + tile_start : off_out + tile_start + P_MAX, 0:C_z],
+            src=z_updated,
+        )
+
+
+# ============================================================================
+# Phase 3/4: Triangle Attention (Starting / Ending Node)
+# SPMD variant: work split by pid
+# ============================================================================
+def _triattn_phase_spmd(
+    z_in_hbm,
+    buf,
+    off_out,
+    off_q,
+    off_k,
+    off_v,
+    bias_buf,
+    pair_mask_hbm,
+    ln_w_hbm,
+    ln_b_hbm,
+    bias_proj_w_hbm,
+    q_w_hbm,
+    k_w_hbm,
+    v_w_hbm,
+    gate_w_hbm,
+    out_w_hbm,
+    N,
+    C_z,
+    H,
+    d,
+    eps,
+    is_ending,
+    pid,
+):
+    """Execute one triangle attention phase (SPMD variant).
+
+    Work is split by pid:
+      - Pass 3a (LN+QKV+bias): pid processes its half of flat tiles
+      - Pass 3b (attention): pid processes its half of i_row iterations
+      - Pass 3c (output): pid processes its half of flat tiles
+    """
+    Hd = H * d
+    n_flat = N * N
+    n_tiles_flat = n_flat // P_MAX
+    n_tiles_spatial = N // P_MAX
+    scale = 1.0 / (d**0.5)
+
+    q_stride_row = N * Hd
+    q_stride_col = Hd
+
+    bias_stride_q = N * H
+    bias_stride_k = H
+
+    # SPMD work split for flat tile loops
+    tiles_per_engine = n_tiles_flat // 2
+    my_tile_start = pid * tiles_per_engine
+
+    # SPMD work split for i_row loop
+    rows_per_engine = N // 2
+    my_row_start = pid * rows_per_engine
+
+    # ---- Pass 3a: LayerNorm + QKV projection + bias computation (split tiles by pid) ----
+    ln_w = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_w, src=ln_w_hbm)
+    ln_b = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_b, src=ln_b_hbm)
+
+    # WEIGHT-STATIONARY: Pre-load and transpose Q, K, V projection weights ONCE.
+    q_w_t = _prepare_weight(q_w_hbm)
+    k_w_t = _prepare_weight(k_w_hbm)
+    v_w_t = _prepare_weight(v_w_hbm)
+
+    # Pre-load and transpose bias projection weight.
+    bias_w = nl.ndarray((H, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=bias_w[0:H, 0:C_z], src=bias_proj_w_hbm)
+    bias_w_padded = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.memset(dst=bias_w_padded, value=0.0)
+    nisa.tensor_copy(dst=bias_w_padded[0:H, 0:C_z], src=bias_w[0:H, 0:C_z])
+    bias_w_t = _transpose_to_sbuf(bias_w_padded)
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        z_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=z_tile, src=z_in_hbm[tile_start : tile_start + P_MAX, 0:C_z])
+
+        z_normed = _layer_norm_tile(z_tile, ln_w, ln_b, C_z, eps)
+        z_n_t = _transpose_to_sbuf(z_normed)
+
+        q_tile = _matmul_with_w_t(z_n_t, q_w_t)
+        k_tile = _matmul_with_w_t(z_n_t, k_w_t)
+        v_tile = _matmul_with_w_t(z_n_t, v_w_t)
+
+        nisa.dma_copy(
+            dst=buf[off_q + tile_start : off_q + tile_start + P_MAX, 0:C_z], src=q_tile
+        )
+        nisa.dma_copy(
+            dst=buf[off_k + tile_start : off_k + tile_start + P_MAX, 0:C_z], src=k_tile
+        )
+        nisa.dma_copy(
+            dst=buf[off_v + tile_start : off_v + tile_start + P_MAX, 0:C_z], src=v_tile
+        )
+
+        # Bias projection
+        bias_psum = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+        nisa.nc_matmul(dst=bias_psum, stationary=z_n_t, moving=bias_w_t)
+        bias_tile = nl.ndarray((P_MAX, H), dtype=nl.bfloat16, buffer=nl.sbuf)
+        bias_full = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=bias_full, src=bias_psum)
+        nisa.tensor_copy(dst=bias_tile, src=bias_full[0:P_MAX, 0:H])
+
+        nisa.dma_copy(dst=bias_buf[tile_start : tile_start + P_MAX, 0:H], src=bias_tile)
+
+    # BARRIER: Both cores must finish pass 3a before pass 3b reads Q, K, V, bias
+    nisa.core_barrier(data=buf, cores=(0, 1))
+    nisa.core_barrier(data=bias_buf, cores=(0, 1))
+
+    # ---- Pass 3b: Attention (split i_row by pid) ----
+    for row_local in nl.sequential_range(rows_per_engine):
+        i_row = my_row_start + row_local
+
+        for h in nl.affine_range(H):
+            hd_start = h * d
+
+            for j_tile in nl.affine_range(n_tiles_spatial):
+                j_start = j_tile * P_MAX
+
+                q_tile = nl.ndarray((P_MAX, d), dtype=nl.bfloat16, buffer=nl.sbuf)
+                if is_ending:
+                    nisa.dma_copy(
+                        dst=q_tile,
+                        src=buf.ap(
+                            pattern=[[q_stride_row, P_MAX], [1, d]],
+                            offset=off_q * C_z
+                            + j_start * q_stride_row
+                            + i_row * q_stride_col
+                            + hd_start,
+                        ),
+                    )
+                else:
+                    nisa.dma_copy(
+                        dst=q_tile,
+                        src=buf.ap(
+                            pattern=[[q_stride_col, P_MAX], [1, d]],
+                            offset=off_q * C_z
+                            + i_row * q_stride_row
+                            + j_start * q_stride_col
+                            + hd_start,
+                        ),
+                    )
+
+                q_padded = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+                nisa.memset(dst=q_padded, value=0.0)
+                nisa.tensor_copy(dst=q_padded[0:P_MAX, 0:d], src=q_tile)
+                q_t = _transpose_to_sbuf(q_padded)
+
+                m_prev = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=m_prev, value=-1e30)
+                l_prev = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=l_prev, value=0.0)
+                o_acc = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.memset(dst=o_acc, value=0.0)
+
+                for k_tile_idx in nl.sequential_range(n_tiles_spatial):
+                    k_start = k_tile_idx * P_MAX
+
+                    k_tile_sb = nl.ndarray(
+                        (P_MAX, d), dtype=nl.bfloat16, buffer=nl.sbuf
+                    )
+                    if is_ending:
+                        nisa.dma_copy(
+                            dst=k_tile_sb,
+                            src=buf.ap(
+                                pattern=[[q_stride_row, P_MAX], [1, d]],
+                                offset=off_k * C_z
+                                + k_start * q_stride_row
+                                + i_row * q_stride_col
+                                + hd_start,
+                            ),
+                        )
+                    else:
+                        nisa.dma_copy(
+                            dst=k_tile_sb,
+                            src=buf.ap(
+                                pattern=[[q_stride_col, P_MAX], [1, d]],
+                                offset=off_k * C_z
+                                + i_row * q_stride_row
+                                + k_start * q_stride_col
+                                + hd_start,
+                            ),
+                        )
+
+                    v_tile_sb = nl.ndarray(
+                        (P_MAX, d), dtype=nl.bfloat16, buffer=nl.sbuf
+                    )
+                    if is_ending:
+                        nisa.dma_copy(
+                            dst=v_tile_sb,
+                            src=buf.ap(
+                                pattern=[[q_stride_row, P_MAX], [1, d]],
+                                offset=off_v * C_z
+                                + k_start * q_stride_row
+                                + i_row * q_stride_col
+                                + hd_start,
+                            ),
+                        )
+                    else:
+                        nisa.dma_copy(
+                            dst=v_tile_sb,
+                            src=buf.ap(
+                                pattern=[[q_stride_col, P_MAX], [1, d]],
+                                offset=off_v * C_z
+                                + i_row * q_stride_row
+                                + k_start * q_stride_col
+                                + hd_start,
+                            ),
+                        )
+
+                    bias_tile = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf
+                    )
+                    if is_ending:
+                        nisa.dma_copy(
+                            dst=bias_tile,
+                            src=bias_buf.ap(
+                                pattern=[
+                                    [bias_stride_k, P_MAX],
+                                    [bias_stride_q, P_MAX],
+                                ],
+                                offset=k_start * bias_stride_q
+                                + j_start * bias_stride_k
+                                + h,
+                            ),
+                        )
+                    else:
+                        nisa.dma_copy(
+                            dst=bias_tile,
+                            src=bias_buf.ap(
+                                pattern=[
+                                    [bias_stride_q, P_MAX],
+                                    [bias_stride_k, P_MAX],
+                                ],
+                                offset=j_start * bias_stride_q
+                                + k_start * bias_stride_k
+                                + h,
+                            ),
+                        )
+
+                    k_padded = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf
+                    )
+                    nisa.memset(dst=k_padded, value=0.0)
+                    nisa.tensor_copy(dst=k_padded[0:P_MAX, 0:d], src=k_tile_sb)
+                    k_t = _transpose_to_sbuf(k_padded)
+
+                    logits_psum = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum
+                    )
+                    nisa.nc_matmul(dst=logits_psum, stationary=q_t, moving=k_t)
+                    logits = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_copy(dst=logits, src=logits_psum)
+
+                    logits_scaled = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_scalar(
+                        dst=logits_scaled,
+                        data=logits,
+                        op0=nl.multiply,
+                        operand0=scale,
+                        engine=nisa.vector_engine,
+                    )
+
+                    bias_fp32 = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_copy(dst=bias_fp32, src=bias_tile)
+                    logits_biased = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_tensor(
+                        dst=logits_biased,
+                        data1=logits_scaled,
+                        data2=bias_fp32,
+                        op=nl.add,
+                    )
+
+                    tile_max = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_reduce(
+                        dst=tile_max, op=nl.maximum, data=logits_biased, axis=1
+                    )
+
+                    m_new = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_tensor(
+                        dst=m_new, data1=m_prev, data2=tile_max, op=nl.maximum
+                    )
+
+                    m_diff = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_tensor(
+                        dst=m_diff, data1=m_prev, data2=m_new, op=nl.subtract
+                    )
+                    correction = nl.ndarray(
+                        (P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.activation(
+                        dst=correction, op=nl.exp, data=m_diff, bias=None, scale=1.0
+                    )
+
+                    logits_shifted = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_scalar(
+                        dst=logits_shifted,
+                        data=logits_biased,
+                        op0=nl.subtract,
+                        operand0=m_new,
+                        engine=nisa.vector_engine,
+                    )
+                    exp_logits = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.activation(
+                        dst=exp_logits,
+                        op=nl.exp,
+                        data=logits_shifted,
+                        bias=None,
+                        scale=1.0,
+                    )
+
+                    l_corrected = nl.ndarray(
+                        (P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf
+                    )
+                    nisa.tensor_tensor(
+                        dst=l_corrected, data1=l_prev, data2=correction, op=nl.multiply
+                    )
+                    tile_sum = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_reduce(dst=tile_sum, op=nl.add, data=exp_logits, axis=1)
+                    l_new = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_tensor(
+                        dst=l_new, data1=l_corrected, data2=tile_sum, op=nl.add
+                    )
+
+                    o_scaled = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_scalar(
+                        dst=o_scaled,
+                        data=o_acc,
+                        op0=nl.multiply,
+                        operand0=correction,
+                        engine=nisa.vector_engine,
+                    )
+
+                    exp_bf16 = nl.ndarray(
+                        (P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf
+                    )
+                    nisa.tensor_copy(dst=exp_bf16, src=exp_logits)
+                    exp_t = _transpose_to_sbuf(exp_bf16)
+
+                    pv_psum = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.psum)
+                    nisa.nc_matmul(dst=pv_psum, stationary=exp_t, moving=v_tile_sb)
+                    pv_sbuf = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                    nisa.tensor_copy(dst=pv_sbuf, src=pv_psum)
+
+                    nisa.tensor_tensor(
+                        dst=o_acc, data1=o_scaled, data2=pv_sbuf, op=nl.add
+                    )
+                    nisa.tensor_copy(dst=m_prev, src=m_new)
+                    nisa.tensor_copy(dst=l_prev, src=l_new)
+
+                inv_l = nl.ndarray((P_MAX, 1), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.reciprocal(dst=inv_l, data=l_prev)
+                o_final = nl.ndarray((P_MAX, d), dtype=nl.float32, buffer=nl.sbuf)
+                nisa.tensor_scalar(
+                    dst=o_final,
+                    data=o_acc,
+                    op0=nl.multiply,
+                    operand0=inv_l,
+                    engine=nisa.vector_engine,
+                )
+                o_out = nl.ndarray((P_MAX, d), dtype=nl.bfloat16, buffer=nl.sbuf)
+                nisa.tensor_copy(dst=o_out, src=o_final)
+
+                if is_ending:
+                    nisa.dma_copy(
+                        dst=buf.ap(
+                            pattern=[[q_stride_row, P_MAX], [1, d]],
+                            offset=off_q * C_z
+                            + j_start * q_stride_row
+                            + i_row * q_stride_col
+                            + hd_start,
+                        ),
+                        src=o_out,
+                    )
+                else:
+                    nisa.dma_copy(
+                        dst=buf.ap(
+                            pattern=[[q_stride_col, P_MAX], [1, d]],
+                            offset=off_q * C_z
+                            + i_row * q_stride_row
+                            + j_start * q_stride_col
+                            + hd_start,
+                        ),
+                        src=o_out,
+                    )
+
+    # BARRIER: Both cores must finish pass 3b before pass 3c reads attention output
+    nisa.core_barrier(data=buf, cores=(0, 1))
+
+    # ---- Pass 3c: Output gating + projection + residual (split tiles by pid) ----
+    # WEIGHT-STATIONARY
+    gate_w_t = _prepare_weight(gate_w_hbm)
+    out_w_t = _prepare_weight(out_w_hbm)
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        attn_out = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(
+            dst=attn_out,
+            src=buf[off_q + tile_start : off_q + tile_start + P_MAX, 0:C_z],
+        )
+
+        z_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=z_tile, src=z_in_hbm[tile_start : tile_start + P_MAX, 0:C_z])
+        z_normed = _layer_norm_tile(z_tile, ln_w, ln_b, C_z, eps)
+        zn_t = _transpose_to_sbuf(z_normed)
+
+        gate_out = _matmul_with_w_t(zn_t, gate_w_t)
+        gate_sig = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.activation(
+            dst=gate_sig, op=nl.sigmoid, data=gate_out, bias=None, scale=1.0
+        )
+
+        gated = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(dst=gated, data1=attn_out, data2=gate_sig, op=nl.multiply)
+
+        gated_t = _transpose_to_sbuf(gated)
+        proj_out = _matmul_with_w_t(gated_t, out_w_t)
+
+        z_updated = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(dst=z_updated, data1=z_tile, data2=proj_out, op=nl.add)
+        nisa.dma_copy(
+            dst=buf[off_out + tile_start : off_out + tile_start + P_MAX, 0:C_z],
+            src=z_updated,
+        )
+
+
+# ============================================================================
+# Phase 5: Transition_z (SwiGLU FFN)
+# SPMD variant: split tiles by pid
+# ============================================================================
+def _transition_z_phase_spmd(
+    z_in_hbm,
+    z_out_hbm,
+    norm_w_hbm,
+    norm_b_hbm,
+    fc1_w_hbm,
+    fc2_w_hbm,
+    fc3_w_hbm,
+    N,
+    C_z,
+    hidden_dim,
+    eps,
+    pid,
+):
+    """Execute Transition_z: SwiGLU FFN on z (SPMD variant)."""
+    n_flat = N * N
+    n_tiles_flat = n_flat // P_MAX
+    n_hidden_tiles = hidden_dim // P_MAX
+
+    # SPMD work split for flat tile loops
+    tiles_per_engine = n_tiles_flat // 2
+    my_tile_start = pid * tiles_per_engine
+
+    ln_w = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_w, src=norm_w_hbm)
+    ln_b = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+    nisa.dma_copy(dst=ln_b, src=norm_b_hbm)
+
+    # WEIGHT-STATIONARY: Pre-load ALL FFN weight chunks ONCE before tile loop.
+    fc1_w_t_0 = _prepare_weight(fc1_w_hbm[0:P_MAX, 0:C_z])
+    fc1_w_t_1 = _prepare_weight(fc1_w_hbm[P_MAX : 2 * P_MAX, 0:C_z])
+    fc1_w_t_2 = _prepare_weight(fc1_w_hbm[2 * P_MAX : 3 * P_MAX, 0:C_z])
+    fc1_w_t_3 = _prepare_weight(fc1_w_hbm[3 * P_MAX : 4 * P_MAX, 0:C_z])
+
+    fc2_w_t_0 = _prepare_weight(fc2_w_hbm[0:P_MAX, 0:C_z])
+    fc2_w_t_1 = _prepare_weight(fc2_w_hbm[P_MAX : 2 * P_MAX, 0:C_z])
+    fc2_w_t_2 = _prepare_weight(fc2_w_hbm[2 * P_MAX : 3 * P_MAX, 0:C_z])
+    fc2_w_t_3 = _prepare_weight(fc2_w_hbm[3 * P_MAX : 4 * P_MAX, 0:C_z])
+
+    fc3_w_t_0 = _prepare_weight(fc3_w_hbm[0:C_z, 0:P_MAX])
+    fc3_w_t_1 = _prepare_weight(fc3_w_hbm[0:C_z, P_MAX : 2 * P_MAX])
+    fc3_w_t_2 = _prepare_weight(fc3_w_hbm[0:C_z, 2 * P_MAX : 3 * P_MAX])
+    fc3_w_t_3 = _prepare_weight(fc3_w_hbm[0:C_z, 3 * P_MAX : 4 * P_MAX])
+
+    for tile_local in nl.sequential_range(tiles_per_engine):
+        tile_idx = my_tile_start + tile_local
+        tile_start = tile_idx * P_MAX
+
+        z_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.dma_copy(dst=z_tile, src=z_in_hbm[tile_start : tile_start + P_MAX, 0:C_z])
+
+        z_normed = _layer_norm_tile(z_tile, ln_w, ln_b, C_z, eps)
+        z_n_t = _transpose_to_sbuf(z_normed)
+
+        output_acc = nl.ndarray((P_MAX, C_z), dtype=nl.float32, buffer=nl.sbuf)
+        nisa.memset(dst=output_acc, value=0.0)
+
+        for h_tile in nl.static_range(n_hidden_tiles):
+            if h_tile == 0:
+                fc1_w_t = fc1_w_t_0
+                fc2_w_t = fc2_w_t_0
+                fc3_w_t = fc3_w_t_0
+            elif h_tile == 1:
+                fc1_w_t = fc1_w_t_1
+                fc2_w_t = fc2_w_t_1
+                fc3_w_t = fc3_w_t_1
+            elif h_tile == 2:
+                fc1_w_t = fc1_w_t_2
+                fc2_w_t = fc2_w_t_2
+                fc3_w_t = fc3_w_t_2
+            else:
+                fc1_w_t = fc1_w_t_3
+                fc2_w_t = fc2_w_t_3
+                fc3_w_t = fc3_w_t_3
+
+            fc1_chunk = _matmul_with_w_t(z_n_t, fc1_w_t)
+            fc2_chunk = _matmul_with_w_t(z_n_t, fc2_w_t)
+
+            fc1_sig = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.activation(
+                dst=fc1_sig, op=nl.sigmoid, data=fc1_chunk, bias=None, scale=1.0
+            )
+            fc1_silu = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_tensor(
+                dst=fc1_silu, data1=fc1_chunk, data2=fc1_sig, op=nl.multiply
+            )
+            swiglu = nl.ndarray((P_MAX, P_MAX), dtype=nl.bfloat16, buffer=nl.sbuf)
+            nisa.tensor_tensor(
+                dst=swiglu, data1=fc1_silu, data2=fc2_chunk, op=nl.multiply
+            )
+
+            sg_t = _transpose_to_sbuf(swiglu)
+
+            partial_psum = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.psum)
+            nisa.nc_matmul(dst=partial_psum, stationary=sg_t, moving=fc3_w_t)
+            partial = nl.ndarray((P_MAX, P_MAX), dtype=nl.float32, buffer=nl.sbuf)
+            nisa.tensor_copy(dst=partial, src=partial_psum)
+            nisa.tensor_tensor(
+                dst=output_acc, data1=output_acc, data2=partial, op=nl.add
+            )
+
+        output_tile = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_copy(dst=output_tile, src=output_acc)
+
+        z_updated = nl.ndarray((P_MAX, C_z), dtype=nl.bfloat16, buffer=nl.sbuf)
+        nisa.tensor_tensor(dst=z_updated, data1=z_tile, data2=output_tile, op=nl.add)
+        nisa.dma_copy(
+            dst=z_out_hbm[tile_start : tile_start + P_MAX, 0:C_z], src=z_updated
+        )
+
+
+# ============================================================================
+# Main Mega-Kernel Entry Point — SPMD grid=[2] variant
+# ============================================================================
+@nki.jit
+def fused_z_ops_spmd(
+    # Main tensors (all in HBM)
+    z,
+    pair_mask,
+    # TriMulOut weights
+    tmul_out_norm_in_w,
+    tmul_out_norm_in_b,
+    tmul_out_p_in_w,
+    tmul_out_g_in_w,
+    tmul_out_norm_out_w,
+    tmul_out_norm_out_b,
+    tmul_out_p_out_w,
+    tmul_out_g_out_w,
+    # TriMulIn weights
+    tmul_in_norm_in_w,
+    tmul_in_norm_in_b,
+    tmul_in_p_in_w,
+    tmul_in_g_in_w,
+    tmul_in_norm_out_w,
+    tmul_in_norm_out_b,
+    tmul_in_p_out_w,
+    tmul_in_g_out_w,
+    # TriAttnStart weights
+    tatt_s_ln_w,
+    tatt_s_ln_b,
+    tatt_s_bias_proj_w,
+    tatt_s_q_w,
+    tatt_s_k_w,
+    tatt_s_v_w,
+    tatt_s_g_w,
+    tatt_s_o_w,
+    # TriAttnEnd weights
+    tatt_e_ln_w,
+    tatt_e_ln_b,
+    tatt_e_bias_proj_w,
+    tatt_e_q_w,
+    tatt_e_k_w,
+    tatt_e_v_w,
+    tatt_e_g_w,
+    tatt_e_o_w,
+    # Transition_z weights
+    trans_z_norm_w,
+    trans_z_norm_b,
+    trans_z_fc1_w,
+    trans_z_fc2_w,
+    trans_z_fc3_w,
+    # Pre-allocated scratch buffers (passed from PyTorch as external shared HBM)
+    scratch_buf,
+    bias_buf,
+    # Constants
+    N: int = 256,
+    C_z: int = 128,
+    H: int = 4,
+    d: int = 32,
+    eps: float = 1e-5,
+):
+    """Fused z-operations mega-kernel — SPMD grid=[2] variant.
+
+    Launches on 2 physical NeuronCores within one logical core (LNC=2).
+    Each core executes the same program but processes different subsets
+    of work. core_barrier() synchronizes between phases.
+
+    IMPORTANT: scratch_buf and bias_buf must be pre-allocated by the caller
+    as PyTorch tensors on the XLA device. They live in shared HBM and are
+    visible to both SPMD instances. Internal nl.ndarray(shared_hbm) does NOT
+    work for intermediate buffers in SPMD kernels (compiler limitation).
+
+    scratch_buf: shape [6 * N*N, C_z], dtype bf16
+    bias_buf: shape [N*N, H], dtype bf16
+
+    Launch with: fused_z_ops_spmd[2](z, pair_mask, ..., scratch_buf, bias_buf)
+    """
+    pid = nl.program_id(0)
+
+    hidden_dim = 4 * C_z
+    n_flat = N * N
+
+    z_out = nl.ndarray((n_flat, C_z), dtype=nl.bfloat16, buffer=nl.shared_hbm)
+
+    off_a = 0 * n_flat
+    off_b = 1 * n_flat
+    off_gate = 2 * n_flat
+    off_d = 3 * n_flat
+    off_z1 = 4 * n_flat
+    off_z2 = 5 * n_flat
+
+    # Phase 1: TriMulOut
+    _trimul_phase_spmd(
+        z_in_hbm=z,
+        buf=scratch_buf,
+        off_out=off_z1,
+        off_gate=off_gate,
+        off_a=off_a,
+        off_b=off_b,
+        off_result=off_d,
+        pair_mask_hbm=pair_mask,
+        norm_in_w=tmul_out_norm_in_w,
+        norm_in_b=tmul_out_norm_in_b,
+        p_in_w=tmul_out_p_in_w,
+        g_in_w=tmul_out_g_in_w,
+        norm_out_w=tmul_out_norm_out_w,
+        norm_out_b=tmul_out_norm_out_b,
+        p_out_w=tmul_out_p_out_w,
+        g_out_w=tmul_out_g_out_w,
+        N=N,
+        C_z=C_z,
+        eps=eps,
+        is_incoming=False,
+        pid=pid,
+    )
+
+    # BARRIER: Phase 2 reads Phase 1's z1 output
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # Phase 2: TriMulIn
+    _trimul_phase_spmd(
+        z_in_hbm=scratch_buf[off_z1 : off_z1 + n_flat, 0:C_z],
+        buf=scratch_buf,
+        off_out=off_z2,
+        off_gate=off_gate,
+        off_a=off_a,
+        off_b=off_b,
+        off_result=off_d,
+        pair_mask_hbm=pair_mask,
+        norm_in_w=tmul_in_norm_in_w,
+        norm_in_b=tmul_in_norm_in_b,
+        p_in_w=tmul_in_p_in_w,
+        g_in_w=tmul_in_g_in_w,
+        norm_out_w=tmul_in_norm_out_w,
+        norm_out_b=tmul_in_norm_out_b,
+        p_out_w=tmul_in_p_out_w,
+        g_out_w=tmul_in_g_out_w,
+        N=N,
+        C_z=C_z,
+        eps=eps,
+        is_incoming=True,
+        pid=pid,
+    )
+
+    # BARRIER: Phase 3 reads Phase 2's z2 output
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # Phase 3: TriAttnStart
+    _triattn_phase_spmd(
+        z_in_hbm=scratch_buf[off_z2 : off_z2 + n_flat, 0:C_z],
+        buf=scratch_buf,
+        off_out=off_z1,
+        off_q=off_a,
+        off_k=off_b,
+        off_v=off_gate,
+        bias_buf=bias_buf,
+        pair_mask_hbm=pair_mask,
+        ln_w_hbm=tatt_s_ln_w,
+        ln_b_hbm=tatt_s_ln_b,
+        bias_proj_w_hbm=tatt_s_bias_proj_w,
+        q_w_hbm=tatt_s_q_w,
+        k_w_hbm=tatt_s_k_w,
+        v_w_hbm=tatt_s_v_w,
+        gate_w_hbm=tatt_s_g_w,
+        out_w_hbm=tatt_s_o_w,
+        N=N,
+        C_z=C_z,
+        H=H,
+        d=d,
+        eps=eps,
+        is_ending=False,
+        pid=pid,
+    )
+
+    # BARRIER: Phase 4 reads Phase 3's z1 output
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # Phase 4: TriAttnEnd
+    _triattn_phase_spmd(
+        z_in_hbm=scratch_buf[off_z1 : off_z1 + n_flat, 0:C_z],
+        buf=scratch_buf,
+        off_out=off_z2,
+        off_q=off_a,
+        off_k=off_b,
+        off_v=off_gate,
+        bias_buf=bias_buf,
+        pair_mask_hbm=pair_mask,
+        ln_w_hbm=tatt_e_ln_w,
+        ln_b_hbm=tatt_e_ln_b,
+        bias_proj_w_hbm=tatt_e_bias_proj_w,
+        q_w_hbm=tatt_e_q_w,
+        k_w_hbm=tatt_e_k_w,
+        v_w_hbm=tatt_e_v_w,
+        gate_w_hbm=tatt_e_g_w,
+        out_w_hbm=tatt_e_o_w,
+        N=N,
+        C_z=C_z,
+        H=H,
+        d=d,
+        eps=eps,
+        is_ending=True,
+        pid=pid,
+    )
+
+    # BARRIER: Phase 5 reads Phase 4's z2 output
+    nisa.core_barrier(data=scratch_buf, cores=(0, 1))
+
+    # Phase 5: Transition_z
+    _transition_z_phase_spmd(
+        z_in_hbm=scratch_buf[off_z2 : off_z2 + n_flat, 0:C_z],
+        z_out_hbm=z_out,
+        norm_w_hbm=trans_z_norm_w,
+        norm_b_hbm=trans_z_norm_b,
+        fc1_w_hbm=trans_z_fc1_w,
+        fc2_w_hbm=trans_z_fc2_w,
+        fc3_w_hbm=trans_z_fc3_w,
+        N=N,
+        C_z=C_z,
+        hidden_dim=hidden_dim,
+        eps=eps,
+        pid=pid,
+    )
+
+    return z_out
diff --git a/contrib/models/Boltz-2/test/integration/compile_full_layer_spmd.py b/contrib/models/Boltz-2/test/integration/compile_full_layer_spmd.py
new file mode 100644
index 00000000..6e7db12d
--- /dev/null
+++ b/contrib/models/Boltz-2/test/integration/compile_full_layer_spmd.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+"""Compile FULL PairformerLayer SPMD grid=[2] mega-kernel at a given N.
+
+This compiles the full_pairformer_layer_spmd kernel which covers ALL 7
+sub-operations of a PairformerLayer:
+  Step 1:  s = s + PairBiasAttn(s, z)
+  Step 2:  z = z + TriMulOut(z)
+  Step 3:  z = z + TriMulIn(z)
+  Step 4:  z = z + TriAttnStart(z)
+  Step 5:  z = z + TriAttnEnd(z)
+  Step 6a: s = s + Transition_s(s)
+  Step 6b: z = z + Transition_z(z)
+
+Usage:
+    source /opt/aws_neuronx_venv_pytorch_2_9/bin/activate
+    NEURON_PLATFORM_TARGET_OVERRIDE=trn2 NEURON_RT_VISIBLE_CORES=0 \
+        python compile_full_layer_spmd.py --N 128
+"""
+
+import argparse
+import glob
+import os
+import time
+
+os.environ.setdefault("NEURON_PLATFORM_TARGET_OVERRIDE", "trn2")
+os.environ.setdefault("NEURON_CC_FLAGS", "--model-type transformer")
+
+import torch
+import torch_xla.core.xla_model as xm
+
+from boltz.main import (
+    Boltz2,
+    PairformerArgsV2,
+    Boltz2DiffusionParams,
+    MSAModuleArgs,
+    BoltzSteeringParams,
+)
+from dataclasses import asdict
+
+from full_pairformer_layer_spmd import full_pairformer_layer_spmd
+
+P_MAX = 128
+
+
+def extract_all_weights(layer):
+    """Extract ALL weights needed for the full PairformerLayer mega-kernel.
+
+    Returns a dict of (buf_name, attr_path) that maps kernel parameter names
+    to PyTorch attribute paths on the layer object.
+
+    Attribute paths use V2 conventions:
+      - PairBiasAttention: layer.pre_norm_s (s LN), layer.attention.* (projections)
+      - Z LN for PBA is inside layer.attention.proj_z (Sequential: [0]=LN, [1]=Linear)
+      - Transition_s/z: layer.transition_s/z.* (no bias on fc1/fc2/fc3)
+    """
+    weight_map = [
+        # ---- PairBiasAttention (Step 1) ----
+        # s LayerNorm (pre_norm_s on PairformerLayer itself)
+        ("pba_norm_s_w", "pre_norm_s.weight"),
+        ("pba_norm_s_b", "pre_norm_s.bias"),
+        # z LayerNorm (inside attention.proj_z Sequential, index 0)
+        ("pba_norm_z_w", "attention.proj_z.0.weight"),
+        ("pba_norm_z_b", "attention.proj_z.0.bias"),
+        # Q projection (has bias)
+        ("pba_q_w", "attention.proj_q.weight"),
+        ("pba_q_b", "attention.proj_q.bias"),
+        # K, V, gate, output projections (no bias)
+        ("pba_k_w", "attention.proj_k.weight"),
+        ("pba_v_w", "attention.proj_v.weight"),
+        # z bias projection (inside proj_z Sequential, index 1)
+        ("pba_z_w", "attention.proj_z.1.weight"),
+        # gate and output
+        ("pba_g_w", "attention.proj_g.weight"),
+        ("pba_o_w", "attention.proj_o.weight"),
+        # ---- TriMulOut (Step 2) ----
+        ("tmul_out_norm_in_w", "tri_mul_out.norm_in.weight"),
+        ("tmul_out_norm_in_b", "tri_mul_out.norm_in.bias"),
+        ("tmul_out_p_in_w", "tri_mul_out.p_in.weight"),
+        ("tmul_out_g_in_w", "tri_mul_out.g_in.weight"),
+        ("tmul_out_norm_out_w", "tri_mul_out.norm_out.weight"),
+        ("tmul_out_norm_out_b", "tri_mul_out.norm_out.bias"),
+        ("tmul_out_p_out_w", "tri_mul_out.p_out.weight"),
+        ("tmul_out_g_out_w", "tri_mul_out.g_out.weight"),
+        # ---- TriMulIn (Step 3) ----
+        ("tmul_in_norm_in_w", "tri_mul_in.norm_in.weight"),
+        ("tmul_in_norm_in_b", "tri_mul_in.norm_in.bias"),
+        ("tmul_in_p_in_w", "tri_mul_in.p_in.weight"),
+        ("tmul_in_g_in_w", "tri_mul_in.g_in.weight"),
+        ("tmul_in_norm_out_w", "tri_mul_in.norm_out.weight"),
+        ("tmul_in_norm_out_b", "tri_mul_in.norm_out.bias"),
+        ("tmul_in_p_out_w", "tri_mul_in.p_out.weight"),
+        ("tmul_in_g_out_w", "tri_mul_in.g_out.weight"),
+        # ---- TriAttnStart (Step 4) ----
+        ("tatt_s_ln_w", "tri_att_start.layer_norm.weight"),
+        ("tatt_s_ln_b", "tri_att_start.layer_norm.bias"),
+        ("tatt_s_bias_proj_w", "tri_att_start.linear.weight"),
+        ("tatt_s_q_w", "tri_att_start.mha.linear_q.weight"),
+        ("tatt_s_k_w", "tri_att_start.mha.linear_k.weight"),
+        ("tatt_s_v_w", "tri_att_start.mha.linear_v.weight"),
+        ("tatt_s_g_w", "tri_att_start.mha.linear_g.weight"),
+        ("tatt_s_o_w", "tri_att_start.mha.linear_o.weight"),
+        # ---- TriAttnEnd (Step 5) ----
+        ("tatt_e_ln_w", "tri_att_end.layer_norm.weight"),
+        ("tatt_e_ln_b", "tri_att_end.layer_norm.bias"),
+        ("tatt_e_bias_proj_w", "tri_att_end.linear.weight"),
+        ("tatt_e_q_w", "tri_att_end.mha.linear_q.weight"),
+        ("tatt_e_k_w", "tri_att_end.mha.linear_k.weight"),
+        ("tatt_e_v_w", "tri_att_end.mha.linear_v.weight"),
+        ("tatt_e_g_w", "tri_att_end.mha.linear_g.weight"),
+        ("tatt_e_o_w", "tri_att_end.mha.linear_o.weight"),
+        # ---- Transition_s (Step 6a) — no bias on fc1/fc2/fc3 ----
+        ("trans_s_norm_w", "transition_s.norm.weight"),
+        ("trans_s_norm_b", "transition_s.norm.bias"),
+        ("trans_s_fc1_w", "transition_s.fc1.weight"),
+        ("trans_s_fc2_w", "transition_s.fc2.weight"),
+        ("trans_s_fc3_w", "transition_s.fc3.weight"),
+        # ---- Transition_z (Step 6b) — no bias on fc1/fc2/fc3 ----
+        ("trans_z_norm_w", "transition_z.norm.weight"),
+        ("trans_z_norm_b", "transition_z.norm.bias"),
+        ("trans_z_fc1_w", "transition_z.fc1.weight"),
+        ("trans_z_fc2_w", "transition_z.fc2.weight"),
+        ("trans_z_fc3_w", "transition_z.fc3.weight"),
+    ]
+
+    w = {}
+    for buf_name, attr_path in weight_map:
+        obj = layer
+        for p in attr_path.split("."):
+            obj = getattr(obj, p)
+        v = obj.data.clone().to(torch.bfloat16)
+        if v.dim() == 1:
+            # LayerNorm weights: tile to [P_MAX, F] where each row is the same
+            v = v.unsqueeze(0).expand(P_MAX, -1).contiguous()
+        w[buf_name] = v
+
+    return w
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--N", type=int, default=128)
+    args = parser.parse_args()
+
+    N = args.N
+    C_z = 128
+    C_s = 384
+    H_z = 4
+    H_s = 16
+    n_flat = N * N
+
+    print(f"{'=' * 70}")
+    print(f"Compile FULL PairformerLayer SPMD grid=[2] mega-kernel for N={N}")
+    print(f"  z scratch HBM (shared): {6 * n_flat * C_z * 2 / 1024 / 1024:.1f} MB")
+    print(f"  z bias HBM (shared):    {n_flat * H_z * 2 / 1024 / 1024:.1f} MB")
+    print(f"  s scratch (Q/K/V/gate): {4 * N * C_s * 2 / 1024 / 1024:.1f} MB")
+    print(f"  z→s bias scratch:       {n_flat * H_s * 2 / 1024 / 1024:.1f} MB")
+    print(f"  s intermediate:         {N * C_s * 2 / 1024 / 1024:.3f} MB")
+    print(f"  SPMD: 2 physical cores, work split per phase")
+    print(f"{'=' * 70}")
+
+    # Load model
+    print("\nLoading model...")
+    t0 = time.time()
+    boltz = Boltz2.load_from_checkpoint(
+        os.path.expanduser("~/.boltz/boltz2_conf.ckpt"),
+        strict=True,
+        predict_args={
+            "recycling_steps": 3,
+            "sampling_steps": 20,
+            "diffusion_samples": 1,
+            "max_parallel_samples": 1,
+            "write_confidence_summary": True,
+            "write_full_pae": True,
+            "write_full_pde": True,
+        },
+        map_location="cpu",
+        diffusion_process_args=asdict(Boltz2DiffusionParams()),
+        ema=False,
+        use_kernels=False,
+        pairformer_args=asdict(PairformerArgsV2()),
+        msa_args=asdict(MSAModuleArgs(use_paired_feature=True)),
+        steering_args=asdict(BoltzSteeringParams()),
+    )
+    boltz.eval()
+    boltz = boltz.float()
+    t1 = time.time()
+    print(f"Model loaded in {t1 - t0:.1f}s")
+    layer = boltz.pairformer_module.layers[0]
+
+    # Extract all weights
+    print("Extracting weights...")
+    device = xm.xla_device()
+    w = extract_all_weights(layer)
+
+    # Move weights to device
+    for key in w:
+        w[key] = w[key].to(device)
+
+    # Create inputs
+    torch.manual_seed(42)
+    s = (torch.randn(N, C_s) * 0.1).to(torch.bfloat16).to(device)
+    z_flat = (torch.randn(n_flat, C_z) * 0.1).to(torch.bfloat16).to(device)
+    pm_flat = torch.ones(n_flat, 1, dtype=torch.bfloat16, device=device)
+    mask = torch.ones(N, 1, dtype=torch.bfloat16, device=device)
+
+    # Pre-allocate scratch buffers
+    scratch_buf = torch.zeros(6 * n_flat, C_z, dtype=torch.bfloat16, device=device)
+    bias_buf = torch.zeros(n_flat, H_z, dtype=torch.bfloat16, device=device)
+    s_scratch_q = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    s_scratch_k = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    s_scratch_v = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    s_scratch_gate = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    z_bias_scratch = torch.zeros(n_flat, H_s, dtype=torch.bfloat16, device=device)
+    s_intermediate = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+
+    xm.mark_step()
+    xm.wait_device_ops()
+
+    # Record NEFFs before
+    cache_base = "/var/tmp/neuron-compile-cache/"
+    cache_dirs = (
+        [os.path.join(cache_base, d) for d in os.listdir(cache_base)]
+        if os.path.isdir(cache_base)
+        else []
+    )
+    cache_dir = max(cache_dirs, key=os.path.getmtime) if cache_dirs else cache_base
+    neffs_before = set(glob.glob(os.path.join(cache_dir, "*/model.neff")))
+
+    print("\nCompiling FULL PairformerLayer SPMD grid=[2] mega-kernel...")
+    print("  (2 physical cores, all 7 sub-operations)")
+    print(f"  Total weight params passed: {len(w)}")
+
+    t0 = time.time()
+
+    # Launch with grid=[2]
+    s_out, z_out = full_pairformer_layer_spmd[2](
+        # Main tensors
+        s,
+        z_flat,
+        pm_flat,
+        mask,
+        # PairBiasAttention weights (Step 1)
+        w["pba_norm_s_w"],
+        w["pba_norm_s_b"],
+        w["pba_norm_z_w"],
+        w["pba_norm_z_b"],
+        w["pba_q_w"],
+        w["pba_q_b"],
+        w["pba_k_w"],
+        w["pba_v_w"],
+        w["pba_z_w"],
+        w["pba_g_w"],
+        w["pba_o_w"],
+        # TriMulOut weights (Step 2)
+        w["tmul_out_norm_in_w"],
+        w["tmul_out_norm_in_b"],
+        w["tmul_out_p_in_w"],
+        w["tmul_out_g_in_w"],
+        w["tmul_out_norm_out_w"],
+        w["tmul_out_norm_out_b"],
+        w["tmul_out_p_out_w"],
+        w["tmul_out_g_out_w"],
+        # TriMulIn weights (Step 3)
+        w["tmul_in_norm_in_w"],
+        w["tmul_in_norm_in_b"],
+        w["tmul_in_p_in_w"],
+        w["tmul_in_g_in_w"],
+        w["tmul_in_norm_out_w"],
+        w["tmul_in_norm_out_b"],
+        w["tmul_in_p_out_w"],
+        w["tmul_in_g_out_w"],
+        # TriAttnStart weights (Step 4)
+        w["tatt_s_ln_w"],
+        w["tatt_s_ln_b"],
+        w["tatt_s_bias_proj_w"],
+        w["tatt_s_q_w"],
+        w["tatt_s_k_w"],
+        w["tatt_s_v_w"],
+        w["tatt_s_g_w"],
+        w["tatt_s_o_w"],
+        # TriAttnEnd weights (Step 5)
+        w["tatt_e_ln_w"],
+        w["tatt_e_ln_b"],
+        w["tatt_e_bias_proj_w"],
+        w["tatt_e_q_w"],
+        w["tatt_e_k_w"],
+        w["tatt_e_v_w"],
+        w["tatt_e_g_w"],
+        w["tatt_e_o_w"],
+        # Transition_s weights (Step 6a)
+        w["trans_s_norm_w"],
+        w["trans_s_norm_b"],
+        w["trans_s_fc1_w"],
+        w["trans_s_fc2_w"],
+        w["trans_s_fc3_w"],
+        # Transition_z weights (Step 6b)
+        w["trans_z_norm_w"],
+        w["trans_z_norm_b"],
+        w["trans_z_fc1_w"],
+        w["trans_z_fc2_w"],
+        w["trans_z_fc3_w"],
+        # Scratch buffers
+        scratch_buf,
+        bias_buf,
+        s_scratch_q,
+        s_scratch_k,
+        s_scratch_v,
+        s_scratch_gate,
+        z_bias_scratch,
+        s_intermediate,
+        # Constants
+        N=N,
+    )
+    xm.mark_step()
+    xm.wait_device_ops()
+    t1 = time.time()
+    print(f"Compilation + execution: {t1 - t0:.1f}s")
+
+    # Verify outputs
+    s_out_cpu = s_out.cpu()
+    z_out_cpu = z_out.cpu()
+    print(f"\ns_out shape: {s_out_cpu.shape}, dtype: {s_out_cpu.dtype}")
+    print(
+        f"s_out range: [{s_out_cpu.float().min():.4f}, {s_out_cpu.float().max():.4f}]"
+    )
+    print(f"s_out mean:  {s_out_cpu.float().mean():.4f}")
+    print(f"\nz_out shape: {z_out_cpu.shape}, dtype: {z_out_cpu.dtype}")
+    print(
+        f"z_out range: [{z_out_cpu.float().min():.4f}, {z_out_cpu.float().max():.4f}]"
+    )
+    print(f"z_out mean:  {z_out_cpu.float().mean():.4f}")
+
+    # Check for NaN
+    if torch.isnan(s_out_cpu).any():
+        print("\nWARNING: s_out contains NaN values!")
+    if torch.isnan(z_out_cpu).any():
+        print("\nWARNING: z_out contains NaN values!")
+
+    # Find new NEFF
+    neffs_after = set(glob.glob(os.path.join(cache_dir, "*/model.neff")))
+    new_neffs = neffs_after - neffs_before
+
+    if new_neffs:
+        for neff in sorted(new_neffs):
+            size_mb = os.path.getsize(neff) / 1024 / 1024
+            print(f"\nNew NEFF: {neff}")
+            print(f"  Size: {size_mb:.1f} MB")
+            import shutil
+
+            dest = f"/tmp/full_layer_spmd_N{N}.neff"
+            shutil.copy2(neff, dest)
+            print(f"  Copied to: {dest}")
+            print(f"\nBenchmark command:")
+            print(
+                f"  neuron-bench exec --enable-only-latency -w 5 -n 50 -f random --fixed-nc-count 1 {dest}"
+            )
+    else:
+        print("\nNo new NEFFs found (may have used cached)")
+        all_neffs = list(neffs_after)
+        if all_neffs:
+            all_neffs.sort(key=os.path.getmtime, reverse=True)
+            neff = all_neffs[0]
+            size_mb = os.path.getsize(neff) / 1024 / 1024
+            print(f"Most recent NEFF: {neff}")
+            print(f"  Size: {size_mb:.1f} MB")
+            import shutil
+
+            dest = f"/tmp/full_layer_spmd_N{N}.neff"
+            shutil.copy2(neff, dest)
+            print(f"  Copied to: {dest}")
+            print(f"\nBenchmark command:")
+            print(
+                f"  neuron-bench exec --enable-only-latency -w 5 -n 50 -f random --fixed-nc-count 1 {dest}"
+            )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/contrib/models/Boltz-2/test/integration/test_full_layer_spmd.py b/contrib/models/Boltz-2/test/integration/test_full_layer_spmd.py
new file mode 100644
index 00000000..9bf410bb
--- /dev/null
+++ b/contrib/models/Boltz-2/test/integration/test_full_layer_spmd.py
@@ -0,0 +1,499 @@
+#!/usr/bin/env python3
+"""Test harness for the FULL PairformerLayer SPMD mega-kernel.
+
+Validates the mega-kernel output against the CPU reference implementation
+of all 7 sub-operations in a PairformerLayer:
+    Step 1:  s = s + PairBiasAttn(s, z, mask)
+    Step 2:  z = z + TriMulOut(z, pair_mask)
+    Step 3:  z = z + TriMulIn(z, pair_mask)
+    Step 4:  z = z + TriAttnStart(z)
+    Step 5:  z = z + TriAttnEnd(z)
+    Step 6a: s = s + Transition_s(s)
+    Step 6b: z = z + Transition_z(z)
+
+Usage:
+    # On the trn2 instance:
+    source /opt/aws_neuronx_venv_pytorch_2_9/bin/activate
+    NEURON_PLATFORM_TARGET_OVERRIDE=trn2 NEURON_RT_VISIBLE_CORES=0 \
+        python test_full_layer_spmd.py --N 128
+
+    # CPU-only reference (for debugging):
+    python test_full_layer_spmd.py --N 128 --cpu-only
+
+Requirements:
+    - Boltz-2 pip package installed (boltz v2.2.1)
+    - torch-neuronx, neuronxcc (for Neuron compilation)
+    - Model weights at ~/.boltz/boltz2_conf.ckpt
+"""
+
+import argparse
+import os
+import time
+
+import torch
+
+from boltz.main import (
+    Boltz2,
+    PairformerArgsV2,
+    Boltz2DiffusionParams,
+    MSAModuleArgs,
+    BoltzSteeringParams,
+)
+from dataclasses import asdict
+
+P_MAX = 128
+
+
+def cosine_similarity(a, b):
+    """Compute cosine similarity between two tensors."""
+    a_flat = a.float().flatten()
+    b_flat = b.float().flatten()
+    return (torch.dot(a_flat, b_flat) / (a_flat.norm() * b_flat.norm())).item()
+
+
+def run_cpu_reference(layer, s, z, mask, pair_mask):
+    """Run the full PairformerLayer forward pass on CPU as reference.
+
+    Args:
+        layer: PairformerLayer (V2) in float32
+        s: [1, N, C_s=384] float32
+        z: [1, N, N, C_z=128] float32
+        mask: [1, N] float32
+        pair_mask: [1, N, N] float32
+
+    Returns:
+        s_ref: [1, N, C_s] float32
+        z_ref: [1, N, N, C_z] float32
+        intermediates: list of (name, s_or_z_clone) for per-step comparison
+    """
+    s_ref = s.clone().float()
+    z_ref = z.clone().float()
+    intermediates = []
+
+    with torch.no_grad():
+        # Step 1: PairBiasAttention (s = s + PBA(s, z))
+        # V2: pre_norm_s is external to the attention module
+        s_normed = layer.pre_norm_s(s_ref)
+        s_delta = layer.attention(s=s_normed, z=z_ref, mask=mask, k_in=s_normed)
+        s_ref = s_ref + s_delta
+        intermediates.append(
+            ("pair_bias_attn", {"s": s_ref.clone(), "z": z_ref.clone()})
+        )
+
+        # Step 2: TriMulOut
+        z_delta = layer.tri_mul_out(z_ref, mask=pair_mask, use_kernels=False)
+        z_ref = z_ref + z_delta
+        intermediates.append(("tri_mul_out", {"s": s_ref.clone(), "z": z_ref.clone()}))
+
+        # Step 3: TriMulIn
+        z_delta = layer.tri_mul_in(z_ref, mask=pair_mask, use_kernels=False)
+        z_ref = z_ref + z_delta
+        intermediates.append(("tri_mul_in", {"s": s_ref.clone(), "z": z_ref.clone()}))
+
+        # Step 4: TriAttnStart
+        z_delta = layer.tri_att_start(
+            z_ref, mask=pair_mask, chunk_size=None, use_kernels=False
+        )
+        z_ref = z_ref + z_delta
+        intermediates.append(
+            ("tri_att_start", {"s": s_ref.clone(), "z": z_ref.clone()})
+        )
+
+        # Step 5: TriAttnEnd
+        z_delta = layer.tri_att_end(
+            z_ref, mask=pair_mask, chunk_size=None, use_kernels=False
+        )
+        z_ref = z_ref + z_delta
+        intermediates.append(("tri_att_end", {"s": s_ref.clone(), "z": z_ref.clone()}))
+
+        # Step 6a: Transition_s
+        s_delta = layer.transition_s(s_ref)
+        s_ref = s_ref + s_delta
+        intermediates.append(("transition_s", {"s": s_ref.clone(), "z": z_ref.clone()}))
+
+        # Step 6b: Transition_z
+        z_delta = layer.transition_z(z_ref)
+        z_ref = z_ref + z_delta
+        intermediates.append(("transition_z", {"s": s_ref.clone(), "z": z_ref.clone()}))
+
+    return s_ref, z_ref, intermediates
+
+
+def load_model(layer_idx=0):
+    """Load Boltz-2 model and return the specified pairformer layer."""
+    print("Loading Boltz-2 model...")
+    t0 = time.time()
+    boltz = Boltz2.load_from_checkpoint(
+        os.path.expanduser("~/.boltz/boltz2_conf.ckpt"),
+        strict=True,
+        predict_args={
+            "recycling_steps": 3,
+            "sampling_steps": 20,
+            "diffusion_samples": 1,
+            "max_parallel_samples": 1,
+            "write_confidence_summary": True,
+            "write_full_pae": True,
+            "write_full_pde": True,
+        },
+        map_location="cpu",
+        diffusion_process_args=asdict(Boltz2DiffusionParams()),
+        ema=False,
+        use_kernels=False,
+        pairformer_args=asdict(PairformerArgsV2()),
+        msa_args=asdict(MSAModuleArgs(use_paired_feature=True)),
+        steering_args=asdict(BoltzSteeringParams()),
+    )
+    boltz.eval()
+    boltz = boltz.float()
+    t1 = time.time()
+    print(f"Model loaded in {t1 - t0:.1f}s")
+    layer = boltz.pairformer_module.layers[layer_idx]
+    print(f"Using pairformer layer {layer_idx}")
+    return layer
+
+
+def extract_all_weights(layer):
+    """Extract ALL weights for the full layer mega-kernel. Same as compile script."""
+    weight_map = [
+        # PairBiasAttention (Step 1)
+        ("pba_norm_s_w", "pre_norm_s.weight"),
+        ("pba_norm_s_b", "pre_norm_s.bias"),
+        ("pba_norm_z_w", "attention.proj_z.0.weight"),
+        ("pba_norm_z_b", "attention.proj_z.0.bias"),
+        ("pba_q_w", "attention.proj_q.weight"),
+        ("pba_q_b", "attention.proj_q.bias"),
+        ("pba_k_w", "attention.proj_k.weight"),
+        ("pba_v_w", "attention.proj_v.weight"),
+        ("pba_z_w", "attention.proj_z.1.weight"),
+        ("pba_g_w", "attention.proj_g.weight"),
+        ("pba_o_w", "attention.proj_o.weight"),
+        # TriMulOut (Step 2)
+        ("tmul_out_norm_in_w", "tri_mul_out.norm_in.weight"),
+        ("tmul_out_norm_in_b", "tri_mul_out.norm_in.bias"),
+        ("tmul_out_p_in_w", "tri_mul_out.p_in.weight"),
+        ("tmul_out_g_in_w", "tri_mul_out.g_in.weight"),
+        ("tmul_out_norm_out_w", "tri_mul_out.norm_out.weight"),
+        ("tmul_out_norm_out_b", "tri_mul_out.norm_out.bias"),
+        ("tmul_out_p_out_w", "tri_mul_out.p_out.weight"),
+        ("tmul_out_g_out_w", "tri_mul_out.g_out.weight"),
+        # TriMulIn (Step 3)
+        ("tmul_in_norm_in_w", "tri_mul_in.norm_in.weight"),
+        ("tmul_in_norm_in_b", "tri_mul_in.norm_in.bias"),
+        ("tmul_in_p_in_w", "tri_mul_in.p_in.weight"),
+        ("tmul_in_g_in_w", "tri_mul_in.g_in.weight"),
+        ("tmul_in_norm_out_w", "tri_mul_in.norm_out.weight"),
+        ("tmul_in_norm_out_b", "tri_mul_in.norm_out.bias"),
+        ("tmul_in_p_out_w", "tri_mul_in.p_out.weight"),
+        ("tmul_in_g_out_w", "tri_mul_in.g_out.weight"),
+        # TriAttnStart (Step 4)
+        ("tatt_s_ln_w", "tri_att_start.layer_norm.weight"),
+        ("tatt_s_ln_b", "tri_att_start.layer_norm.bias"),
+        ("tatt_s_bias_proj_w", "tri_att_start.linear.weight"),
+        ("tatt_s_q_w", "tri_att_start.mha.linear_q.weight"),
+        ("tatt_s_k_w", "tri_att_start.mha.linear_k.weight"),
+        ("tatt_s_v_w", "tri_att_start.mha.linear_v.weight"),
+        ("tatt_s_g_w", "tri_att_start.mha.linear_g.weight"),
+        ("tatt_s_o_w", "tri_att_start.mha.linear_o.weight"),
+        # TriAttnEnd (Step 5)
+        ("tatt_e_ln_w", "tri_att_end.layer_norm.weight"),
+        ("tatt_e_ln_b", "tri_att_end.layer_norm.bias"),
+        ("tatt_e_bias_proj_w", "tri_att_end.linear.weight"),
+        ("tatt_e_q_w", "tri_att_end.mha.linear_q.weight"),
+        ("tatt_e_k_w", "tri_att_end.mha.linear_k.weight"),
+        ("tatt_e_v_w", "tri_att_end.mha.linear_v.weight"),
+        ("tatt_e_g_w", "tri_att_end.mha.linear_g.weight"),
+        ("tatt_e_o_w", "tri_att_end.mha.linear_o.weight"),
+        # Transition_s (Step 6a)
+        ("trans_s_norm_w", "transition_s.norm.weight"),
+        ("trans_s_norm_b", "transition_s.norm.bias"),
+        ("trans_s_fc1_w", "transition_s.fc1.weight"),
+        ("trans_s_fc2_w", "transition_s.fc2.weight"),
+        ("trans_s_fc3_w", "transition_s.fc3.weight"),
+        # Transition_z (Step 6b)
+        ("trans_z_norm_w", "transition_z.norm.weight"),
+        ("trans_z_norm_b", "transition_z.norm.bias"),
+        ("trans_z_fc1_w", "transition_z.fc1.weight"),
+        ("trans_z_fc2_w", "transition_z.fc2.weight"),
+        ("trans_z_fc3_w", "transition_z.fc3.weight"),
+    ]
+
+    w = {}
+    for buf_name, attr_path in weight_map:
+        obj = layer
+        for p in attr_path.split("."):
+            obj = getattr(obj, p)
+        v = obj.data.clone().to(torch.bfloat16)
+        if v.dim() == 1:
+            v = v.unsqueeze(0).expand(P_MAX, -1).contiguous()
+        w[buf_name] = v
+
+    return w
+
+
+def run_full_kernel(layer, s, z, mask, pair_mask, weights_bf16, device, N, C_z, C_s):
+    """Run the full PairformerLayer SPMD mega-kernel and compare to CPU reference."""
+    import torch_xla.core.xla_model as xm
+    from full_pairformer_layer_spmd import full_pairformer_layer_spmd
+
+    H_z = 4
+    H_s = 16
+    n_flat = N * N
+
+    # CPU reference (all 7 steps)
+    print("Running CPU reference (all 7 steps)...")
+    t0 = time.time()
+    s_ref, z_ref, intermediates = run_cpu_reference(layer, s, z, mask, pair_mask)
+    t1 = time.time()
+    print(f"CPU reference: {t1 - t0:.3f}s")
+
+    for name, vals in intermediates:
+        s_v = vals["s"]
+        z_v = vals["z"]
+        print(
+            f"  After {name}: s mean={s_v.mean():.6f} std={s_v.std():.6f} | "
+            f"z mean={z_v.mean():.6f} std={z_v.std():.6f}"
+        )
+
+    # Prepare inputs for kernel (flatten z to [N*N, C_z], remove batch dim from s)
+    s_flat = s[0].clone().to(torch.bfloat16).contiguous().to(device)
+    z_flat = z[0].reshape(N * N, C_z).clone().to(torch.bfloat16).contiguous().to(device)
+    pm_flat = pair_mask[0].reshape(N * N, 1).to(torch.bfloat16).contiguous().to(device)
+    mask_flat = mask[0].unsqueeze(-1).to(torch.bfloat16).contiguous().to(device)
+
+    # Pre-allocate scratch buffers
+    scratch_buf = torch.zeros(6 * n_flat, C_z, dtype=torch.bfloat16, device=device)
+    bias_buf = torch.zeros(n_flat, H_z, dtype=torch.bfloat16, device=device)
+    s_scratch_q = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    s_scratch_k = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    s_scratch_v = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    s_scratch_gate = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+    z_bias_scratch = torch.zeros(n_flat, H_s, dtype=torch.bfloat16, device=device)
+    s_intermediate = torch.zeros(N, C_s, dtype=torch.bfloat16, device=device)
+
+    xm.mark_step()
+    xm.wait_device_ops()
+
+    w = weights_bf16
+    print("\nCompiling + running full layer SPMD mega-kernel...")
+    t0 = time.time()
+
+    s_out, z_out = full_pairformer_layer_spmd[2](
+        s_flat,
+        z_flat,
+        pm_flat,
+        mask_flat,
+        # PBA weights
+        w["pba_norm_s_w"],
+        w["pba_norm_s_b"],
+        w["pba_norm_z_w"],
+        w["pba_norm_z_b"],
+        w["pba_q_w"],
+        w["pba_q_b"],
+        w["pba_k_w"],
+        w["pba_v_w"],
+        w["pba_z_w"],
+        w["pba_g_w"],
+        w["pba_o_w"],
+        # TriMulOut
+        w["tmul_out_norm_in_w"],
+        w["tmul_out_norm_in_b"],
+        w["tmul_out_p_in_w"],
+        w["tmul_out_g_in_w"],
+        w["tmul_out_norm_out_w"],
+        w["tmul_out_norm_out_b"],
+        w["tmul_out_p_out_w"],
+        w["tmul_out_g_out_w"],
+        # TriMulIn
+        w["tmul_in_norm_in_w"],
+        w["tmul_in_norm_in_b"],
+        w["tmul_in_p_in_w"],
+        w["tmul_in_g_in_w"],
+        w["tmul_in_norm_out_w"],
+        w["tmul_in_norm_out_b"],
+        w["tmul_in_p_out_w"],
+        w["tmul_in_g_out_w"],
+        # TriAttnStart
+        w["tatt_s_ln_w"],
+        w["tatt_s_ln_b"],
+        w["tatt_s_bias_proj_w"],
+        w["tatt_s_q_w"],
+        w["tatt_s_k_w"],
+        w["tatt_s_v_w"],
+        w["tatt_s_g_w"],
+        w["tatt_s_o_w"],
+        # TriAttnEnd
+        w["tatt_e_ln_w"],
+        w["tatt_e_ln_b"],
+        w["tatt_e_bias_proj_w"],
+        w["tatt_e_q_w"],
+        w["tatt_e_k_w"],
+        w["tatt_e_v_w"],
+        w["tatt_e_g_w"],
+        w["tatt_e_o_w"],
+        # Transition_s
+        w["trans_s_norm_w"],
+        w["trans_s_norm_b"],
+        w["trans_s_fc1_w"],
+        w["trans_s_fc2_w"],
+        w["trans_s_fc3_w"],
+        # Transition_z
+        w["trans_z_norm_w"],
+        w["trans_z_norm_b"],
+        w["trans_z_fc1_w"],
+        w["trans_z_fc2_w"],
+        w["trans_z_fc3_w"],
+        # Scratch buffers
+        scratch_buf,
+        bias_buf,
+        s_scratch_q,
+        s_scratch_k,
+        s_scratch_v,
+        s_scratch_gate,
+        z_bias_scratch,
+        s_intermediate,
+        N=N,
+    )
+    xm.mark_step()
+    xm.wait_device_ops()
+    t1 = time.time()
+    print(f"Compilation + first run: {t1 - t0:.1f}s")
+
+    # Compare
+    s_result = s_out.cpu().reshape(N, C_s).float()
+    z_result = z_out.cpu().reshape(N, N, C_z).float()
+    s_ref_squeezed = s_ref.squeeze(0)
+    z_ref_squeezed = z_ref.squeeze(0)
+
+    s_cos = cosine_similarity(s_result, s_ref_squeezed)
+    s_max_diff = (s_result - s_ref_squeezed).abs().max().item()
+    s_mean_diff = (s_result - s_ref_squeezed).abs().mean().item()
+
+    z_cos = cosine_similarity(z_result, z_ref_squeezed)
+    z_max_diff = (z_result - z_ref_squeezed).abs().max().item()
+    z_mean_diff = (z_result - z_ref_squeezed).abs().mean().item()
+
+    print()
+    print(f"{'=' * 60}")
+    print(f"Full PairformerLayer Mega-Kernel Results (N={N})")
+    print(f"{'=' * 60}")
+    print(f"\n--- s output ---")
+    print(f"  Cosine similarity:  {s_cos:.6f}")
+    print(f"  Max abs diff:       {s_max_diff:.6f}")
+    print(f"  Mean abs diff:      {s_mean_diff:.6f}")
+    print(f"  Ref mean:           {s_ref_squeezed.mean():.6f}")
+    print(f"  Kernel mean:        {s_result.mean():.6f}")
+    print(f"  Ref std:            {s_ref_squeezed.std():.6f}")
+    print(f"  Kernel std:         {s_result.std():.6f}")
+
+    print(f"\n--- z output ---")
+    print(f"  Cosine similarity:  {z_cos:.6f}")
+    print(f"  Max abs diff:       {z_max_diff:.6f}")
+    print(f"  Mean abs diff:      {z_mean_diff:.6f}")
+    print(f"  Ref mean:           {z_ref_squeezed.mean():.6f}")
+    print(f"  Kernel mean:        {z_result.mean():.6f}")
+    print(f"  Ref std:            {z_ref_squeezed.std():.6f}")
+    print(f"  Kernel std:         {z_result.std():.6f}")
+
+    # NaN check
+    s_nan = torch.isnan(s_result).any().item()
+    z_nan = torch.isnan(z_result).any().item()
+    if s_nan:
+        print(f"\n  WARNING: s_out contains NaN!")
+    if z_nan:
+        print(f"\n  WARNING: z_out contains NaN!")
+
+    # Verdict
+    print(f"\n--- Verdict ---")
+    if s_nan or z_nan:
+        print(f"  FAIL (NaN detected)")
+    elif s_cos > 0.999 and z_cos > 0.999:
+        print(f"  PASS (s_cos={s_cos:.4f} > 0.999, z_cos={z_cos:.4f} > 0.999)")
+    elif s_cos > 0.99 and z_cos > 0.99:
+        print(f"  MARGINAL PASS (s_cos={s_cos:.4f} > 0.99, z_cos={z_cos:.4f} > 0.99)")
+    else:
+        print(f"  FAIL (s_cos={s_cos:.4f}, z_cos={z_cos:.4f} — need > 0.99)")
+
+    return s_cos, z_cos
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Test full PairformerLayer SPMD mega-kernel"
+    )
+    parser.add_argument(
+        "--N", type=int, default=128, help="Sequence length (multiple of 128)"
+    )
+    parser.add_argument(
+        "--layer", type=int, default=0, help="Which pairformer layer (0-63)"
+    )
+    parser.add_argument(
+        "--cpu-only", action="store_true", help="Only run CPU reference"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    args = parser.parse_args()
+
+    N = args.N
+    C_z = 128
+    C_s = 384
+
+    assert N % 128 == 0, f"N must be a multiple of 128, got {N}"
+
+    print(
+        f"=== Full PairformerLayer SPMD Mega-Kernel Test: N={N}, layer={args.layer} ==="
+    )
+    print()
+
+    # Load model
+    layer = load_model(args.layer)
+
+    # Create test inputs (with batch dim for CPU reference)
+    print(f"Creating test inputs: N={N}")
+    torch.manual_seed(args.seed)
+    s = torch.randn(1, N, C_s, dtype=torch.float32) * 0.1
+    z = torch.randn(1, N, N, C_z, dtype=torch.float32) * 0.1
+    mask = torch.ones(1, N, dtype=torch.float32)
+    pair_mask = torch.ones(1, N, N, dtype=torch.float32)
+
+    if args.cpu_only:
+        print("Running CPU reference...")
+        t0 = time.time()
+        s_ref, z_ref, intermediates = run_cpu_reference(layer, s, z, mask, pair_mask)
+        t1 = time.time()
+        print(f"CPU reference: {t1 - t0:.3f}s")
+        for name, vals in intermediates:
+            sv = vals["s"]
+            zv = vals["z"]
+            print(
+                f"  After {name}: s mean={sv.mean():.6f} std={sv.std():.6f} | "
+                f"z mean={zv.mean():.6f} std={zv.std():.6f}"
+            )
+        print(f"\nFinal s: mean={s_ref.mean():.6f}, std={s_ref.std():.6f}")
+        print(f"Final z: mean={z_ref.mean():.6f}, std={z_ref.std():.6f}")
+        print("\nCPU-only mode, exiting.")
+        return
+
+    # Import Neuron packages
+    try:
+        import torch_neuronx
+        import torch_xla.core.xla_model as xm
+    except ImportError as e:
+        print(f"Cannot import Neuron packages: {e}")
+        print("Run this script on the trn2 instance.")
+        return
+
+    device = xm.xla_device()
+    print(f"XLA device: {device}")
+
+    # Extract and reshape weights
+    print("Extracting weights...")
+    weights_bf16 = extract_all_weights(layer)
+
+    # Move weights to device
+    for key in weights_bf16:
+        weights_bf16[key] = weights_bf16[key].to(device)
+
+    run_full_kernel(layer, s, z, mask, pair_mask, weights_bf16, device, N, C_z, C_s)
+
+
+if __name__ == "__main__":
+    main()