From acc9761c5bc83a29a9396ecae0b8b7bd935db44a Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 5 May 2026 19:23:55 +0000
Subject: [PATCH 1/3] deps: bump opacus to >=1.6.0

Raise the Opacus floor and refresh uv.lock so installs resolve to 1.6.0,
which adds non-wrapping mode, FSDP/mixed-precision DP improvements, and
assorted accountant/clipping fixes while maintaining torch>=2.6 alignment.

Co-authored-by: Michi Platzer <michael.platzer@gmail.com>
---
 pyproject.toml |  2 +-
 uv.lock        | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a8bf42f9..ebd4ef26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "accelerate>=1.5.0",
     "peft>=0.18.2",  # transformers 5.7+ checks min PEFT in model.add_adapter (integrations/peft.py)
     "huggingface-hub[hf-xet]>=0.30.2",
-    "opacus>=1.5.4",
+    "opacus>=1.6.0",
     "xgrammar>=0.1.32,<1.0.0",  # aligned with vllm 0.20
     "json-repair>=0.47.0",
     "torch>=2.11.0,<2.12.0",
diff --git a/uv.lock b/uv.lock
index a3b78957..9fa407ff 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2275,7 +2275,7 @@ wheels = [
 
 [[package]]
 name = "mostlyai-engine"
-version = "2.6.0"
+version = "2.6.1"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -2336,7 +2336,7 @@ requires-dist = [
     { name = "joblib", specifier = ">=1.4.2" },
     { name = "json-repair", specifier = ">=0.47.0" },
     { name = "numpy", specifier = ">=2.0.0" },
-    { name = "opacus", specifier = ">=1.5.4" },
+    { name = "opacus", specifier = ">=1.6.0" },
     { name = "pandas", specifier = ">=2.2.0" },
     { name = "peft", specifier = ">=0.18.2" },
     { name = "psutil", specifier = ">=5.9.5,<6" },
@@ -2863,7 +2863,7 @@ wheels = [
 
 [[package]]
 name = "opacus"
-version = "1.5.4"
+version = "1.6.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy" },
@@ -2871,9 +2871,9 @@ dependencies = [
     { name = "scipy" },
     { name = "torch" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/df/de/25e455b62c73f10bf41d3e029016881e414de0c6f3f5af82b42dc10f7ca1/opacus-1.5.4.tar.gz", hash = "sha256:ea700808c0a8c3fb7e565ed55b77b3170d6a54cb30e295b255e3877170edc5ca", size = 151407, upload-time = "2025-05-27T16:23:57.514Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/31/39/a92b485f5a9c65bdd18c574d24c3b17800e39f02d66e665ab9bfbde77c52/opacus-1.6.0.tar.gz", hash = "sha256:20d77252c1e8528cd5826fbaba345cd39ed5c3edb841707eafb6403fe276140a", size = 184592, upload-time = "2026-05-05T18:15:26.103Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a7/7e/8c9e798b1789861beec036b456b71a079f83963c7e2814814f1370dea667/opacus-1.5.4-py3-none-any.whl", hash = "sha256:a0ca27974e825d86635c82f08ebb380a373613c121066bc0b8e29e42d9f961a5", size = 254361, upload-time = "2025-05-27T16:23:55.827Z" },
+    { url = "https://files.pythonhosted.org/packages/45/2d/cb1b04cc674f3e06dbe1afc02ed977f066f39c7a4a313372353f82c26117/opacus-1.6.0-py3-none-any.whl", hash = "sha256:342123624c56c09d47eaba44010ed216d51c63132b543d67cba53a49ed6dc41e", size = 308911, upload-time = "2026-05-05T18:15:24.479Z" },
 ]
 
 [[package]]

From e81881057c0c203c5c3a208749a8374036a455ea Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 5 May 2026 19:28:47 +0000
Subject: [PATCH 2/3] feat(language): use Opacus wrap_model=False for DP
 training

Attach per-sample gradient hooks without wrapping the model in GradSampleModule,
so the Hugging Face module hierarchy stays intact. Call GradSampleHooks.cleanup()
after the training loop to remove the hooks and the attributes Opacus patches
onto the parameters.

Depends on Opacus >= 1.6 (non-wrapping mode).

Co-authored-by: Michi Platzer <michael.platzer@gmail.com>
---
 mostlyai/engine/_language/training.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/mostlyai/engine/_language/training.py b/mostlyai/engine/_language/training.py
index b86c813e..57fef09a 100644
--- a/mostlyai/engine/_language/training.py
+++ b/mostlyai/engine/_language/training.py
@@ -31,7 +31,7 @@
 from huggingface_hub import get_safetensors_metadata
 from opacus import GradSampleModule, PrivacyEngine
 from opacus.accountants import GaussianAccountant, PRVAccountant, RDPAccountant
-from opacus.grad_sample import register_grad_sampler
+from opacus.grad_sample import GradSampleHooks, register_grad_sampler
 from opacus.utils.batch_memory_manager import wrap_data_loader
 from peft import LoraConfig, PeftModel
 from torch import nn
@@ -626,6 +626,7 @@ def concat_prompt_and_response(x):
             # this can help accelerate GPU compute
             torch.backends.cudnn.benchmark = True
 
+        dp_grad_sample_hooks: GradSampleHooks | None = None
         if with_dp:
             if isinstance(differential_privacy, DifferentialPrivacyConfig):
                 dp_config = differential_privacy.model_dump()
@@ -650,18 +651,21 @@ def concat_prompt_and_response(x):
                 privacy_engine.accountant.load_state_dict(
                     torch.load(workspace.model_dp_accountant_path, map_location=device, weights_only=True),
                 )
-            # Opacus will return the modified objects
-            # - model: wrapped in GradSampleModule and contains additional hooks for computing per-sample gradients
-            # - optimizer: wrapped in DPOptimizer and will do different operations during virtual steps and logical steps
-            # - dataloader: the dataloader with batch_sampler=UniformWithReplacementSampler (for Poisson sampling)
-            model, optimizer, trn_dataloader = privacy_engine.make_private(
+            # Opacus returns GradSampleHooks when wrap_model=False: hooks attach to the original module so HF /
+            # Transformers sees an unwrapped PreTrainedModel (requires Opacus >= 1.6).
+            # - dp_grad_sample_hooks: must call .cleanup() after training to remove backward hooks and param attrs
+            # - optimizer: wrapped in DPOptimizer (virtual vs logical steps)
+            # - dataloader: UniformWithReplacementSampler when poisson_sampling=True
+            dp_grad_sample_hooks, optimizer, trn_dataloader = privacy_engine.make_private(
                 module=model,
                 optimizer=optimizer,
                 data_loader=trn_dataloader,
                 noise_multiplier=dp_config.get("noise_multiplier"),
                 max_grad_norm=dp_config.get("max_grad_norm"),
                 poisson_sampling=True,
+                wrap_model=False,
             )
+            model = dp_grad_sample_hooks._module
             # this further wraps the dataloader with batch_sampler=BatchSplittingSampler to achieve gradient accumulation
             # it will split the sampled logical batches into smaller sub-batches with batch_size
             trn_dataloader = wrap_data_loader(
@@ -835,6 +839,9 @@ def concat_prompt_and_response(x):
             if total_training_time > max_training_time:
                 do_stop = True
 
+        if dp_grad_sample_hooks is not None:
+            dp_grad_sample_hooks.cleanup()
+
         # no checkpoint is saved yet because the training stopped before the first epoch ended
         if not model_checkpoint.has_saved_once():
             _LOG.info("saving model weights, as none were saved so far")

From e0afbb690099149c162ec75492afc14ea9ee4d81 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 6 May 2026 06:02:04 +0000
Subject: [PATCH 3/3] fix(tabular): avoid NestedTensor CTXSEQ batches when
 using Opacus DP

Opacus 1.x per-sample gradient hooks fail with NotImplementedError on
NestedTensorCPU inputs (aten::new_empty). For DP training, collate CTXSEQ columns
as dense tensors padded with -1; SequentialContextEmbedders already masks -1
entries and maps them to embedding index 0.

Non-DP sequential training keeps the nested-tensor CTXSEQ collation, so its
behavior is unchanged.

Co-authored-by: Michi Platzer <michael.platzer@gmail.com>
---
 mostlyai/engine/_tabular/argn.py     |  3 +-
 mostlyai/engine/_tabular/training.py | 46 +++++++++++++++++++++-------
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/mostlyai/engine/_tabular/argn.py b/mostlyai/engine/_tabular/argn.py
index b26d8a89..bb483388 100644
--- a/mostlyai/engine/_tabular/argn.py
+++ b/mostlyai/engine/_tabular/argn.py
@@ -289,7 +289,8 @@ def forward(self, x) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
         mask = None
         for sub_col in self.cardinalities:
             xs = torch.as_tensor(x[sub_col], device=self.device)
-            xs = torch.nested.to_padded_tensor(xs, padding=-1)
+            if xs.is_nested:
+                xs = torch.nested.to_padded_tensor(xs, padding=-1)
             mask = (xs != -1).squeeze(-1)
             xs = torch.where(xs == -1, torch.tensor(0), xs)
             xs = self.get(sub_col)(xs)
diff --git a/mostlyai/engine/_tabular/training.py b/mostlyai/engine/_tabular/training.py
index ea87dbb8..204632e9 100644
--- a/mostlyai/engine/_tabular/training.py
+++ b/mostlyai/engine/_tabular/training.py
@@ -138,10 +138,20 @@ class BatchCollator:
     For sequence data, it will sample subsequences with lengths up to max_sequence_window.
     """
 
-    def __init__(self, is_sequential: bool, max_sequence_window: int | None, device: torch.device):
+    def __init__(
+        self,
+        is_sequential: bool,
+        max_sequence_window: int | None,
+        device: torch.device,
+        *,
+        use_nested_ctxseq: bool = True,
+    ):
         self.is_sequential = is_sequential
         self.max_sequence_window = max_sequence_window
         self.device = device
+        # Opacus per-sample gradients do not support NestedTensor on CPU/CUDA; use padded
+        # dense tensors for CTXSEQ when training with DP (see test_tabular_sequential DP path).
+        self.use_nested_ctxseq = use_nested_ctxseq
 
     def __call__(self, batch: list[dict]) -> dict[str, torch.Tensor]:
         batch = pd.DataFrame(batch)
@@ -177,15 +187,26 @@ def _convert_to_tensors(self, batch: pd.DataFrame) -> dict[str, torch.Tensor]:
                     dim=-1,
                 )
             elif column.startswith(CTXSEQ):
-                # construct row tensors and convert the list to nested column tensor
-                tensors[column] = torch.unsqueeze(
-                    torch.nested.as_nested_tensor(
-                        [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]],
-                        dtype=torch.int64,
-                        device=self.device,
-                    ),
-                    dim=-1,
-                )
+                if self.use_nested_ctxseq:
+                    # construct row tensors and convert the list to nested column tensor
+                    tensors[column] = torch.unsqueeze(
+                        torch.nested.as_nested_tensor(
+                            [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]],
+                            dtype=torch.int64,
+                            device=self.device,
+                        ),
+                        dim=-1,
+                    )
+                else:
+                    # padded batch (variable-length rows); -1 marks padding (matches SequentialContextEmbedders)
+                    tensors[column] = torch.unsqueeze(
+                        torch.tensor(
+                            np.array(list(zip_longest(*batch[column], fillvalue=-1))).T,
+                            dtype=torch.int64,
+                            device=self.device,
+                        ),
+                        dim=-1,
+                    )
         return tensors
 
     @staticmethod
@@ -544,7 +565,10 @@ def train(
 
         # and see if it's possible to make it compatible with DP
         batch_collator = BatchCollator(
-            is_sequential=is_sequential, max_sequence_window=max_sequence_window, device=device
+            is_sequential=is_sequential,
+            max_sequence_window=max_sequence_window,
+            device=device,
+            use_nested_ctxseq=not with_dp,
         )
         disable_progress_bar()
         trn_dataset = load_dataset("parquet", data_files=[str(p) for p in workspace.encoded_data_trn.fetch_all()])[