From acc9761c5bc83a29a9396ecae0b8b7bd935db44a Mon Sep 17 00:00:00 2001
From: Cursor Agent
Date: Tue, 5 May 2026 19:23:55 +0000
Subject: [PATCH 1/2] deps: bump opacus to >=1.6.0

Raise the Opacus floor and refresh uv.lock so installs resolve to 1.6.0,
which adds non-wrapping mode, FSDP/mixed-precision DP improvements, and
assorted accountant/clipping fixes, while staying aligned with torch>=2.6.

Co-authored-by: Michi Platzer
---
 pyproject.toml |  2 +-
 uv.lock        | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a8bf42f9..ebd4ef26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "accelerate>=1.5.0",
     "peft>=0.18.2",  # transformers 5.7+ checks min PEFT in model.add_adapter (integrations/peft.py)
     "huggingface-hub[hf-xet]>=0.30.2",
-    "opacus>=1.5.4",
+    "opacus>=1.6.0",
     "xgrammar>=0.1.32,<1.0.0",  # aligned with vllm 0.20
     "json-repair>=0.47.0",
     "torch>=2.11.0,<2.12.0",
diff --git a/uv.lock b/uv.lock
index a3b78957..9fa407ff 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2275,7 +2275,7 @@ wheels = [
 
 [[package]]
 name = "mostlyai-engine"
-version = "2.6.0"
+version = "2.6.1"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -2336,7 +2336,7 @@ requires-dist = [
     { name = "joblib", specifier = ">=1.4.2" },
     { name = "json-repair", specifier = ">=0.47.0" },
     { name = "numpy", specifier = ">=2.0.0" },
-    { name = "opacus", specifier = ">=1.5.4" },
+    { name = "opacus", specifier = ">=1.6.0" },
     { name = "pandas", specifier = ">=2.2.0" },
     { name = "peft", specifier = ">=0.18.2" },
     { name = "psutil", specifier = ">=5.9.5,<6" },
@@ -2863,7 +2863,7 @@ wheels = [
 
 [[package]]
 name = "opacus"
-version = "1.5.4"
+version = "1.6.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy" },
@@ -2871,9 +2871,9 @@ dependencies = [
     { name = "scipy" },
     { name = "torch" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/df/de/25e455b62c73f10bf41d3e029016881e414de0c6f3f5af82b42dc10f7ca1/opacus-1.5.4.tar.gz", hash = "sha256:ea700808c0a8c3fb7e565ed55b77b3170d6a54cb30e295b255e3877170edc5ca", size = 151407, upload-time = "2025-05-27T16:23:57.514Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/31/39/a92b485f5a9c65bdd18c574d24c3b17800e39f02d66e665ab9bfbde77c52/opacus-1.6.0.tar.gz", hash = "sha256:20d77252c1e8528cd5826fbaba345cd39ed5c3edb841707eafb6403fe276140a", size = 184592, upload-time = "2026-05-05T18:15:26.103Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a7/7e/8c9e798b1789861beec036b456b71a079f83963c7e2814814f1370dea667/opacus-1.5.4-py3-none-any.whl", hash = "sha256:a0ca27974e825d86635c82f08ebb380a373613c121066bc0b8e29e42d9f961a5", size = 254361, upload-time = "2025-05-27T16:23:55.827Z" },
+    { url = "https://files.pythonhosted.org/packages/45/2d/cb1b04cc674f3e06dbe1afc02ed977f066f39c7a4a313372353f82c26117/opacus-1.6.0-py3-none-any.whl", hash = "sha256:342123624c56c09d47eaba44010ed216d51c63132b543d67cba53a49ed6dc41e", size = 308911, upload-time = "2026-05-05T18:15:24.479Z" },
 ]
 
 [[package]]
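For context on the second patch below: a minimal sketch of the two CTXSEQ batch layouts it distinguishes. The toy rows and variable names are illustrative only and do not appear in the repo.

    import torch

    # Two variable-length CTXSEQ rows (toy data).
    rows = [[3, 1, 4], [1, 5]]

    # Non-DP path: keep the ragged rows as a NestedTensor.
    nested = torch.nested.as_nested_tensor([torch.tensor(r, dtype=torch.int64) for r in rows])

    # The model pads on demand with -1 before embedding:
    padded = torch.nested.to_padded_tensor(nested, padding=-1)
    # tensor([[ 3,  1,  4],
    #         [ 1,  5, -1]])

The DP path produces the padded dense form directly at collate time, so Opacus's per-sample gradient hooks never see a NestedTensor.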
From e0afbb690099149c162ec75492afc14ea9ee4d81 Mon Sep 17 00:00:00 2001
From: Cursor Agent
Date: Wed, 6 May 2026 06:02:04 +0000
Subject: [PATCH 2/2] fix(tabular): avoid NestedTensor CTXSEQ batches when
 using Opacus DP

Opacus 1.x per-sample gradient hooks hit NotImplementedError on
NestedTensorCPU (aten::new_empty). For DP training, collate CTXSEQ as
padded dense tensors with -1 padding; SequentialContextEmbedders already
masks -1 and maps it to embedding index 0. Non-DP sequential training
keeps the nested CTXSEQ collate, so its behavior is unchanged.

Co-authored-by: Michi Platzer
---
 mostlyai/engine/_tabular/argn.py     |  3 +-
 mostlyai/engine/_tabular/training.py | 46 +++++++++++++++++++++-------
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/mostlyai/engine/_tabular/argn.py b/mostlyai/engine/_tabular/argn.py
index b26d8a89..bb483388 100644
--- a/mostlyai/engine/_tabular/argn.py
+++ b/mostlyai/engine/_tabular/argn.py
@@ -289,7 +289,8 @@ def forward(self, x) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
         mask = None
         for sub_col in self.cardinalities:
             xs = torch.as_tensor(x[sub_col], device=self.device)
-            xs = torch.nested.to_padded_tensor(xs, padding=-1)
+            if xs.is_nested:
+                xs = torch.nested.to_padded_tensor(xs, padding=-1)
             mask = (xs != -1).squeeze(-1)
             xs = torch.where(xs == -1, torch.tensor(0), xs)
             xs = self.get(sub_col)(xs)
diff --git a/mostlyai/engine/_tabular/training.py b/mostlyai/engine/_tabular/training.py
index ea87dbb8..204632e9 100644
--- a/mostlyai/engine/_tabular/training.py
+++ b/mostlyai/engine/_tabular/training.py
@@ -138,10 +138,20 @@ class BatchCollator:
     For sequence data, it will sample subsequences with lengths up to max_sequence_window.
     """
 
-    def __init__(self, is_sequential: bool, max_sequence_window: int | None, device: torch.device):
+    def __init__(
+        self,
+        is_sequential: bool,
+        max_sequence_window: int | None,
+        device: torch.device,
+        *,
+        use_nested_ctxseq: bool = True,
+    ):
         self.is_sequential = is_sequential
         self.max_sequence_window = max_sequence_window
         self.device = device
+        # Opacus per-sample gradients do not support NestedTensor on CPU/CUDA; use padded
+        # dense tensors for CTXSEQ when training with DP (see test_tabular_sequential DP path).
+        self.use_nested_ctxseq = use_nested_ctxseq
 
     def __call__(self, batch: list[dict]) -> dict[str, torch.Tensor]:
         batch = pd.DataFrame(batch)
@@ -177,15 +187,26 @@ def _convert_to_tensors(self, batch: pd.DataFrame) -> dict[str, torch.Tensor]:
                     dim=-1,
                 )
             elif column.startswith(CTXSEQ):
-                # construct row tensors and convert the list to nested column tensor
-                tensors[column] = torch.unsqueeze(
-                    torch.nested.as_nested_tensor(
-                        [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]],
-                        dtype=torch.int64,
-                        device=self.device,
-                    ),
-                    dim=-1,
-                )
+                if self.use_nested_ctxseq:
+                    # construct row tensors and convert the list to nested column tensor
+                    tensors[column] = torch.unsqueeze(
+                        torch.nested.as_nested_tensor(
+                            [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]],
+                            dtype=torch.int64,
+                            device=self.device,
+                        ),
+                        dim=-1,
+                    )
+                else:
+                    # padded batch (variable-length rows); -1 marks padding (matches SequentialContextEmbedders)
+                    tensors[column] = torch.unsqueeze(
+                        torch.tensor(
+                            np.array(list(zip_longest(*batch[column], fillvalue=-1))).T,
+                            dtype=torch.int64,
+                            device=self.device,
+                        ),
+                        dim=-1,
+                    )
         return tensors
 
     @staticmethod
@@ -544,7 +565,10 @@ def train(
     # and see if it's possible to make it compatible with DP
 
     batch_collator = BatchCollator(
-        is_sequential=is_sequential, max_sequence_window=max_sequence_window, device=device
+        is_sequential=is_sequential,
+        max_sequence_window=max_sequence_window,
+        device=device,
+        use_nested_ctxseq=not with_dp,
     )
     disable_progress_bar()
     trn_dataset = load_dataset("parquet", data_files=[str(p) for p in workspace.encoded_data_trn.fetch_all()])[
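For reference, a standalone sketch of the padded DP collate path introduced above, together with the -1 handling that the commit message attributes to SequentialContextEmbedders. collate_ctxseq_padded is a hypothetical helper for illustration, not a function in the repo; it assumes rows of Python ints, as in the diff.

    from itertools import zip_longest

    import numpy as np
    import torch

    def collate_ctxseq_padded(rows):
        # zip_longest transposes the rows and right-pads shorter ones with -1;
        # transposing back gives shape (batch, max_len), then (batch, max_len, 1).
        padded = np.array(list(zip_longest(*rows, fillvalue=-1))).T
        return torch.unsqueeze(torch.tensor(padded, dtype=torch.int64), dim=-1)

    xs = collate_ctxseq_padded([[3, 1, 4], [1, 5]])
    mask = (xs != -1).squeeze(-1)                    # True at real (non-padded) positions
    xs = torch.where(xs == -1, torch.tensor(0), xs)  # map padding to embedding index 0

Because the guard in argn.py only converts tensors where xs.is_nested is true, a batch collated this way flows through forward() unchanged, and the -1 positions are masked exactly as in the nested path.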