From acc9761c5bc83a29a9396ecae0b8b7bd935db44a Mon Sep 17 00:00:00 2001
From: Cursor Agent
Date: Tue, 5 May 2026 19:23:55 +0000
Subject: [PATCH 1/2] deps: bump opacus to >=1.6.0

Raise the Opacus floor and refresh uv.lock so installs resolve to 1.6.0,
which adds non-wrapping mode, FSDP/mixed-precision DP improvements, and
assorted accountant/clipping fixes, while staying aligned with torch>=2.6.

Co-authored-by: Michi Platzer
---
 pyproject.toml |  2 +-
 uv.lock        | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a8bf42f9..ebd4ef26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "accelerate>=1.5.0",
     "peft>=0.18.2",  # transformers 5.7+ checks min PEFT in model.add_adapter (integrations/peft.py)
     "huggingface-hub[hf-xet]>=0.30.2",
-    "opacus>=1.5.4",
+    "opacus>=1.6.0",
     "xgrammar>=0.1.32,<1.0.0",  # aligned with vllm 0.20
     "json-repair>=0.47.0",
     "torch>=2.11.0,<2.12.0",
diff --git a/uv.lock b/uv.lock
index a3b78957..9fa407ff 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2275,7 +2275,7 @@ wheels = [
 
 [[package]]
 name = "mostlyai-engine"
-version = "2.6.0"
+version = "2.6.1"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -2336,7 +2336,7 @@ requires-dist = [
     { name = "joblib", specifier = ">=1.4.2" },
     { name = "json-repair", specifier = ">=0.47.0" },
     { name = "numpy", specifier = ">=2.0.0" },
-    { name = "opacus", specifier = ">=1.5.4" },
+    { name = "opacus", specifier = ">=1.6.0" },
     { name = "pandas", specifier = ">=2.2.0" },
     { name = "peft", specifier = ">=0.18.2" },
     { name = "psutil", specifier = ">=5.9.5,<6" },
@@ -2863,7 +2863,7 @@ wheels = [
 
 [[package]]
 name = "opacus"
-version = "1.5.4"
+version = "1.6.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy" },
@@ -2871,9 +2871,9 @@ dependencies = [
     { name = "scipy" },
     { name = "torch" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/df/de/25e455b62c73f10bf41d3e029016881e414de0c6f3f5af82b42dc10f7ca1/opacus-1.5.4.tar.gz", hash = "sha256:ea700808c0a8c3fb7e565ed55b77b3170d6a54cb30e295b255e3877170edc5ca", size = 151407, upload-time = "2025-05-27T16:23:57.514Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/31/39/a92b485f5a9c65bdd18c574d24c3b17800e39f02d66e665ab9bfbde77c52/opacus-1.6.0.tar.gz", hash = "sha256:20d77252c1e8528cd5826fbaba345cd39ed5c3edb841707eafb6403fe276140a", size = 184592, upload-time = "2026-05-05T18:15:26.103Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a7/7e/8c9e798b1789861beec036b456b71a079f83963c7e2814814f1370dea667/opacus-1.5.4-py3-none-any.whl", hash = "sha256:a0ca27974e825d86635c82f08ebb380a373613c121066bc0b8e29e42d9f961a5", size = 254361, upload-time = "2025-05-27T16:23:55.827Z" },
+    { url = "https://files.pythonhosted.org/packages/45/2d/cb1b04cc674f3e06dbe1afc02ed977f066f39c7a4a313372353f82c26117/opacus-1.6.0-py3-none-any.whl", hash = "sha256:342123624c56c09d47eaba44010ed216d51c63132b543d67cba53a49ed6dc41e", size = 308911, upload-time = "2026-05-05T18:15:24.479Z" },
 ]
 
 [[package]]
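For context on the second patch below: a minimal sketch of the two CTXSEQ batch layouts it distinguishes. The toy rows and variable names are illustrative only and do not appear in the repo.

    import torch

    # Two variable-length CTXSEQ rows (toy data).
    rows = [[3, 1, 4], [1, 5]]

    # Non-DP path: keep the ragged rows as a NestedTensor.
    nested = torch.nested.as_nested_tensor([torch.tensor(r, dtype=torch.int64) for r in rows])

    # The model pads on demand with -1 before embedding:
    padded = torch.nested.to_padded_tensor(nested, padding=-1)
    # tensor([[ 3,  1,  4],
    #         [ 1,  5, -1]])

The DP path produces the padded dense form directly at collate time, so Opacus's per-sample gradient hooks never see a NestedTensor.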
From e0afbb690099149c162ec75492afc14ea9ee4d81 Mon Sep 17 00:00:00 2001
From: Cursor Agent
Date: Wed, 6 May 2026 06:02:04 +0000
Subject: [PATCH 2/2] fix(tabular): avoid NestedTensor CTXSEQ batches when
 using Opacus DP

Opacus 1.x per-sample gradient hooks hit NotImplementedError on
NestedTensorCPU (aten::new_empty). For DP training, collate CTXSEQ as
padded dense tensors with -1 padding; SequentialContextEmbedders already
masks -1 and maps it to embedding index 0. Non-DP sequential training
keeps the nested CTXSEQ collate, so its behavior is unchanged.

Co-authored-by: Michi Platzer
---
 mostlyai/engine/_tabular/argn.py     |  3 +-
 mostlyai/engine/_tabular/training.py | 46 +++++++++++++++++++++-------
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/mostlyai/engine/_tabular/argn.py b/mostlyai/engine/_tabular/argn.py
index b26d8a89..bb483388 100644
--- a/mostlyai/engine/_tabular/argn.py
+++ b/mostlyai/engine/_tabular/argn.py
@@ -289,7 +289,8 @@ def forward(self, x) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
         mask = None
         for sub_col in self.cardinalities:
             xs = torch.as_tensor(x[sub_col], device=self.device)
-            xs = torch.nested.to_padded_tensor(xs, padding=-1)
+            if xs.is_nested:
+                xs = torch.nested.to_padded_tensor(xs, padding=-1)
             mask = (xs != -1).squeeze(-1)
             xs = torch.where(xs == -1, torch.tensor(0), xs)
             xs = self.get(sub_col)(xs)
diff --git a/mostlyai/engine/_tabular/training.py b/mostlyai/engine/_tabular/training.py
index ea87dbb8..204632e9 100644
--- a/mostlyai/engine/_tabular/training.py
+++ b/mostlyai/engine/_tabular/training.py
@@ -138,10 +138,20 @@ class BatchCollator:
     For sequence data, it will sample subsequences with lengths up to max_sequence_window.
     """
 
-    def __init__(self, is_sequential: bool, max_sequence_window: int | None, device: torch.device):
+    def __init__(
+        self,
+        is_sequential: bool,
+        max_sequence_window: int | None,
+        device: torch.device,
+        *,
+        use_nested_ctxseq: bool = True,
+    ):
         self.is_sequential = is_sequential
         self.max_sequence_window = max_sequence_window
         self.device = device
+        # Opacus per-sample gradients do not support NestedTensor on CPU/CUDA; use padded
+        # dense tensors for CTXSEQ when training with DP (see test_tabular_sequential DP path).
+        self.use_nested_ctxseq = use_nested_ctxseq
 
     def __call__(self, batch: list[dict]) -> dict[str, torch.Tensor]:
         batch = pd.DataFrame(batch)
@@ -177,15 +187,26 @@ def _convert_to_tensors(self, batch: pd.DataFrame) -> dict[str, torch.Tensor]:
                     dim=-1,
                 )
             elif column.startswith(CTXSEQ):
-                # construct row tensors and convert the list to nested column tensor
-                tensors[column] = torch.unsqueeze(
-                    torch.nested.as_nested_tensor(
-                        [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]],
-                        dtype=torch.int64,
-                        device=self.device,
-                    ),
-                    dim=-1,
-                )
+                if self.use_nested_ctxseq:
+                    # construct row tensors and convert the list to nested column tensor
+                    tensors[column] = torch.unsqueeze(
+                        torch.nested.as_nested_tensor(
+                            [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]],
+                            dtype=torch.int64,
+                            device=self.device,
+                        ),
+                        dim=-1,
+                    )
+                else:
+                    # padded batch (variable-length rows); -1 marks padding (matches SequentialContextEmbedders)
+                    tensors[column] = torch.unsqueeze(
+                        torch.tensor(
+                            np.array(list(zip_longest(*batch[column], fillvalue=-1))).T,
+                            dtype=torch.int64,
+                            device=self.device,
+                        ),
+                        dim=-1,
+                    )
         return tensors
 
     @staticmethod
@@ -544,7 +565,10 @@ def train(
     # and see if it's possible to make it compatible with DP
 
     batch_collator = BatchCollator(
-        is_sequential=is_sequential, max_sequence_window=max_sequence_window, device=device
+        is_sequential=is_sequential,
+        max_sequence_window=max_sequence_window,
+        device=device,
+        use_nested_ctxseq=not with_dp,
     )
     disable_progress_bar()
     trn_dataset = load_dataset("parquet", data_files=[str(p) for p in workspace.encoded_data_trn.fetch_all()])[
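For reference, a standalone sketch of the padded DP collate path introduced above, together with the -1 handling that the commit message attributes to SequentialContextEmbedders. collate_ctxseq_padded is a hypothetical helper for illustration, not a function in the repo; it assumes rows of Python ints, as in the diff.

    from itertools import zip_longest

    import numpy as np
    import torch

    def collate_ctxseq_padded(rows):
        # zip_longest transposes the rows and right-pads shorter ones with -1;
        # transposing back gives shape (batch, max_len), then (batch, max_len, 1).
        padded = np.array(list(zip_longest(*rows, fillvalue=-1))).T
        return torch.unsqueeze(torch.tensor(padded, dtype=torch.int64), dim=-1)

    xs = collate_ctxseq_padded([[3, 1, 4], [1, 5]])
    mask = (xs != -1).squeeze(-1)                    # True at real (non-padded) positions
    xs = torch.where(xs == -1, torch.tensor(0), xs)  # map padding to embedding index 0

Because the guard in argn.py only converts tensors where xs.is_nested is true, a batch collated this way flows through forward() unchanged, and the -1 positions are masked exactly as in the nested path.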