From acc9761c5bc83a29a9396ecae0b8b7bd935db44a Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 5 May 2026 19:23:55 +0000 Subject: [PATCH 1/3] deps: bump opacus to >=1.6.0 Raise the Opacus floor and refresh uv.lock so installs resolve to 1.6.0, which adds non-wrapping mode, FSDP/mixed-precision DP improvements, and assorted accountant/clipping fixes while maintaining torch>=2.6 alignment. Co-authored-by: Michi Platzer --- pyproject.toml | 2 +- uv.lock | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a8bf42f9..ebd4ef26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "accelerate>=1.5.0", "peft>=0.18.2", # transformers 5.7+ checks min PEFT in model.add_adapter (integrations/peft.py) "huggingface-hub[hf-xet]>=0.30.2", - "opacus>=1.5.4", + "opacus>=1.6.0", "xgrammar>=0.1.32,<1.0.0", # aligned with vllm 0.20 "json-repair>=0.47.0", "torch>=2.11.0,<2.12.0", diff --git a/uv.lock b/uv.lock index a3b78957..9fa407ff 100644 --- a/uv.lock +++ b/uv.lock @@ -2275,7 +2275,7 @@ wheels = [ [[package]] name = "mostlyai-engine" -version = "2.6.0" +version = "2.6.1" source = { editable = "." } dependencies = [ { name = "accelerate" }, @@ -2336,7 +2336,7 @@ requires-dist = [ { name = "joblib", specifier = ">=1.4.2" }, { name = "json-repair", specifier = ">=0.47.0" }, { name = "numpy", specifier = ">=2.0.0" }, - { name = "opacus", specifier = ">=1.5.4" }, + { name = "opacus", specifier = ">=1.6.0" }, { name = "pandas", specifier = ">=2.2.0" }, { name = "peft", specifier = ">=0.18.2" }, { name = "psutil", specifier = ">=5.9.5,<6" }, @@ -2863,7 +2863,7 @@ wheels = [ [[package]] name = "opacus" -version = "1.5.4" +version = "1.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, @@ -2871,9 +2871,9 @@ dependencies = [ { name = "scipy" }, { name = "torch" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/df/de/25e455b62c73f10bf41d3e029016881e414de0c6f3f5af82b42dc10f7ca1/opacus-1.5.4.tar.gz", hash = "sha256:ea700808c0a8c3fb7e565ed55b77b3170d6a54cb30e295b255e3877170edc5ca", size = 151407, upload-time = "2025-05-27T16:23:57.514Z" } +sdist = { url = "https://files.pythonhosted.org/packages/31/39/a92b485f5a9c65bdd18c574d24c3b17800e39f02d66e665ab9bfbde77c52/opacus-1.6.0.tar.gz", hash = "sha256:20d77252c1e8528cd5826fbaba345cd39ed5c3edb841707eafb6403fe276140a", size = 184592, upload-time = "2026-05-05T18:15:26.103Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/7e/8c9e798b1789861beec036b456b71a079f83963c7e2814814f1370dea667/opacus-1.5.4-py3-none-any.whl", hash = "sha256:a0ca27974e825d86635c82f08ebb380a373613c121066bc0b8e29e42d9f961a5", size = 254361, upload-time = "2025-05-27T16:23:55.827Z" }, + { url = "https://files.pythonhosted.org/packages/45/2d/cb1b04cc674f3e06dbe1afc02ed977f066f39c7a4a313372353f82c26117/opacus-1.6.0-py3-none-any.whl", hash = "sha256:342123624c56c09d47eaba44010ed216d51c63132b543d67cba53a49ed6dc41e", size = 308911, upload-time = "2026-05-05T18:15:24.479Z" }, ] [[package]] From e81881057c0c203c5c3a208749a8374036a455ea Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 5 May 2026 19:28:47 +0000 Subject: [PATCH 2/3] feat(language): use Opacus wrap_model=False for DP training Attach per-sample gradient hooks without GradSampleModule wrapping so the Hugging Face module hierarchy stays intact. Call GradSampleHooks.cleanup() after the training loop to remove hooks and Opacus monkey-patched attrs. Depends on Opacus >= 1.6 (non-wrapping mode). Co-authored-by: Michi Platzer --- mostlyai/engine/_language/training.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mostlyai/engine/_language/training.py b/mostlyai/engine/_language/training.py index b86c813e..57fef09a 100644 --- a/mostlyai/engine/_language/training.py +++ b/mostlyai/engine/_language/training.py @@ -31,7 +31,7 @@ from huggingface_hub import get_safetensors_metadata from opacus import GradSampleModule, PrivacyEngine from opacus.accountants import GaussianAccountant, PRVAccountant, RDPAccountant -from opacus.grad_sample import register_grad_sampler +from opacus.grad_sample import GradSampleHooks, register_grad_sampler from opacus.utils.batch_memory_manager import wrap_data_loader from peft import LoraConfig, PeftModel from torch import nn @@ -626,6 +626,7 @@ def concat_prompt_and_response(x): # this can help accelerate GPU compute torch.backends.cudnn.benchmark = True + dp_grad_sample_hooks: GradSampleHooks | None = None if with_dp: if isinstance(differential_privacy, DifferentialPrivacyConfig): dp_config = differential_privacy.model_dump() @@ -650,18 +651,21 @@ def concat_prompt_and_response(x): privacy_engine.accountant.load_state_dict( torch.load(workspace.model_dp_accountant_path, map_location=device, weights_only=True), ) - # Opacus will return the modified objects - # - model: wrapped in GradSampleModule and contains additional hooks for computing per-sample gradients - # - optimizer: wrapped in DPOptimizer and will do different operations during virtual steps and logical steps - # - dataloader: the dataloader with batch_sampler=UniformWithReplacementSampler (for Poisson sampling) - model, optimizer, trn_dataloader = privacy_engine.make_private( + # Opacus returns GradSampleHooks when wrap_model=False: hooks attach to the original module so HF / + # Transformers sees an unwrapped PreTrainedModel (requires Opacus >= 1.6). + # - dp_grad_sample_hooks: must call .cleanup() after training to remove backward hooks and param attrs + # - optimizer: wrapped in DPOptimizer (virtual vs logical steps) + # - dataloader: UniformWithReplacementSampler when poisson_sampling=True + dp_grad_sample_hooks, optimizer, trn_dataloader = privacy_engine.make_private( module=model, optimizer=optimizer, data_loader=trn_dataloader, noise_multiplier=dp_config.get("noise_multiplier"), max_grad_norm=dp_config.get("max_grad_norm"), poisson_sampling=True, + wrap_model=False, ) + model = dp_grad_sample_hooks._module # this further wraps the dataloader with batch_sampler=BatchSplittingSampler to achieve gradient accumulation # it will split the sampled logical batches into smaller sub-batches with batch_size trn_dataloader = wrap_data_loader( @@ -835,6 +839,9 @@ def concat_prompt_and_response(x): if total_training_time > max_training_time: do_stop = True + if dp_grad_sample_hooks is not None: + dp_grad_sample_hooks.cleanup() + # no checkpoint is saved yet because the training stopped before the first epoch ended if not model_checkpoint.has_saved_once(): _LOG.info("saving model weights, as none were saved so far") From e0afbb690099149c162ec75492afc14ea9ee4d81 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 6 May 2026 06:02:04 +0000 Subject: [PATCH 3/3] fix(tabular): avoid NestedTensor CTXSEQ batches when using Opacus DP Opacus 1.x per-sample gradient hooks hit NotImplementedError on NestedTensorCPU (aten::new_empty). For DP training, collate CTXSEQ as padded dense tensors with -1 padding; SequentialContextEmbedders already masks -1 and maps to embedding index 0. Non-DP sequential training keeps nested CTXSEQ collate for unchanged behavior. Co-authored-by: Michi Platzer --- mostlyai/engine/_tabular/argn.py | 3 +- mostlyai/engine/_tabular/training.py | 46 +++++++++++++++++++++------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/mostlyai/engine/_tabular/argn.py b/mostlyai/engine/_tabular/argn.py index b26d8a89..bb483388 100644 --- a/mostlyai/engine/_tabular/argn.py +++ b/mostlyai/engine/_tabular/argn.py @@ -289,7 +289,8 @@ def forward(self, x) -> tuple[dict[str, torch.Tensor], torch.Tensor]: mask = None for sub_col in self.cardinalities: xs = torch.as_tensor(x[sub_col], device=self.device) - xs = torch.nested.to_padded_tensor(xs, padding=-1) + if xs.is_nested: + xs = torch.nested.to_padded_tensor(xs, padding=-1) mask = (xs != -1).squeeze(-1) xs = torch.where(xs == -1, torch.tensor(0), xs) xs = self.get(sub_col)(xs) diff --git a/mostlyai/engine/_tabular/training.py b/mostlyai/engine/_tabular/training.py index ea87dbb8..204632e9 100644 --- a/mostlyai/engine/_tabular/training.py +++ b/mostlyai/engine/_tabular/training.py @@ -138,10 +138,20 @@ class BatchCollator: For sequence data, it will sample subsequences with lengths up to max_sequence_window. """ - def __init__(self, is_sequential: bool, max_sequence_window: int | None, device: torch.device): + def __init__( + self, + is_sequential: bool, + max_sequence_window: int | None, + device: torch.device, + *, + use_nested_ctxseq: bool = True, + ): self.is_sequential = is_sequential self.max_sequence_window = max_sequence_window self.device = device + # Opacus per-sample gradients do not support NestedTensor on CPU/CUDA; use padded + # dense tensors for CTXSEQ when training with DP (see test_tabular_sequential DP path). + self.use_nested_ctxseq = use_nested_ctxseq def __call__(self, batch: list[dict]) -> dict[str, torch.Tensor]: batch = pd.DataFrame(batch) @@ -177,15 +187,26 @@ def _convert_to_tensors(self, batch: pd.DataFrame) -> dict[str, torch.Tensor]: dim=-1, ) elif column.startswith(CTXSEQ): - # construct row tensors and convert the list to nested column tensor - tensors[column] = torch.unsqueeze( - torch.nested.as_nested_tensor( - [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]], - dtype=torch.int64, - device=self.device, - ), - dim=-1, - ) + if self.use_nested_ctxseq: + # construct row tensors and convert the list to nested column tensor + tensors[column] = torch.unsqueeze( + torch.nested.as_nested_tensor( + [torch.tensor(row, dtype=torch.int64, device=self.device) for row in batch[column]], + dtype=torch.int64, + device=self.device, + ), + dim=-1, + ) + else: + # padded batch (variable-length rows); -1 marks padding (matches SequentialContextEmbedders) + tensors[column] = torch.unsqueeze( + torch.tensor( + np.array(list(zip_longest(*batch[column], fillvalue=-1))).T, + dtype=torch.int64, + device=self.device, + ), + dim=-1, + ) return tensors @staticmethod @@ -544,7 +565,10 @@ def train( # and see if it's possible to make it compatible with DP batch_collator = BatchCollator( - is_sequential=is_sequential, max_sequence_window=max_sequence_window, device=device + is_sequential=is_sequential, + max_sequence_window=max_sequence_window, + device=device, + use_nested_ctxseq=not with_dp, ) disable_progress_bar() trn_dataset = load_dataset("parquet", data_files=[str(p) for p in workspace.encoded_data_trn.fetch_all()])[