From 6ab53db634a74bf6c7addbc1da48d6d5a78ea974 Mon Sep 17 00:00:00 2001
From: Chenhan Yu
Date: Thu, 5 Mar 2026 11:14:25 -0800
Subject: [PATCH 1/4] fix: https://github.com/NVIDIA/Model-Optimizer/issues/981

Signed-off-by: Chenhan Yu
---
 modelopt/torch/opt/plugins/mcore_dist_checkpointing.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
index 3e5b359468..8a2f38a68d 100644
--- a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
+++ b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
@@ -149,6 +149,12 @@ def _parse_transformer_config(transformer_config: dict) -> dict:
                 config[k] = v
             else:
                 config[k] = str(v)
+                # Handle https://github.com/NVIDIA/Model-Optimizer/issues/981 where
+                # hierarchical_context_parallel_sizes: [8, 2] will raise a TypeError.
+                try:
+                    config[k] = str(v)
+                except (AttributeError, TypeError):
+                    config[k] = repr(type(v))
 
         return config
 

From 014c1a007f01729c2269ce6e4383fcb0dcfe1abc Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 6 Mar 2026 00:52:45 +0530
Subject: [PATCH 2/4] Update modelopt/torch/opt/plugins/mcore_dist_checkpointing.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 modelopt/torch/opt/plugins/mcore_dist_checkpointing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
index 8a2f38a68d..10c04b43d8 100644
--- a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
+++ b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
@@ -148,7 +148,6 @@ def _parse_transformer_config(transformer_config: dict) -> dict:
             if isinstance(v, (bool, int, str)):
                 config[k] = v
             else:
-                config[k] = str(v)
                 # Handle https://github.com/NVIDIA/Model-Optimizer/issues/981 where
                 # hierarchical_context_parallel_sizes: [8, 2] will raise a TypeError.
                 try:

From 6e567e60cb5b385d5ee94faab03ef8a7abb19706 Mon Sep 17 00:00:00 2001
From: Chenhan Yu
Date: Thu, 5 Mar 2026 14:05:25 -0800
Subject: [PATCH 3/4] chg: taking feedback to omit the field

Signed-off-by: Chenhan Yu
---
 modelopt/torch/opt/plugins/mcore_dist_checkpointing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
index 10c04b43d8..a17b55f773 100644
--- a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
+++ b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
@@ -153,7 +153,7 @@ def _parse_transformer_config(transformer_config: dict) -> dict:
                 try:
                     config[k] = str(v)
                 except (AttributeError, TypeError):
-                    config[k] = repr(type(v))
+                    print("Warning: TransformerConfig.{} does not have _repr_ implemented.")
 
         return config
 

From 68f6c204dd39de9acfb24a1256a7e559e398c083 Mon Sep 17 00:00:00 2001
From: Asha Anoosheh
Date: Tue, 17 Mar 2026 17:11:43 -0700
Subject: [PATCH 4/4] Remove extra config parsing/saving altogether

Signed-off-by: Asha Anoosheh
---
 .../torch/opt/plugins/mcore_dist_checkpointing.py | 44 -------------------
 1 file changed, 44 deletions(-)

diff --git a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
index a17b55f773..16ace511fe 100644
--- a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
+++ b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
@@ -22,7 +22,6 @@
 from typing import Any
 
 import torch
-import yaml
 from megatron.core import dist_checkpointing, mpu
 from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy
 from megatron.core.dist_checkpointing.strategies.common import COMMON_STATE_FNAME
@@ -36,21 +35,6 @@
 
 SUPPORTED_WRAPPERS[Float16Module] = "module"
 
-DROP_SUBSTRINGS = [
-    "fp4",
-    "fp8",
-    "tp_",
-    "parallel",
-    "cuda_graph",
-    "init_",
-    "cpu",
-    "recompute",
-    "inference",
-    "pipeline",
-    "comm",
-    "batch",
-]
-
 
 def remove_per_module_state(
     modelopt_state: dict[str, Any],
@@ -138,34 +122,6 @@ def save_sharded_modelopt_state(
         sharded_strategy: configures sharded tensors saving behavior and backend
         prefix: the prefix to add to the modelopt_state keys ("model." for NeMo)
     """
-
-    def _parse_transformer_config(transformer_config: dict) -> dict:
-        config = {}
-
-        for k, v in transformer_config.items():
-            if any(substring in k for substring in DROP_SUBSTRINGS):
-                continue
-            if isinstance(v, (bool, int, str)):
-                config[k] = v
-            else:
-                # Handle https://github.com/NVIDIA/Model-Optimizer/issues/981 where
-                # hierarchical_context_parallel_sizes: [8, 2] will raise a TypeError.
-                try:
-                    config[k] = str(v)
-                except (AttributeError, TypeError):
-                    print("Warning: TransformerConfig.{} does not have _repr_ implemented.")
-
-        return config
-
-    # Save own version of run config, if not already saved by the framework.
-    if dist.is_master() and not os.path.exists(f"{checkpoint_name}/run_config.yaml"):
-        run_config_name = f"{checkpoint_name}/modelopt_run_config.yaml"
-        # We avoid deepcopy since some attributes in Megatron-Bridge config cannot be deepcopied.
-        config_dict = _parse_transformer_config(model[0].config.__dict__)
-        config_dict["nvidia_modelopt_version"] = modelopt.__version__
-        with open(run_config_name, "w") as f:
-            yaml.dump(config_dict, f, default_flow_style=False)
-
     if not mto.ModeloptStateManager.is_converted(model[0]):
         return
     if len(model) > 1:
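
For reference, patches 1/4 through 3/4 iterate on the same pattern: coerce arbitrary `TransformerConfig` attributes into YAML-safe values before dumping them to `modelopt_run_config.yaml`. Below is a minimal standalone sketch of that pattern; the helper name `parse_config_for_yaml` and the example values are assumptions for illustration, not code from the repository.

```python
# Minimal sketch (assumed helper, not the ModelOpt source): keep bool/int/str
# config values as-is and stringify everything else defensively, falling back
# to the type name when str() itself raises (the failure mode in issue #981).
from typing import Any

import yaml


def parse_config_for_yaml(config_dict: dict[str, Any], drop_substrings: list[str]) -> dict[str, Any]:
    parsed: dict[str, Any] = {}
    for key, value in config_dict.items():
        # Skip fields that are irrelevant for a human-readable run config.
        if any(sub in key for sub in drop_substrings):
            continue
        if isinstance(value, (bool, int, str)):
            parsed[key] = value
        else:
            try:
                parsed[key] = str(value)
            except (AttributeError, TypeError):
                parsed[key] = repr(type(value))
    return parsed


if __name__ == "__main__":
    # Hypothetical config values for illustration only.
    example = {
        "hidden_size": 4096,
        "hierarchical_context_parallel_sizes": [8, 2],
        "tp_comm_overlap": True,
    }
    print(yaml.safe_dump(parse_config_for_yaml(example, drop_substrings=["comm"])))
```

PATCH 4/4 takes a different route: instead of hardening this coercion logic, it drops the extra `modelopt_run_config.yaml` parsing and saving from `save_sharded_modelopt_state` altogether, which sidesteps the serialization issue entirely.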