From 3b9c8d45c0ffdecb1f11f395e45c789cad06e2e2 Mon Sep 17 00:00:00 2001
From: HAOCHENYE <21724054@zju.edu.cn>
Date: Fri, 6 Feb 2026 08:37:51 +0000
Subject: [PATCH 1/2] tmp

ghstack-source-id: 3b810e7a789538d8f0820f181329f6fd4c5f8344
Pull-Request: https://github.com/InternLM/xtuner/pull/1484
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 6fa81c006..16f31e2c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -130,3 +130,4 @@ skip-magic-trailing-comma = false
 
 # Like Black, automatically detect the appropriate line ending.
 line-ending = "auto"
+111

From 5241ddf56c24dc19e0ee3d69e2b5a878585e6009 Mon Sep 17 00:00:00 2001
From: HAOCHENYE <21724054@zju.edu.cn>
Date: Fri, 27 Feb 2026 08:23:23 +0000
Subject: [PATCH 2/2] [Enhance] Make `ModelOutputs` pydantic `BaseModel`

---
 .github/workflows/claude.yml    |  5 ++++-
 xtuner/v1/loss/base_loss_ctx.py |  5 +++++
 xtuner/v1/model/base.py         | 22 ++++++++++++++++++----
 xtuner/v1/model/dense/dense.py  |  2 +-
 xtuner/v1/model/moe/moe.py      | 16 ++++++++++------
 5 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
index 0e3b473fd..aaae73ec0 100644
--- a/.github/workflows/claude.yml
+++ b/.github/workflows/claude.yml
@@ -9,6 +9,9 @@ on:
     types: [opened, assigned]
   pull_request_review:
     types: [submitted]
+  pull_request_target:
+    types: [opened, synchronize]
+    branches: [main]
 
 jobs:
   claude:
@@ -38,7 +41,7 @@ jobs:
         # Prompt A workaround for claude code action bug of `Fork` PR
         prompt: |
          REPO: ${{ github.repository }}
-          PR NUMBER: ${{ github.event.pull_request.number }}
+          PR NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
 
           Please review this pull request.
 
diff --git a/xtuner/v1/loss/base_loss_ctx.py b/xtuner/v1/loss/base_loss_ctx.py
index 1b42bebd0..5cae06d86 100644
--- a/xtuner/v1/loss/base_loss_ctx.py
+++ b/xtuner/v1/loss/base_loss_ctx.py
@@ -12,6 +12,7 @@ from typing_extensions import Self
 
 from xtuner.v1.loss.utils import sp_split
+from xtuner.v1.model.utils.misc import ModelForwardExtraLogInfo
 
 from .chunk_loss import ChunkLoss
 
 
@@ -195,6 +196,10 @@ def forward(
         else:
             loss, (logits, extra_info) = self.chunk_mode(hidden_states, head_weight, head_bias, self.loss_kwargs)
 
+        # TODO(yanhuida): temporary shim; remove once extra_info is always a ModelForwardExtraLogInfo
+        if not isinstance(extra_info, ModelForwardExtraLogInfo):
+            extra_info = ModelForwardExtraLogInfo(extra_info)
+
         extra_info["local_base_loss"] = loss.detach().clone()
 
         # Step 2.c in the loss calculation: reduce the loss over all ranks using all_reduce with autograd support
diff --git a/xtuner/v1/model/base.py b/xtuner/v1/model/base.py
index 9f1e0f995..293909019 100644
--- a/xtuner/v1/model/base.py
+++ b/xtuner/v1/model/base.py
@@ -194,11 +194,25 @@ def layers_type(self) -> list[Literal["full_attention", "sliding_attention"]]:
         ]
 
 
-class ModelOutputs(TypedDict):
-    hidden_states: NotRequired[list[torch.Tensor]]
-    logits: NotRequired[torch.Tensor]
+class ModelOutputs(PydanticBaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    hidden_states: list[torch.Tensor] | None = None
+    logits: torch.Tensor | None = None
     loss: torch.Tensor
-    extra_info: ModelForwardExtraLogInfo
+    extra_info: ModelForwardExtraLogInfo | None = None
+
+    def free(self):
+        self.hidden_states = None
+        self.logits = None
+        self.extra_info = None
+
+    # TODO: Kept only to avoid breaking BC. Should be removed later.
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    # TODO: Kept only to avoid breaking BC. Should be removed later.
+    def __contains__(self, key):
+        return key in self.model_fields_set
 
 
 def _is_float8_available():
diff --git a/xtuner/v1/model/dense/dense.py b/xtuner/v1/model/dense/dense.py
index cde398b38..db2a4ccb4 100644
--- a/xtuner/v1/model/dense/dense.py
+++ b/xtuner/v1/model/dense/dense.py
@@ -107,7 +107,7 @@ def forward(
         output["loss"] = loss
         output["logits"] = logits
         output["extra_info"] = extra_info
-        return ModelOutputs(**output)  # type: ignore[typeddict-item]
+        return ModelOutputs(**output)
 
     def build_embeddings(self, config: TransformerConfig):
         return nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
diff --git a/xtuner/v1/model/moe/moe.py b/xtuner/v1/model/moe/moe.py
index 962129e1d..276cf23d0 100644
--- a/xtuner/v1/model/moe/moe.py
+++ b/xtuner/v1/model/moe/moe.py
@@ -79,10 +79,14 @@
 
 
 class MoEModelOutputs(ModelOutputs):
-    router_logits: NotRequired[dict[str, torch.Tensor]]
-    balancing_loss: NotRequired[torch.Tensor]
-    z_loss: NotRequired[torch.Tensor]
-    tokens_per_expert_global: NotRequired[torch.Tensor]
+    router_logits: dict[str, torch.Tensor] | None = None
+    balancing_loss: torch.Tensor | None = None
+    z_loss: torch.Tensor | None = None
+    tokens_per_expert_global: torch.Tensor
+
+    def free(self):
+        super().free()
+        self.router_logits = None
 
 
 class BalancingLossConfig(PydanticBaseModel):
@@ -482,7 +486,7 @@ def _micro_batch_forward(
 
         output["router_logits"] = router_logits_dict
 
-        return MoEModelOutputs(**output, logits=logits)  # type: ignore[typeddict-item]
+        return MoEModelOutputs(**output, logits=logits)
 
     def _forward(
         self,
@@ -583,7 +587,7 @@ def _forward(
         else:
             output["router_logits"] = None
 
-        return MoEModelOutputs(**output)  # type: ignore[typeddict-item]
+        return MoEModelOutputs(**output)
 
     def build_embeddings(self, config: MoEConfig):
         return nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
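
Usage sketch (illustrative only; assumes the pydantic-based `ModelOutputs` added to
`xtuner/v1/model/base.py` in the patch above, with made-up tensor values):

import torch

from xtuner.v1.model.base import ModelOutputs

# Construct as before; pydantic validates the fields instead of a plain TypedDict.
outputs = ModelOutputs(loss=torch.tensor(0.5), logits=torch.randn(2, 4))

# Attribute access is the pydantic-native style ...
print(outputs.loss)

# ... while the BC shims keep the old dict-style access working:
assert outputs["loss"] is outputs.loss    # __getitem__ forwards to getattr
assert "logits" in outputs                # __contains__ checks model_fields_set
assert "hidden_states" not in outputs     # fields never set are not "contained"

# free() drops the large per-forward tensors once they are no longer needed.
outputs.free()
assert outputs.logits is None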