From f39570b271356d2bcc71134e2b4f454e1fa72299 Mon Sep 17 00:00:00 2001 From: fengl Date: Mon, 12 Jan 2026 17:11:51 +0800 Subject: [PATCH 1/3] [grpo] support gigpo with gym --- swift/trainers/arguments.py | 13 ++-- swift/trainers/rlhf_trainer/grpo_trainer.py | 68 ++++++++++++++++++++- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py index ec894fbbb5..f6fed6b434 100644 --- a/swift/trainers/arguments.py +++ b/swift/trainers/arguments.py @@ -517,13 +517,14 @@ class GRPOArgumentsMixin(RolloutTrainerArgumentsMixin): tau_neg (float): The temperature parameter for negative dominance in the SAPO algorithm, controlling the sharpness of the soft gating function. Typically, `tau_neg` is set > `tau_pos` to impose stronger constraints on negative dominance. The default value is 1.05. - advantage_estimator (Literal['grpo', 'rloo', 'reinforce_plus_plus']): The advantage estimation + step_advantage_w (float): The weight for the step-level advantage (A^S) in the GiGPO algorithm. Defaults to 1.0. + advantage_estimator (Literal['grpo', 'rloo', 'reinforce_plus_plus', 'gigpo']): The advantage estimation function to use. 'grpo' calculates the relative advantage within a group. Options are 'grpo', 'rloo', 'reinforce_plus_plus'. Defaults to 'grpo'. kl_in_reward (Optional[bool]): Controls how the KL divergence regularization term is handled. If `False`, it's an independent term in the loss function. If `True`, KL is directly incorporated into the - reward (subtracted from it). The default is tied to `advantage_estimator`: `False` for 'grpo', `True` for - 'rloo' and 'reinforce_plus_plus'. + reward (subtracted from it). The default is tied to `advantage_estimator`: `False` for 'grpo' and 'gigpo', + `True` for 'rloo' and 'reinforce_plus_plus'. generation_batch_size (Optional[int]): The batch size for sampling completions. It should be a multiple of `num_processes * per_device_train_batch_size`. Defaults to `per_device_batch_size * gradient_accumulation_steps * num_processes`. 
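The docstring above ties the `kl_in_reward` default to the chosen `advantage_estimator`. A minimal, hypothetical sketch of that resolution logic (the helper name and placement are illustrative, not part of this patch):

```python
from typing import Optional


def resolve_kl_in_reward(advantage_estimator: str, kl_in_reward: Optional[bool]) -> bool:
    """Illustrative only: mirrors the default binding described in the docstring."""
    if kl_in_reward is not None:  # an explicit user setting always wins
        return kl_in_reward
    # 'rloo' and 'reinforce_plus_plus' fold KL into the reward by default;
    # 'grpo' and 'gigpo' keep KL as a separate term in the loss.
    return advantage_estimator in ('rloo', 'reinforce_plus_plus')
```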
@@ -596,10 +597,12 @@ class GRPOArgumentsMixin(RolloutTrainerArgumentsMixin): tau_pos: float = 1.0 tau_neg: float = 1.05 - # RLOO, REINFORCE++ - advantage_estimator: Literal['grpo', 'rloo', 'reinforce_plus_plus'] = 'grpo' + # RLOO, REINFORCE++, GiGPO + advantage_estimator: Literal['grpo', 'rloo', 'reinforce_plus_plus', 'gigpo'] = 'grpo' # If false, add KL into loss, otherwise add into reward kl_in_reward: Optional[bool] = None # rloo/reinforce_plus_plus: true, grpo: false (default) + # GiGPO, https://arxiv.org/abs/2405.06708 + step_advantage_w = 1.0 generation_batch_size: Optional[int] = None steps_per_generation: Optional[int] = None diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py index eea759c3ee..063abe95ec 100644 --- a/swift/trainers/rlhf_trainer/grpo_trainer.py +++ b/swift/trainers/rlhf_trainer/grpo_trainer.py @@ -21,6 +21,7 @@ import inspect import os import time +import numpy as np from collections import defaultdict, deque from contextlib import contextmanager, nullcontext from copy import copy, deepcopy @@ -425,6 +426,52 @@ def normalize_advantages(advantages: torch.Tensor, rewards_std: torch.Tensor) -> return advantages / (rewards_std + 1e-4) return advantages + def _compute_step_advantages(inputs, trajectory_advantages): + # Extract step-level reward information from inputs + # Store (prompt_id, step) -> [rewards] mapping + step_rewards_dict = {} + for idx, input_data in enumerate(inputs): + prompt_id = input_data['prompt_id'] + rollout_info = input_data['rollout_infos'] + + # Collect all step rewards for current trajectory + for traj_info in rollout_info.get('trajectory_info', []): + step = traj_info.get('step', 0) + reward = traj_info.get('reward', 0.0) + + # Group rewards by prompt_id and step + key = (prompt_id, step) + if key not in step_rewards_dict: + step_rewards_dict[key] = [] + step_rewards_dict[key].append(reward) + # Calculate step-level advantage and aggregate + aggregated_step_advantages = torch.zeros_like(trajectory_advantages) + for idx, input_data in enumerate(inputs): + prompt_id = input_data['prompt_id'] + rollout_info = input_data['rollout_infos'] + + # Calculate aggregated step-level advantage for current trajectory + step_advantages = [] + for traj_info in rollout_info.get('trajectory_info', []): + step = traj_info.get('step', 0) + reward = traj_info.get('reward', 0.0) + + # Get all rewards for same prompt and step + key = (prompt_id, step) + all_rewards = step_rewards_dict.get(key, [reward]) + + # Calculate step advantage (compared to group average) + mean_reward = np.mean(all_rewards) + step_advantage = reward - mean_reward + step_advantages.append(step_advantage) + + # Aggregate step-level advantage for current trajectory (use mean of valid steps) + if step_advantages: + aggregated_step_advantages[idx] = np.mean(step_advantages) + else: + aggregated_step_advantages[idx] = 0.0 + return aggregated_step_advantages + def log_rewards_metrics(rewards: torch.Tensor, rewards_per_func_for_metrics: torch.Tensor): """Log reward statistics for monitoring. 
Only log once per unique request_id.""" # rewards: [prompt_batch_size, num_generations] @@ -506,6 +553,12 @@ def log_rewards_all(rewards_per_func: torch.Tensor): advantages = rewards * K / (K - 1) - group_rewards_mean * K / (K - 1) else: advantages = rewards - group_rewards_mean + elif self.advantage_estimator == 'gigpo' and self.use_gym_env: + # Get trajectory-level advantage (original GRPO advantage) + trajectory_advantages = rewards - group_rewards_mean + aggregated_step_advantages = _compute_step_advantages(inputs, trajectory_advantages) + # Weighted sum of trajectory-level advantage and aggregated step-level advantage + advantages = trajectory_advantages + self.step_advantage_w * aggregated_step_advantages else: # 'grpo' or 'reinforce_plus_plus' # Both use group mean as baseline advantages = rewards - group_rewards_mean @@ -654,6 +707,13 @@ def log_rewards_all(rewards_per_func: torch.Tensor): indices_in_unique = torch.tensor([rid_to_idx[r] for r in request_ids], device=device) advantages = request_advantages[indices_in_unique] + if self.advantage_estimator == 'gigpo' and self.use_gym_env: + # Get trajectory-level advantage (original GRPO advantage) + trajectory_advantages = advantages + aggregated_step_advantages = _compute_step_advantages(inputs, trajectory_advantages) + # Weighted sum of trajectory-level advantage and aggregated step-level advantage + advantages = trajectory_advantages + self.step_advantage_w * aggregated_step_advantages + # Step 5. Log metrics for unique request_ids log_rewards_metrics(rewards=unique_rewards, rewards_per_func_for_metrics=rewards_per_func[unique_indices]) @@ -2154,6 +2214,9 @@ def _prepare_algorithm_params(self): self.advantage_estimator = args.advantage_estimator self.kl_in_reward = args.kl_in_reward + # GiGPO, https://arxiv.org/abs/2405.06708 + self.step_advantage_w = args.step_advantage_w + # Rollout Importance Sampling Correction self.rollout_importance_sampling_mode = args.rollout_importance_sampling_mode self.rollout_importance_sampling_threshold = args.rollout_importance_sampling_threshold @@ -2227,7 +2290,10 @@ def _prepare_rewards(self, reward_funcs, reward_model=None, reward_templates=Non f'functions ({len(reward_funcs)})') self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32).to(device) else: - self.reward_weights = torch.ones(len(self.reward_func_names), dtype=torch.float32).to(device) + if self.use_gym_env: + self.reward_weights = torch.ones(1, dtype=torch.float32).to(device) + else: + self.reward_weights = torch.ones(len(self.reward_func_names), dtype=torch.float32).to(device) # after init trainer for i, reward_func in enumerate(self.reward_funcs): From 166d8ef9799afb4dc61c4ecd1883946a545426cc Mon Sep 17 00:00:00 2001 From: fengl Date: Tue, 27 Jan 2026 15:04:01 +0800 Subject: [PATCH 2/3] [grpo] support gigpo with gym - Optimization and Documentation Supplement --- .../Instruction/Command-line-parameters.md | 3 +- .../GRPO/AdvancedResearch/GIGPO.md | 95 +++++++++++++++++++ .../Instruction/Command-line-parameters.md | 3 +- .../GRPO/AdvancedResearch/GIGPO.md | 95 +++++++++++++++++++ swift/rlhf_trainers/args_mixin.py | 8 +- swift/rlhf_trainers/grpo_trainer.py | 35 ++++--- 6 files changed, 216 insertions(+), 23 deletions(-) create mode 100644 docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md create mode 100644 docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md index 12a4097be0..a2e1ed0950 
100644 --- a/docs/source/Instruction/Command-line-parameters.md +++ b/docs/source/Instruction/Command-line-parameters.md @@ -623,8 +623,9 @@ reward模型参数将在PPO、GRPO中使用。 - overlong_filter:跳过超长截断的样本,不参与loss计算,默认为False。 - delta: [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291)中双侧 GRPO 上界裁剪值。若设置,建议大于 1 + epsilon。默认为None。 - importance_sampling_level: 控制重要性采样比计算,可选项为 `token` 和 `sequence`,`token` 模式下保留原始的每个 token 的对数概率比,`sequence` 模式下则会对序列中所有有效 token 的对数概率比进行平均。[GSPO论文](https://arxiv.org/abs/2507.18071)中使用sequence级别计算来稳定训练,默认为`token`。 -- advantage_estimator: 优势计算函数,默认为 `grpo`,即计算组内相对优势,可选项为 `grpo`、[`rloo`](./GRPO/AdvancedResearch/RLOO.md)、[`reinforce_plus_plus`](./GRPO/AdvancedResearch/REINFORCEPP.md)。 +- advantage_estimator: 优势计算函数,默认为 `grpo`,即计算组内相对优势,可选项为 `grpo`、[`rloo`](./GRPO/AdvancedResearch/RLOO.md)、[`reinforce_plus_plus`](./GRPO/AdvancedResearch/REINFORCEPP.md)、[`gigpo`](./GRPO/AdvancedResearch/GIGPO.md)。 - kl_in_reward: 控制 KL 散度正则项的处理位置;`false`表示作为损失函数的独立正则项,`true`表示将 KL 直接并入奖励(从奖励中扣除)。默认情况与advantage_estimator绑定,`grpo`下默认为`false`,`rloo` 和 `reinforce_plus_plus` 下默认为 `true`。 +- gigpo_step_advantage_weight: GiGPO 算法中步骤级优势(A^S)的权重。默认值为 1.0。 - scale_rewards:指定奖励的缩放策略。可选值包括 `group`(按组内标准差缩放)、`batch`(按整个批次的标准差缩放)、`none`(不进行缩放)、`gdpo`(对每个奖励函数分别进行组内归一化后加权聚合,参考 [GDPO 论文](https://arxiv.org/abs/2601.05242))。在 ms-swift < 3.10 版本中,该参数为布尔类型,`true` 对应 `group`,`false` 对应 `none`。默认值与 `advantage_estimator` 绑定:`grpo` 对应 `group`,`rloo` 对应 `none`,`reinforce_plus_plus` 对应 `batch`。 - 注意:`gdpo` 模式不支持 `kl_in_reward=True`,若同时设置会自动将 `kl_in_reward` 设为 `False`。 - GDPO 适用于多奖励优化场景:当使用多个奖励函数时,GDPO 会对每个奖励函数分别在组内进行标准化(减均值、除标准差),然后使用 `reward_weights` 进行加权求和,最后再进行批次级别的标准化。这种方式可以更好地保留各个奖励的相对差异,避免不同奖励组合坍塌成相同的 advantage 值。 diff --git a/docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md b/docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md new file mode 100644 index 0000000000..65e1105ca4 --- /dev/null +++ b/docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md @@ -0,0 +1,95 @@ +# Group-in-Group Policy Optimization (GIGPO) + +**版本依赖**:ms-swift>=3.10 + +[Group-in-Group Policy Optimization (GIGPO)](https://arxiv.org/abs/2505.10978) 是一种改进的策略优化算法,基于分组对比的思想,通过两级分组结构(轨迹级别和步骤级别)提供更细粒度的优势估计。 + +## 算法原理 + +GIGPO 基于 GRPO(Group Relative Policy Optimization)算法扩展而来,两者都采用组内对比的方式来估计优势函数,但 GIGPO 引入了更细粒度的步骤级别优势估计,以解决长期序列决策中的信用分配问题。 + +### 核心创新:两级分组优势估计 + +GIGPO 的核心创新在于同时使用轨迹级别和步骤级别的相对优势来指导策略优化: + +#### 1. 轨迹级别相对优势 + +轨迹级别相对优势捕获了整个决策过程中智能体的整体表现: + +$$ +A^E(\tau_i) = \frac{R(\tau_i) - \text{mean}(\{R(\tau_j)\})}{F_{\text{norm}}(\{R(\tau_j)\})} +$$ + +其中: +- $\tau_i$ 是第 $i$ 个轨迹 +- $R(\tau_i) = \sum_t r_t^{(i)}$ 是轨迹的总回报 +- $\text{mean}(\{R(\tau_j)\})$ 是组内所有轨迹的平均回报 +- $F_{\text{norm}}$ 是归一化因子(可以是标准差或固定值1) + +#### 2. 步骤级别相对优势 + +GIGPO 的关键创新在于**锚点状态分组**机制: +- 识别并分组不同轨迹中重复出现的环境状态,称为**锚点状态** +- 在每个锚点状态组内计算相对优势,提供细粒度的信用分配 + +步骤级别相对优势的计算过程: + +1. **识别锚点状态**:收集所有轨迹中出现的唯一环境状态 $\mathcal{U} = \{\tilde{s}_1, \tilde{s}_2, \ldots, \tilde{s}_U\}$ +2. **构建步骤级分组**: + $$G^S(\tilde{s}) = \{(a_t^{(i)}, r_t^{(i)}) \mid s_t^{(i)} = \tilde{s}, 1 \leq i \leq N, 1 \leq t \leq T\}$$ +3. **计算折扣回报**: + $$R_t^{(i)} = \sum_{k=t}^T \gamma^{k-t} r_k^{(i)}$$ +4. **计算步骤相对优势**: + $$A^S(a_t^{(i)}) = \frac{R_t^{(i)} - \text{mean}(\{R_t^{(j)} \mid (a_t^{(j)}, R_t^{(j)}) \in G^S(\tilde{s})\})}{F_{\text{norm}}(\{R_t^{(j)} \mid (a_t^{(j)}, R_t^{(j)}) \in G^S(\tilde{s})\})}$$ + +#### 3. 
组合优势信号 + +GIGPO 将轨迹级别和步骤级别的优势信号加权组合,形成最终的优势估计: + +$$A(a_t^{(i)}) = A^E(\tau_i) + \omega \cdot A^S(a_t^{(i)})$$ + +其中 $\omega$ 是平衡两种优势信号的权重系数(对应参数 `gigpo_step_advantage_weight`)。 + +### 与 GRPO 的主要区别 + +| 对比维度 | GRPO | GIGPO | +|---------|------|-------| +| **优势估计粒度** | 仅轨迹级别 | 轨迹级别 + 步骤级别 | +| **信用分配** | 粗粒度(整个轨迹) | 细粒度(每个动作步骤) | +| **环境状态利用** | 不利用 | 利用锚点状态分组 | +| **适用场景** | 通用序列生成 | 复杂长期决策任务 | +| **额外参数** | 无 | `gigpo_step_advantage_weight` | + +## 参数设置 + +我们可以基于 `GRPOTrainer`,通过设置以下参数实现 GIGPO 训练: +```bash +# 基本 GIGPO 配置 +--advantage_estimator gigpo # 使用 GIGPO 的两级优势函数计算 +--use_gym_env true # 启用 Gym 环境支持(GIGPO 必需) +--gigpo_step_advantage_weight 1.0 # 步骤级优势的权重系数 +``` + +### 重要参数说明 + +- **`--advantage_estimator`**:选择优势函数估计方法 + - `grpo`(默认):仅使用轨迹级别优势 + - `rloo`:使用留一法构造基线 + - `gigpo`:同时使用轨迹级别和步骤级别优势 + +- **`--use_gym_env`**:是否启用 Gym 环境支持 + - `true`:启用(GIGPO 必需,因为需要环境状态信息) + - `false`:禁用 + +- **`--gigpo_step_advantage_weight`**:步骤级优势的权重系数 $\omega$ + - 控制步骤级优势在组合优势中的贡献 + - 取值范围:[0, +∞) + - 默认值:1.0 + +- **`--num_generations`**:每个 prompt 生成的样本数量 + - 增加样本数量可以提高优势估计的稳定性 + +- **`--beta`**:KL 散度正则化系数 + - 控制策略偏离参考策略的程度 + +其他参数与 [GRPO参数](../../Command-line-parameters.md#grpo参数) 一致 \ No newline at end of file diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index b8e1b16a88..a3c046a828 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -639,8 +639,9 @@ The meanings of the following parameters can be referenced [here](https://huggin The hyperparameters for the reward function can be found in the [Built-in Reward Functions section](#built-in-reward-functions). - delta: Delta value for the upper clipping bound in two-sided GRPO. Recommended to be > 1 + epsilon. This method was introduced in the [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291). - importance_sampling_level: Controls how the importance sampling ratio is computed. Options are `token` and `sequence`. In `token` mode, the raw per-token log-probability ratios are used. In `sequence` mode, the log-probability ratios of all valid tokens in the sequence are averaged to produce a single ratio per sequence. The [GSPO paper](https://arxiv.org/abs/2507.18071) uses sequence-level importance sampling to stabilize training. The default is `token`. -- advantage_estimator: Advantage estimator. Default is `grpo` (group-relative advantage). Options: `grpo`, [`rloo`](./GRPO/AdvancedResearch/RLOO.md), [`reinforce_plus_plus`](./GRPO/AdvancedResearch/REINFORCEPP.md). +- advantage_estimator: Advantage estimator. Default is `grpo` (group-relative advantage). Options: `grpo`, [`rloo`](./GRPO/AdvancedResearch/RLOO.md), [`reinforce_plus_plus`](./GRPO/AdvancedResearch/REINFORCEPP.md), [`gigpo`](./GRPO/AdvancedResearch/GIGPO.md). - kl_in_reward: Controls where the KL regularization is applied. `false`: KL is a separate loss term. `true`: KL is subtracted from the reward. The default is bound to `advantage_estimator`: `false` for `grpo`, and `true` for `rloo` and `reinforce_plus_plus`. +- gigpo_step_advantage_weight: The weight for the step-level advantage (A^S) in the GiGPO algorithm. The default value is 1.0. - scale_rewards: Specifies the reward scaling strategy.
Options: `group` (scale by intra-group std), `batch` (scale by batch-wide std), `none` (no scaling), `gdpo` (normalize each reward function separately within groups before weighted aggregation, see [GDPO paper](https://arxiv.org/abs/2601.05242)). In ms-swift < 3.10, this was a boolean where `true` corresponds to `group` and `false` to `none`. The default is bound to `advantage_estimator`: `group` for `grpo`, `none` for `rloo`, and `batch` for `reinforce_plus_plus`. - Note: `gdpo` mode does not support `kl_in_reward=True`. If both are set, `kl_in_reward` will be automatically set to `False`. - GDPO is designed for multi-reward optimization: When using multiple reward functions, GDPO normalizes each reward function separately within groups (subtract mean, divide by std), then performs weighted aggregation using `reward_weights`, and finally applies batch-level normalization. This approach better preserves the relative differences between rewards and prevents different reward combinations from collapsing into identical advantage values. diff --git a/docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md b/docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md new file mode 100644 index 0000000000..3c15d79114 --- /dev/null +++ b/docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md @@ -0,0 +1,95 @@ +# Group-in-Group Policy Optimization (GIGPO) + +**Version Dependency**: ms-swift>=3.10 + +[Group-in-Group Policy Optimization (GIGPO)](https://arxiv.org/abs/2505.10978) is an improved policy optimization algorithm based on the idea of group comparison, providing more fine-grained advantage estimation through a two-level grouping structure (trajectory level and step level). + +## Algorithm Principles + +GIGPO is extended from GRPO (Group Relative Policy Optimization). Both algorithms use in-group comparison to estimate advantage functions, but GIGPO introduces finer-grained step-level advantage estimation to solve the credit assignment problem in long-term sequential decision-making. + +### Core Innovation: Two-Level Group Advantage Estimation + +GIGPO's core innovation lies in simultaneously using trajectory-level and step-level relative advantages to guide policy optimization: + +#### 1. Trajectory-Level Relative Advantage + +Trajectory-level relative advantage captures the overall performance of the agent in the entire decision-making process: + +$$ +A^E(\tau_i) = \frac{R(\tau_i) - \text{mean}(\{R(\tau_j)\})}{F_{\text{norm}}(\{R(\tau_j)\})} +$$ + +Where: +- $\tau_i$ is the i-th trajectory +- $R(\tau_i) = \sum_t r_t^{(i)}$ is the total return of the trajectory +- $\text{mean}(\{R(\tau_j)\})$ is the average return of all trajectories in the group +- $F_{\text{norm}}$ is the normalization factor (can be standard deviation or fixed value 1) + +#### 2. Step-Level Relative Advantage + +The key innovation of GIGPO is the **anchor state grouping** mechanism: +- Identify and group repeated environmental states across different trajectories, called **anchor states** +- Calculate relative advantages within each anchor state group to provide fine-grained credit assignment + +The calculation process of step-level relative advantage: + +1. **Identify anchor states**: Collect all unique environmental states from all trajectories $\mathcal{U} = \{\tilde{s}_1, \tilde{s}_2, \ldots, \tilde{s}_U\}$ +2. **Construct step-level groups**: + $$G^S(\tilde{s}) = \{(a_t^{(i)}, r_t^{(i)}) \mid s_t^{(i)} = \tilde{s}, 1 \leq i \leq N, 1 \leq t \leq T\}$$ +3. 
**Calculate discounted returns**: + $$R_t^{(i)} = \sum_{k=t}^T \gamma^{k-t} r_k^{(i)}$$ +4. **Calculate step relative advantages**: + $$A^S(a_t^{(i)}) = \frac{R_t^{(i)} - \text{mean}(\{R_t^{(j)} \mid (a_t^{(j)}, R_t^{(j)}) \in G^S(\tilde{s})\})}{F_{\text{norm}}(\{R_t^{(j)} \mid (a_t^{(j)}, R_t^{(j)}) \in G^S(\tilde{s})\})}$$ + +#### 3. Combined Advantage Signal + +GIGPO combines the trajectory-level and step-level advantage signals through a weighted sum to form the final advantage estimate: + +$$A(a_t^{(i)}) = A^E(\tau_i) + \omega \cdot A^S(a_t^{(i)})$$ + +Where $\omega$ is the weight coefficient that balances the two advantage signals (corresponding to the parameter `gigpo_step_advantage_weight`). + +### Main Differences from GRPO + +| Comparison Dimension | GRPO | GIGPO | +|---------------------|------|-------| +| **Advantage Estimation Granularity** | Trajectory level only | Trajectory level + Step level | +| **Credit Assignment** | Coarse-grained (entire trajectory) | Fine-grained (each action step) | +| **Environmental State Utilization** | Not utilized | Utilizes anchor state grouping | +| **Applicable Scenarios** | General sequence generation | Complex long-term decision tasks | +| **Additional Parameters** | None | `gigpo_step_advantage_weight` | + +## Parameter Settings + +We can implement GIGPO training based on `GRPOTrainer` by setting the following parameters: +```bash +# Basic GIGPO configuration +--advantage_estimator gigpo # Use GIGPO's two-level advantage function calculation +--use_gym_env true # Enable Gym environment support (required for GIGPO) +--gigpo_step_advantage_weight 1.0 # Weight coefficient for step-level advantage +``` + +### Important Parameter Descriptions + +- **`--advantage_estimator`**: Selects the advantage function estimation method + - `grpo` (default): Uses only trajectory-level advantage + - `rloo`: Uses leave-one-out method to construct baseline + - `gigpo`: Uses both trajectory-level and step-level advantages + +- **`--use_gym_env`**: Whether to enable Gym environment support + - `true`: Enabled (required for GIGPO, as it needs environmental state information) + - `false`: Disabled + +- **`--gigpo_step_advantage_weight`**: Weight coefficient $\omega$ for step-level advantage + - Controls the contribution of step-level advantage in the combined advantage + - Range: [0, +∞) + - Default value: 1.0 + +- **`--num_generations`**: Number of samples generated per prompt + - Increasing the number of samples can improve the stability of advantage estimation + +- **`--beta`**: KL divergence regularization coefficient + - Controls the degree of policy deviation from the reference policy + +Other parameters are the same as [GRPO parameters](../../Command-line-parameters.md#grpo-parameters) \ No newline at end of file
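To make the two-level computation above concrete, here is a minimal NumPy sketch of GiGPO-style advantages for a single prompt group. It is an illustration only, not the trainer's implementation: it assumes a toy trajectory format (lists of `(state, action, reward)` tuples whose hashable states serve as anchor states) and uses the group standard deviation as $F_{\text{norm}}$.

```python
import numpy as np
from collections import defaultdict


def gigpo_advantages(trajectories, omega=1.0, gamma=0.95):
    """Toy two-level GiGPO advantage computation for one prompt group.

    Each trajectory is a list of (state, action, reward) tuples; states that
    repeat across trajectories act as the anchor states described above.
    Returns one array of per-step advantages A = A^E + omega * A^S per trajectory.
    """
    eps = 1e-4
    # Episode-level advantage A^E: total return vs. the group statistics.
    returns = np.array([sum(r for _, _, r in traj) for traj in trajectories], dtype=float)
    a_episode = (returns - returns.mean()) / (returns.std() + eps)

    # Discounted step returns R_t, grouped by anchor state.
    groups = defaultdict(list)  # state -> list of (traj_idx, step_idx, R_t)
    for i, traj in enumerate(trajectories):
        rewards = [r for _, _, r in traj]
        running = 0.0
        step_returns = [0.0] * len(rewards)
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            step_returns[t] = running
        for t, (state, _, _) in enumerate(traj):
            groups[state].append((i, t, step_returns[t]))

    # Step-level advantage A^S: discounted return vs. its anchor-state group.
    a_step = [np.zeros(len(traj)) for traj in trajectories]
    for members in groups.values():
        vals = np.array([v for _, _, v in members])
        mean, std = vals.mean(), vals.std() + eps
        for i, t, v in members:
            a_step[i][t] = (v - mean) / std

    # Combined per-step advantage: A = A^E(tau_i) + omega * A^S(a_t).
    return [a_episode[i] + omega * a_step[i] for i in range(len(trajectories))]


# Two rollouts of the same task that pass through the same anchor states s0/s1.
advs = gigpo_advantages([
    [('s0', 'left', 0.0), ('s1', 'pick', 1.0)],
    [('s0', 'right', 0.0), ('s1', 'drop', 0.0)],
])
```

For comparison, the trainer hunks in this patch group steps by `(prompt_id, step)` index rather than by matched environment states, and use raw step rewards without the discounted-return step, which is a simpler stand-in for the anchor-state grouping above.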
diff --git a/swift/rlhf_trainers/args_mixin.py b/swift/rlhf_trainers/args_mixin.py index b3dbaea39c..b28bc0b10b 100644 --- a/swift/rlhf_trainers/args_mixin.py +++ b/swift/rlhf_trainers/args_mixin.py @@ -273,6 +273,8 @@ class GRPOArgumentsMixin(RolloutTrainerArgumentsMixin): `False`, it's an independent term in the loss function. If `True`, KL is directly incorporated into the reward (subtracted from it). The default is tied to `advantage_estimator`: `False` for 'grpo', `True` for 'rloo' and 'reinforce_plus_plus'. + gigpo_step_advantage_weight (float): The weight for the step-level advantage (A^S) in the GiGPO algorithm. + The default value is 1.0. generation_batch_size (Optional[int]): The batch size for sampling completions. It should be a multiple of `num_processes * per_device_train_batch_size`. Defaults to `per_device_batch_size * gradient_accumulation_steps * num_processes`. @@ -356,10 +358,12 @@ class GRPOArgumentsMixin(RolloutTrainerArgumentsMixin): tau_pos: float = 1.0 tau_neg: float = 1.05 - # RLOO, REINFORCE++ - advantage_estimator: Literal['grpo', 'rloo', 'reinforce_plus_plus'] = 'grpo' + # RLOO, REINFORCE++, GiGPO + advantage_estimator: Literal['grpo', 'rloo', 'reinforce_plus_plus', 'gigpo'] = 'grpo' # If false, add KL into loss, otherwise add into reward kl_in_reward: Optional[bool] = None # rloo/reinforce_plus_plus: true, grpo: false (default) + # GiGPO, https://arxiv.org/abs/2505.10978 + gigpo_step_advantage_weight: float = 1.0 generation_batch_size: Optional[int] = None steps_per_generation: Optional[int] = None diff --git a/swift/rlhf_trainers/grpo_trainer.py b/swift/rlhf_trainers/grpo_trainer.py index da978088d6..432a74077b 100644 --- a/swift/rlhf_trainers/grpo_trainer.py +++ b/swift/rlhf_trainers/grpo_trainer.py @@ -430,39 +430,35 @@ def normalize_advantages(advantages: torch.Tensor, rewards_std: torch.Tensor) -> def _compute_step_advantages(inputs, trajectory_advantages): # Extract step-level reward information from inputs # Store (prompt_id, step) -> [rewards] mapping - step_rewards_dict = {} - for idx, input_data in enumerate(inputs): + step_rewards_dict = defaultdict(list) + for input_data in inputs: prompt_id = input_data['prompt_id'] rollout_info = input_data['rollout_infos'] - - # Collect all step rewards for current trajectory for traj_info in rollout_info.get('trajectory_info', []): step = traj_info.get('step', 0) reward = traj_info.get('reward', 0.0) + step_rewards_dict[(prompt_id, step)].append(reward) + + # Pre-calculate mean rewards for each step + step_mean_rewards = {key: np.mean(rewards) for key, rewards in step_rewards_dict.items()} - # Group rewards by prompt_id and step - key = (prompt_id, step) - if key not in step_rewards_dict: - step_rewards_dict[key] = [] - step_rewards_dict[key].append(reward) # Calculate step-level advantage and aggregate aggregated_step_advantages = torch.zeros_like(trajectory_advantages) for idx, input_data in enumerate(inputs): prompt_id = input_data['prompt_id'] rollout_info = input_data['rollout_infos'] - # Calculate aggregated step-level advantage for current trajectory step_advantages = [] for traj_info in rollout_info.get('trajectory_info', []): step = traj_info.get('step', 0) reward = traj_info.get('reward', 0.0) - # Get all rewards for same prompt and step + # Get pre-calculated mean reward for the same prompt and step key = (prompt_id, step) - all_rewards = step_rewards_dict.get(key, [reward]) + # The key should always exist, but we use .get for safety.
+ mean_reward = step_mean_rewards.get(key, reward) - # Calculate step advantage (compared to group average) - mean_reward = np.mean(all_rewards) + # Calculate step advantage step_advantage = reward - mean_reward step_advantages.append(step_advantage) @@ -554,12 +550,13 @@ def log_rewards_all(rewards_per_func: torch.Tensor): advantages = rewards * K / (K - 1) - group_rewards_mean * K / (K - 1) else: advantages = rewards - group_rewards_mean - elif self.advantage_estimator == 'gigpo' and self.use_gym_env: + elif self.advantage_estimator == 'gigpo': + assert self.use_gym_env # Get trajectory-level advantage (original GRPO advantage) trajectory_advantages = rewards - group_rewards_mean aggregated_step_advantages = _compute_step_advantages(inputs, trajectory_advantages) # Weighted sum of trajectory-level advantage and aggregated step-level advantage - advantages = trajectory_advantages + self.step_advantage_w * aggregated_step_advantages + advantages = trajectory_advantages + self.gigpo_step_advantage_weight * aggregated_step_advantages else: # 'grpo' or 'reinforce_plus_plus' # Both use group mean as baseline advantages = rewards - group_rewards_mean @@ -729,7 +726,7 @@ def log_rewards_all(rewards_per_func: torch.Tensor): trajectory_advantages = advantages aggregated_step_advantages = _compute_step_advantages(inputs, trajectory_advantages) # Weighted sum of trajectory-level advantage and aggregated step-level advantage - advantages = trajectory_advantages + self.step_advantage_w * aggregated_step_advantages + advantages = trajectory_advantages + self.gigpo_step_advantage_weight * aggregated_step_advantages # Step 5. Log metrics for unique request_ids log_rewards_metrics(rewards=unique_rewards, rewards_per_func_for_metrics=rewards_per_func[unique_indices]) @@ -2227,8 +2224,8 @@ def _prepare_algorithm_params(self): logger.warning('GDPO mode does not support kl_in_reward=True. 
Setting kl_in_reward=False.') self.kl_in_reward = False - # GiGPO, https://arxiv.org/abs/2405.06708 - self.step_advantage_w = args.step_advantage_w + # GiGPO, https://arxiv.org/abs/2505.10978 + self.gigpo_step_advantage_weight = args.gigpo_step_advantage_weight # Rollout Importance Sampling Correction self.rollout_importance_sampling_mode = args.rollout_importance_sampling_mode From 23e2c137be87f45c50cca44c93f17e483e9623ec Mon Sep 17 00:00:00 2001 From: fengl Date: Wed, 28 Jan 2026 17:29:41 +0800 Subject: [PATCH 3/3] [grpo] support gigpo with gym - fix lint --- docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md | 2 +- docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md b/docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md index 65e1105ca4..539f3f7682 100644 --- a/docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md +++ b/docs/source/Instruction/GRPO/AdvancedResearch/GIGPO.md @@ -92,4 +92,4 @@ $$A(a_t^{(i)}) = A^E(\tau_i) + \omega \cdot A^S(a_t^{(i)})$$ - **`--beta`**:KL 散度正则化系数 - 控制策略偏离参考策略的程度 -其他参数与 [GRPO参数](../../Command-line-parameters.md#grpo参数) 一致 \ No newline at end of file +其他参数与 [GRPO参数](../../Command-line-parameters.md#grpo参数) 一致 diff --git a/docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md b/docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md index 3c15d79114..d2863bb89d 100644 --- a/docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md +++ b/docs/source_en/Instruction/GRPO/AdvancedResearch/GIGPO.md @@ -92,4 +92,4 @@ We can implement GIGPO training based on `GRPOTrainer` by setting the following - **`--beta`**: KL divergence regularization coefficient - Controls the degree of policy deviation from the reference policy -Other parameters are the same as [GRPO parameters](../../Command-line-parameters.md#grpo-parameters) \ No newline at end of file +Other parameters are the same as [GRPO parameters](../../Command-line-parameters.md#grpo-parameters)
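As a concrete illustration of the data flow that the trainer hunks above rely on, the following self-contained sketch mimics `_compute_step_advantages` on a toy batch. The field names (`prompt_id`, `rollout_infos`, `trajectory_info`, `step`, `reward`) follow the patch; the two-sample batch and its values are invented for illustration.

```python
import numpy as np
import torch
from collections import defaultdict

# Toy batch in the shape the patched trainer reads: two rollouts of the same prompt,
# each carrying per-step rewards in rollout_infos['trajectory_info'].
inputs = [
    {'prompt_id': 'p0', 'rollout_infos': {'trajectory_info': [
        {'step': 0, 'reward': 1.0}, {'step': 1, 'reward': 1.0}]}},
    {'prompt_id': 'p0', 'rollout_infos': {'trajectory_info': [
        {'step': 0, 'reward': 0.0}, {'step': 1, 'reward': 1.0}]}},
]
trajectory_advantages = torch.tensor([0.5, -0.5])  # stand-in for the GRPO group-relative advantages
gigpo_step_advantage_weight = 1.0

# Group step rewards by (prompt_id, step) and use the group mean as the step baseline.
step_rewards = defaultdict(list)
for sample in inputs:
    for info in sample['rollout_infos'].get('trajectory_info', []):
        step_rewards[(sample['prompt_id'], info.get('step', 0))].append(info.get('reward', 0.0))
step_means = {key: np.mean(vals) for key, vals in step_rewards.items()}

# Per-trajectory step advantage: mean over steps of (reward - step-group mean).
aggregated = torch.zeros_like(trajectory_advantages)
for idx, sample in enumerate(inputs):
    deltas = [info.get('reward', 0.0) - step_means[(sample['prompt_id'], info.get('step', 0))]
              for info in sample['rollout_infos'].get('trajectory_info', [])]
    aggregated[idx] = float(np.mean(deltas)) if deltas else 0.0

# Final GiGPO advantage: trajectory-level term plus weighted step-level term.
advantages = trajectory_advantages + gigpo_step_advantage_weight * aggregated
print(advantages)  # tensor([ 0.7500, -0.7500])
```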