-
Notifications
You must be signed in to change notification settings - Fork 390
Implement multi-component reward schema and REPL rubric metadata #625
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,12 @@ | |
| from typing import Any, Callable | ||
|
|
||
| from openenv.core.rubrics.base import Rubric | ||
| from openenv.core.rubrics.components import ( | ||
| RewardComponent, | ||
| RewardComponentType, | ||
| aggregate_weighted_sum, | ||
| serialize_reward_components, | ||
| ) | ||
|
|
||
|
|
||
| class ExactMatchRubric(Rubric): | ||
|
|
@@ -173,16 +179,61 @@ def set_expected(self, expected: str | None) -> None: | |
| if hasattr(self.outcome, "set_expected"): | ||
| self.outcome.set_expected(expected) | ||
|
|
||
| def forward(self, action: Any, observation: Any) -> float: | ||
| def evaluate_components(self, action: Any, observation: Any) -> list[RewardComponent]: | ||
| """Compute structured reward components for the current step.""" | ||
| done = getattr(observation, "done", False) | ||
| if done: | ||
| final = getattr(observation, "metadata", {}).get("final_answer") | ||
| if final is not None: | ||
| return self.outcome(action, observation) | ||
| # Done but no final answer (max iterations exhausted) | ||
| return self.failure_reward | ||
| # Non-terminal step: process reward only | ||
| return self.process(action, observation) | ||
| return [ | ||
| RewardComponent( | ||
| name="outcome_match", | ||
| type=RewardComponentType.BINARY, | ||
| value=float(self.outcome(action, observation)), | ||
| weight=1.0, | ||
| terminal_only=True, | ||
| ) | ||
| ] | ||
| return [ | ||
| RewardComponent( | ||
| name="max_iterations_failure", | ||
| type=RewardComponentType.PENALTY, | ||
| value=float(self.failure_reward), | ||
| weight=1.0, | ||
| terminal_only=True, | ||
| ) | ||
| ] | ||
|
|
||
| return [ | ||
| RewardComponent( | ||
| name="code_execution_quality", | ||
| type=RewardComponentType.SHAPING, | ||
| value=float(self.process(action, observation)), | ||
| weight=1.0, | ||
| terminal_only=False, | ||
| ) | ||
| ] | ||
|
|
||
| def _attach_reward_metadata( | ||
| self, | ||
| observation: Any, | ||
| components: list[RewardComponent], | ||
| total: float, | ||
| ) -> None: | ||
| """Attach component diagnostics to observation metadata if available.""" | ||
| if not hasattr(observation, "metadata"): | ||
| return | ||
| metadata = getattr(observation, "metadata", None) or {} | ||
| metadata["reward_components"] = serialize_reward_components(components) | ||
| metadata["reward_total"] = total | ||
| metadata["reward_aggregation"] = "weighted_sum_v1" | ||
| observation.metadata = metadata | ||
|
Comment on lines
+217
to
+230
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Context Used: .claude/docs/INVARIANTS.md (source) Prompt To Fix With AIThis is a comment left during a code review.
Path: envs/repl_env/rubrics.py
Line: 217-230
Comment:
**ALIGNMENT FLAG: Rubric directly mutates observation**
`_attach_reward_metadata` writes `reward_components`, `reward_total`, and `reward_aggregation` directly onto `observation.metadata` as a side-effect of reward computation. PATTERNS.md shows observations being constructed and returned by the environment (`step`), not mutated downstream. A rubric whose `forward` now carries an observation-mutation side-effect creates a responsibility leak: any caller of `REPLRubric.forward` that did not expect its observation to be modified (e.g., a Transform in the pipeline, a test double, or a future rubric composition) will silently receive a mutated object.
- **Principle at stake**: Separation of reward computation from observation construction (PATTERNS.md "Reward Computation" section; INVARIANTS.md "Rewards in environment")
- **The concern**: Rubric `forward` is documented to return a `float`; attaching metadata side-effects to the observation couples reward diagnostics to observation mutation in a non-obvious way and breaks composability
- **Suggested reviewer**: `@darktex`
**Context Used:** .claude/docs/INVARIANTS.md ([source](https://app.greptile.com/review/custom-context?memory=dbd1ab5e-bd4d-4701-9de0-9817404155a9))
How can I resolve this? If you propose a fix, please make it concise. |
||
|
|
||
| def forward(self, action: Any, observation: Any) -> float: | ||
| components = self.evaluate_components(action, observation) | ||
| total = aggregate_weighted_sum(components) | ||
| self._attach_reward_metadata(observation, components, total) | ||
| return total | ||
|
|
||
| def reset(self) -> None: | ||
| self.outcome.reset() | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,65 @@ | ||||||||||||||||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||||||||||||||||
| # All rights reserved. | ||||||||||||||||
| # | ||||||||||||||||
| # This source code is licensed under the BSD-style license found in the | ||||||||||||||||
| # LICENSE file in the root directory of this source tree. | ||||||||||||||||
|
|
||||||||||||||||
| """Reward component schema and helpers. | ||||||||||||||||
|
|
||||||||||||||||
| This module provides a standard representation for decomposed reward signals | ||||||||||||||||
| while preserving a scalar optimization target. | ||||||||||||||||
| """ | ||||||||||||||||
|
|
||||||||||||||||
| from enum import Enum | ||||||||||||||||
| from typing import Any, Dict, List | ||||||||||||||||
|
Comment on lines
+13
to
+14
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The file imports
Suggested change
Prompt To Fix With AIThis is a comment left during a code review.
Path: src/openenv/core/rubrics/components.py
Line: 13-14
Comment:
**Inconsistent `typing` imports alongside new-style union syntax**
The file imports `Dict` and `List` from `typing` (Python 3.8 compat style), but simultaneously uses `float | None` PEP-604 syntax (Python 3.10+). These two typing styles target incompatible Python version floors. Align to one style — either migrate all annotations to the modern built-in generics with `from __future__ import annotations`, or keep everything `typing`-module style (`Dict`, `List`, `Optional[float]`).
```suggestion
from typing import Any, Dict, List, Optional
```
How can I resolve this? If you propose a fix, please make it concise. |
||||||||||||||||
|
|
||||||||||||||||
| from pydantic import BaseModel, ConfigDict, Field | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| class RewardComponentType(str, Enum): | ||||||||||||||||
| """Common reward component styles for training diagnostics.""" | ||||||||||||||||
|
|
||||||||||||||||
| BINARY = "binary" | ||||||||||||||||
| SPARSE = "sparse" | ||||||||||||||||
| DENSE = "dense" | ||||||||||||||||
| SHAPING = "shaping" | ||||||||||||||||
| PENALTY = "penalty" | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| class RewardComponent(BaseModel): | ||||||||||||||||
| """Structured reward component emitted by a rubric/environment.""" | ||||||||||||||||
|
|
||||||||||||||||
| model_config = ConfigDict(extra="forbid") | ||||||||||||||||
|
|
||||||||||||||||
| name: str = Field(..., description="Stable component identifier") | ||||||||||||||||
| type: RewardComponentType = Field(..., description="Reward component style") | ||||||||||||||||
| value: float = Field(..., description="Raw component value before weighting") | ||||||||||||||||
| weight: float = Field(default=1.0, description="Aggregation weight") | ||||||||||||||||
| weighted_value: float | None = Field( | ||||||||||||||||
| default=None, | ||||||||||||||||
| description="Optional explicit weighted value (defaults to value * weight)", | ||||||||||||||||
|
Comment on lines
+38
to
+40
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Prompt To Fix With AIThis is a comment left during a code review.
Path: src/openenv/core/rubrics/components.py
Line: 38-40
Comment:
**Python <3.10 runtime incompatibility**
`float | None` (PEP 604 union syntax) requires Python 3.10+ for *runtime* evaluation. Without `from __future__ import annotations`, this annotation is evaluated eagerly at class definition time, raising `TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'` on Python 3.8/3.9. The rest of the codebase (e.g. `models.py`) consistently uses `Optional[T]`, signaling Python < 3.10 is supported. Either add `from __future__ import annotations` at the top of this file, or replace the union with `Optional[float]`.
```suggestion
weighted_value: Optional[float] = Field(
default=None,
description="Optional explicit weighted value (defaults to value * weight)",
)
```
How can I resolve this? If you propose a fix, please make it concise. |
||||||||||||||||
| ) | ||||||||||||||||
| terminal_only: bool = Field( | ||||||||||||||||
| default=False, | ||||||||||||||||
| description="Whether this component is meaningful only at terminal steps", | ||||||||||||||||
| ) | ||||||||||||||||
| metadata: Dict[str, Any] = Field( | ||||||||||||||||
| default_factory=dict, | ||||||||||||||||
| description="Additional component-specific diagnostics", | ||||||||||||||||
| ) | ||||||||||||||||
|
|
||||||||||||||||
| def effective_weighted_value(self) -> float: | ||||||||||||||||
| """Return weighted component value, respecting explicit overrides.""" | ||||||||||||||||
| if self.weighted_value is not None: | ||||||||||||||||
| return self.weighted_value | ||||||||||||||||
| return self.value * self.weight | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| def aggregate_weighted_sum(components: List[RewardComponent]) -> float: | ||||||||||||||||
| """Aggregate reward components with weighted-sum semantics.""" | ||||||||||||||||
| return float(sum(component.effective_weighted_value() for component in components)) | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| def serialize_reward_components(components: List[RewardComponent]) -> List[Dict[str, Any]]: | ||||||||||||||||
| """Serialize reward components for observation metadata.""" | ||||||||||||||||
| return [component.model_dump() for component in components] | ||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| # Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| # All rights reserved. | ||
| # | ||
| # This source code is licensed under the BSD-style license found in the | ||
| # LICENSE file in the root directory of this source tree. | ||
|
|
||
| """Tests for reward component schema and helpers.""" | ||
|
|
||
| from openenv.core.rubrics.components import ( | ||
| RewardComponent, | ||
| RewardComponentType, | ||
| aggregate_weighted_sum, | ||
| serialize_reward_components, | ||
| ) | ||
|
|
||
|
|
||
| class TestRewardComponent: | ||
| def test_weighted_value_defaults_to_value_times_weight(self): | ||
| component = RewardComponent( | ||
| name="progress", | ||
| type=RewardComponentType.DENSE, | ||
| value=0.6, | ||
| weight=0.5, | ||
| ) | ||
| assert component.effective_weighted_value() == 0.3 | ||
|
|
||
| def test_explicit_weighted_value_takes_precedence(self): | ||
| component = RewardComponent( | ||
| name="safety_penalty", | ||
| type=RewardComponentType.PENALTY, | ||
| value=-1.0, | ||
| weight=0.2, | ||
| weighted_value=-0.5, | ||
| ) | ||
| assert component.effective_weighted_value() == -0.5 | ||
|
|
||
|
|
||
| class TestRewardComponentHelpers: | ||
| def test_aggregate_weighted_sum(self): | ||
| components = [ | ||
| RewardComponent( | ||
| name="success", | ||
| type=RewardComponentType.BINARY, | ||
| value=1.0, | ||
| weight=0.7, | ||
| ), | ||
| RewardComponent( | ||
| name="format", | ||
| type=RewardComponentType.SPARSE, | ||
| value=1.0, | ||
| weight=0.3, | ||
| ), | ||
| ] | ||
| assert aggregate_weighted_sum(components) == 1.0 | ||
|
|
||
| def test_serialize_reward_components(self): | ||
| components = [ | ||
| RewardComponent( | ||
| name="step_quality", | ||
| type=RewardComponentType.SHAPING, | ||
| value=-0.05, | ||
| ) | ||
| ] | ||
| payload = serialize_reward_components(components) | ||
| assert len(payload) == 1 | ||
| assert payload[0]["name"] == "step_quality" | ||
| assert payload[0]["type"] == "shaping" | ||
| assert payload[0]["value"] == -0.05 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getattr(observation, "metadata", None) or {}evaluates to a new empty dict whenobservation.metadatais{}(empty dicts are falsy). Any keys already present would be discarded. Prefer an explicitNonecheck to make the intent clear.Prompt To Fix With AI