Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 57 additions & 6 deletions envs/repl_env/rubrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
from typing import Any, Callable

from openenv.core.rubrics.base import Rubric
from openenv.core.rubrics.components import (
RewardComponent,
RewardComponentType,
aggregate_weighted_sum,
serialize_reward_components,
)


class ExactMatchRubric(Rubric):
Expand Down Expand Up @@ -173,16 +179,61 @@ def set_expected(self, expected: str | None) -> None:
if hasattr(self.outcome, "set_expected"):
self.outcome.set_expected(expected)

def forward(self, action: Any, observation: Any) -> float:
def evaluate_components(self, action: Any, observation: Any) -> list[RewardComponent]:
"""Compute structured reward components for the current step."""
done = getattr(observation, "done", False)
if done:
final = getattr(observation, "metadata", {}).get("final_answer")
if final is not None:
return self.outcome(action, observation)
# Done but no final answer (max iterations exhausted)
return self.failure_reward
# Non-terminal step: process reward only
return self.process(action, observation)
return [
RewardComponent(
name="outcome_match",
type=RewardComponentType.BINARY,
value=float(self.outcome(action, observation)),
weight=1.0,
terminal_only=True,
)
]
return [
RewardComponent(
name="max_iterations_failure",
type=RewardComponentType.PENALTY,
value=float(self.failure_reward),
weight=1.0,
terminal_only=True,
)
]

return [
RewardComponent(
name="code_execution_quality",
type=RewardComponentType.SHAPING,
value=float(self.process(action, observation)),
weight=1.0,
terminal_only=False,
)
]

def _attach_reward_metadata(
self,
observation: Any,
components: list[RewardComponent],
total: float,
) -> None:
"""Attach component diagnostics to observation metadata if available."""
if not hasattr(observation, "metadata"):
return
metadata = getattr(observation, "metadata", None) or {}
Comment on lines +224 to +226

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Silent metadata clobber on empty-dict metadata

getattr(observation, "metadata", None) or {} evaluates to a new empty dict when observation.metadata is {} (empty dicts are falsy). Any keys already present would be discarded. Prefer an explicit None check to make the intent clear.

Suggested change
if not hasattr(observation, "metadata"):
return
metadata = getattr(observation, "metadata", None) or {}
metadata = getattr(observation, "metadata", None)
if metadata is None:
metadata = {}
Prompt To Fix With AI
This is a comment left during a code review.
Path: envs/repl_env/rubrics.py
Line: 224-226

Comment:
**Silent metadata clobber on empty-dict metadata**

`getattr(observation, "metadata", None) or {}` evaluates to a *new* empty dict when `observation.metadata` is `{}` (empty dicts are falsy). Any keys already present would be discarded. Prefer an explicit `None` check to make the intent clear.

```suggestion
        metadata = getattr(observation, "metadata", None)
        if metadata is None:
            metadata = {}
```

How can I resolve this? If you propose a fix, please make it concise.

metadata["reward_components"] = serialize_reward_components(components)
metadata["reward_total"] = total
metadata["reward_aggregation"] = "weighted_sum_v1"
observation.metadata = metadata
Comment on lines +217 to +230

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 ALIGNMENT FLAG: Rubric directly mutates observation

_attach_reward_metadata writes reward_components, reward_total, and reward_aggregation directly onto observation.metadata as a side-effect of reward computation. PATTERNS.md shows observations being constructed and returned by the environment (step), not mutated downstream. A rubric whose forward now carries an observation-mutation side-effect creates a responsibility leak: any caller of REPLRubric.forward that did not expect its observation to be modified (e.g., a Transform in the pipeline, a test double, or a future rubric composition) will silently receive a mutated object.

  • Principle at stake: Separation of reward computation from observation construction (PATTERNS.md "Reward Computation" section; INVARIANTS.md "Rewards in environment")
  • The concern: Rubric forward is documented to return a float; attaching metadata side-effects to the observation couples reward diagnostics to observation mutation in a non-obvious way and breaks composability
  • Suggested reviewer: @darktex

Context Used: .claude/docs/INVARIANTS.md (source)

Prompt To Fix With AI
This is a comment left during a code review.
Path: envs/repl_env/rubrics.py
Line: 217-230

Comment:
**ALIGNMENT FLAG: Rubric directly mutates observation**

`_attach_reward_metadata` writes `reward_components`, `reward_total`, and `reward_aggregation` directly onto `observation.metadata` as a side-effect of reward computation. PATTERNS.md shows observations being constructed and returned by the environment (`step`), not mutated downstream. A rubric whose `forward` now carries an observation-mutation side-effect creates a responsibility leak: any caller of `REPLRubric.forward` that did not expect its observation to be modified (e.g., a Transform in the pipeline, a test double, or a future rubric composition) will silently receive a mutated object.

- **Principle at stake**: Separation of reward computation from observation construction (PATTERNS.md "Reward Computation" section; INVARIANTS.md "Rewards in environment")
- **The concern**: Rubric `forward` is documented to return a `float`; attaching metadata side-effects to the observation couples reward diagnostics to observation mutation in a non-obvious way and breaks composability
- **Suggested reviewer**: `@darktex`

**Context Used:** .claude/docs/INVARIANTS.md ([source](https://app.greptile.com/review/custom-context?memory=dbd1ab5e-bd4d-4701-9de0-9817404155a9))

How can I resolve this? If you propose a fix, please make it concise.


def forward(self, action: Any, observation: Any) -> float:
components = self.evaluate_components(action, observation)
total = aggregate_weighted_sum(components)
self._attach_reward_metadata(observation, components, total)
return total

def reset(self) -> None:
self.outcome.reset()
Expand Down
11 changes: 11 additions & 0 deletions src/openenv/core/rubrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
"""

from openenv.core.rubrics.base import Rubric
from openenv.core.rubrics.components import (
RewardComponent,
RewardComponentType,
aggregate_weighted_sum,
serialize_reward_components,
)
from openenv.core.rubrics.containers import (
Gate,
RubricDict,
Expand All @@ -26,6 +32,11 @@
__all__ = [
# Base
"Rubric",
# Components
"RewardComponent",
"RewardComponentType",
"aggregate_weighted_sum",
"serialize_reward_components",
# Containers
"Sequential",
"Gate",
Expand Down
65 changes: 65 additions & 0 deletions src/openenv/core/rubrics/components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Reward component schema and helpers.

This module provides a standard representation for decomposed reward signals
while preserving a scalar optimization target.
"""

from enum import Enum
from typing import Any, Dict, List
Comment on lines +13 to +14

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Inconsistent typing imports alongside new-style union syntax

The file imports Dict and List from typing (Python 3.8 compat style), but simultaneously uses float | None PEP-604 syntax (Python 3.10+). These two typing styles target incompatible Python version floors. Align to one style — either migrate all annotations to the modern built-in generics with from __future__ import annotations, or keep everything typing-module style (Dict, List, Optional[float]).

Suggested change
from enum import Enum
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
Prompt To Fix With AI
This is a comment left during a code review.
Path: src/openenv/core/rubrics/components.py
Line: 13-14

Comment:
**Inconsistent `typing` imports alongside new-style union syntax**

The file imports `Dict` and `List` from `typing` (Python 3.8 compat style), but simultaneously uses `float | None` PEP-604 syntax (Python 3.10+). These two typing styles target incompatible Python version floors. Align to one style — either migrate all annotations to the modern built-in generics with `from __future__ import annotations`, or keep everything `typing`-module style (`Dict`, `List`, `Optional[float]`).

```suggestion
from typing import Any, Dict, List, Optional
```

How can I resolve this? If you propose a fix, please make it concise.


from pydantic import BaseModel, ConfigDict, Field


class RewardComponentType(str, Enum):
"""Common reward component styles for training diagnostics."""

BINARY = "binary"
SPARSE = "sparse"
DENSE = "dense"
SHAPING = "shaping"
PENALTY = "penalty"


class RewardComponent(BaseModel):
"""Structured reward component emitted by a rubric/environment."""

model_config = ConfigDict(extra="forbid")

name: str = Field(..., description="Stable component identifier")
type: RewardComponentType = Field(..., description="Reward component style")
value: float = Field(..., description="Raw component value before weighting")
weight: float = Field(default=1.0, description="Aggregation weight")
weighted_value: float | None = Field(
default=None,
description="Optional explicit weighted value (defaults to value * weight)",
Comment on lines +38 to +40

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Python <3.10 runtime incompatibility

float | None (PEP 604 union syntax) requires Python 3.10+ for runtime evaluation. Without from __future__ import annotations, this annotation is evaluated eagerly at class definition time, raising TypeError: unsupported operand type(s) for |: 'type' and 'NoneType' on Python 3.8/3.9. The rest of the codebase (e.g. models.py) consistently uses Optional[T], signaling Python < 3.10 is supported. Either add from __future__ import annotations at the top of this file, or replace the union with Optional[float].

Suggested change
weighted_value: float | None = Field(
default=None,
description="Optional explicit weighted value (defaults to value * weight)",
weighted_value: Optional[float] = Field(
default=None,
description="Optional explicit weighted value (defaults to value * weight)",
)
Prompt To Fix With AI
This is a comment left during a code review.
Path: src/openenv/core/rubrics/components.py
Line: 38-40

Comment:
**Python <3.10 runtime incompatibility**

`float | None` (PEP 604 union syntax) requires Python 3.10+ for *runtime* evaluation. Without `from __future__ import annotations`, this annotation is evaluated eagerly at class definition time, raising `TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'` on Python 3.8/3.9. The rest of the codebase (e.g. `models.py`) consistently uses `Optional[T]`, signaling Python < 3.10 is supported. Either add `from __future__ import annotations` at the top of this file, or replace the union with `Optional[float]`.

```suggestion
    weighted_value: Optional[float] = Field(
        default=None,
        description="Optional explicit weighted value (defaults to value * weight)",
    )
```

How can I resolve this? If you propose a fix, please make it concise.

)
terminal_only: bool = Field(
default=False,
description="Whether this component is meaningful only at terminal steps",
)
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Additional component-specific diagnostics",
)

def effective_weighted_value(self) -> float:
"""Return weighted component value, respecting explicit overrides."""
if self.weighted_value is not None:
return self.weighted_value
return self.value * self.weight


def aggregate_weighted_sum(components: List[RewardComponent]) -> float:
"""Aggregate reward components with weighted-sum semantics."""
return float(sum(component.effective_weighted_value() for component in components))


def serialize_reward_components(components: List[RewardComponent]) -> List[Dict[str, Any]]:
"""Serialize reward components for observation metadata."""
return [component.model_dump() for component in components]
68 changes: 68 additions & 0 deletions tests/core/test_rubrics/test_reward_components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Tests for reward component schema and helpers."""

from openenv.core.rubrics.components import (
RewardComponent,
RewardComponentType,
aggregate_weighted_sum,
serialize_reward_components,
)


class TestRewardComponent:
def test_weighted_value_defaults_to_value_times_weight(self):
component = RewardComponent(
name="progress",
type=RewardComponentType.DENSE,
value=0.6,
weight=0.5,
)
assert component.effective_weighted_value() == 0.3

def test_explicit_weighted_value_takes_precedence(self):
component = RewardComponent(
name="safety_penalty",
type=RewardComponentType.PENALTY,
value=-1.0,
weight=0.2,
weighted_value=-0.5,
)
assert component.effective_weighted_value() == -0.5


class TestRewardComponentHelpers:
def test_aggregate_weighted_sum(self):
components = [
RewardComponent(
name="success",
type=RewardComponentType.BINARY,
value=1.0,
weight=0.7,
),
RewardComponent(
name="format",
type=RewardComponentType.SPARSE,
value=1.0,
weight=0.3,
),
]
assert aggregate_weighted_sum(components) == 1.0

def test_serialize_reward_components(self):
components = [
RewardComponent(
name="step_quality",
type=RewardComponentType.SHAPING,
value=-0.05,
)
]
payload = serialize_reward_components(components)
assert len(payload) == 1
assert payload[0]["name"] == "step_quality"
assert payload[0]["type"] == "shaping"
assert payload[0]["value"] == -0.05
14 changes: 14 additions & 0 deletions tests/envs/test_repl_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,13 @@ def test_rubric_reward_on_success(self):
obs = env.step(REPLAction(code="print('FINAL(done)')"))
assert obs.done
assert obs.reward == 1.0
assert obs.metadata["reward_total"] == 1.0
assert obs.metadata["reward_aggregation"] == "weighted_sum_v1"
assert len(obs.metadata["reward_components"]) == 1
component = obs.metadata["reward_components"][0]
assert component["name"] == "outcome_match"
assert component["type"] == "binary"
assert component["terminal_only"] is True

def test_rubric_reward_on_wrong_answer(self):
"""Test rubric reward when final answer does not match expected."""
Expand All @@ -299,13 +306,20 @@ def test_rubric_reward_on_wrong_answer(self):
obs = env.step(REPLAction(code="print('FINAL(wrong)')"))
assert obs.done
assert obs.reward == 0.0
assert obs.metadata["reward_total"] == 0.0
assert obs.metadata["reward_components"][0]["name"] == "outcome_match"
assert obs.metadata["reward_components"][0]["type"] == "binary"

def test_rubric_reward_on_error(self):
"""Test rubric process reward on code error."""
env = REPLEnvironment()
env.reset()
obs = env.step(REPLAction(code="raise ValueError()"))
assert obs.reward == -0.05 # default CodeExecutionRubric error_penalty
assert obs.metadata["reward_total"] == -0.05
assert obs.metadata["reward_components"][0]["name"] == "code_execution_quality"
assert obs.metadata["reward_components"][0]["type"] == "shaping"
assert obs.metadata["reward_components"][0]["terminal_only"] is False

def test_close(self):
"""Test close cleans up resources."""
Expand Down