Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/.vitepress/config.mts
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ export default defineConfig({
text: "Guides",
items: [
{ text: "Getting Started", link: "/guides/getting-started" },
{ text: "Launching a Server", link: "/guides/launching" }
{ text: "Launching a Server", link: "/guides/launching" },
{ text: "Computing Log-Probabilities", link: "/guides/compute-log-probs" }
]
},
{
Expand Down
68 changes: 68 additions & 0 deletions docs/guides/compute-log-probs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Computing Log-Probabilities (RL Scoring)

`Engine.compute_log_probs` scores `prompt + completion` token sequences under the
engine's current weights and returns one log-probability per completion token. It
is the core scoring primitive for online-RL trainers (PPO, GRPO, and any
KL-penalised objective) — for example to form importance-sampling ratios against
the policy that generated the rollouts.

## Usage

```python
from tokenspeed.runtime.entrypoints.engine import Engine

# Scoring runs a pure-extend (prefill-only) forward. On backends that cannot
# serve a mixed prefill+decode batch eagerly (e.g. the default `mha` backend),
# launch the engine for scoring with a backend + scheduler config that keeps the
# request on a pure-extend path:
engine = Engine(
model="<model-path>",
attention_backend="flashinfer",
enforce_eager=True,
disable_overlap_schedule=True,
)

out = engine.compute_log_probs(
sequences=[
{"prompt_token_ids": [1, 2, 3, 4], "completion_token_ids": [5, 6, 7]},
{"prompt_token_ids": [10, 11], "completion_token_ids": [12]},
],
temperature=1.0,
)

# out["log_probs"][i][j] == log P(completion_token_ids[i][j] | context)
# out["tokens"][i] == completion_token_ids[i]
out["log_probs"] # e.g. [[-0.12, -0.47, -0.31], [-2.03]]
out["tokens"] # [[5, 6, 7], [12]]
```

`log_probs[i][j]` is the log-probability of the realised completion token `j` in
sequence `i`, conditioned on everything before it (prompt + earlier completion
tokens). Only completion positions are scored; the prompt is context.

## How it works

It reuses the normal generation path: internally each sequence is sent through a
forward-only `generate` call (`max_new_tokens=0`, `return_logprob=True`,
`logprob_start_len=len(prompt) - 1`), and the per-token input logprobs are read back
from `meta_info["input_token_logprobs"]`. The window starts one token *before* the
completion because the log-probability of a token is read from the logits at the
preceding position; the engine also returns one trailing entry for the sampled
position, which is dropped. Logits are gathered across tensor-parallel
ranks before `log_softmax`, exactly as on the sampling path. No engine pause is
required; scoring requests can be interleaved with normal generation.

Long sequences are handled across chunked prefill: when a `prompt + completion`
is split into multiple prefill chunks, the input-logprob window is collected from
every chunk it overlaps (not just the first), so the full set of completion
logprobs is returned regardless of `chunked_prefill_size`.

## Limits (current)

- **Temperature:** `temperature=1.0` only (raw `log_softmax`). Other values raise
`NotImplementedError`. Sampling-temperature scaling (for off-policy importance
sampling) is a planned follow-up.
- **Speculative decoding:** unavailable — `compute_log_probs` raises if the engine
was launched with a speculative algorithm (the generation path disables logprobs
in that mode).
- **Prompt/completion:** both must be non-empty (the first completion token needs
prior context to be scored).
- **Surface:** exposed as the `Engine` Python method. A native HTTP / SMG endpoint
is deferred until there is a consumer for it.
142 changes: 142 additions & 0 deletions python/tokenspeed/runtime/engine/compute_log_probs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Copyright (c) 2026 LightSeek Foundation
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Pure, GPU-free helpers for the compute_log_probs API (RL-plan Milestone 2).

The engine scores ``prompt + completion`` sequences by reusing the normal
generation path: a forward-only ``generate`` call with ``return_logprob=True``
and ``logprob_start_len=len(prompt) - 1`` makes ``meta_info['input_token_logprobs']``
carry the per-completion-token logprobs followed by one trailing
sampled-position entry. These helpers build that call and parse its result;
``Engine.compute_log_probs`` wires them to ``self.generate``.
"""

from __future__ import annotations

from typing import Any, Callable

# Default scoring temperature. compute_log_probs_core rejects any other value
# (raw log_softmax only), so this is also the only temperature it accepts.
DEFAULT_TEMPERATURE = 1.0
# ``max_new_tokens`` sent with every scoring request; 0 keeps the call
# forward-only. Set to 1 if the GPU spike shows max_new_tokens=0 is unsupported;
# the single generated token lands in output_token_logprobs, never
# input_token_logprobs.
SCORE_MAX_NEW_TOKENS = 0


class InvalidSequenceError(ValueError):
    """Signal that a sequence is unscorable: its prompt or completion is empty."""


def validate_sequence(
    prompt_token_ids: list[int], completion_token_ids: list[int]
) -> None:
    """Reject sequences that cannot be scored.

    Both token lists must be non-empty. The prompt is checked first, so an
    entirely empty sequence reports the prompt problem.

    Raises:
        InvalidSequenceError: if either list is empty.
    """
    requirements = (
        (
            prompt_token_ids,
            "prompt_token_ids must be non-empty: the first completion token needs "
            "prior context to be scored.",
        ),
        (
            completion_token_ids,
            "completion_token_ids must be non-empty: nothing to score.",
        ),
    )
    for token_ids, complaint in requirements:
        if not token_ids:
            raise InvalidSequenceError(complaint)


def build_score_kwargs(
    prompt_token_ids: list[int],
    completion_token_ids: list[int],
    temperature: float = DEFAULT_TEMPERATURE,
) -> dict[str, Any]:
    """Assemble keyword arguments for a forward-only ``Engine.generate`` call.

    The request feeds ``prompt + completion`` as ``input_ids`` and asks for
    input-token logprobs starting at ``len(prompt) - 1``: the logprob of a
    completion token is read from the logits at the *preceding* position, so
    the window opens one token before the completion. From there the engine
    returns one entry per position to the end, i.e. the M completion logprobs
    plus one trailing sampled-position entry (target token -1) that
    ``extract_completion_logprobs`` discards. (Verified on B200.)

    Raises:
        InvalidSequenceError: if the prompt or the completion is empty.
        ValueError: if ``temperature`` is not strictly positive.
    """
    validate_sequence(prompt_token_ids, completion_token_ids)
    # This helper only requires a positive temperature. compute_log_probs_core
    # applies its own, stricter v1 gate (temperature must equal 1.0); the two
    # checks serve different callers, so the difference is deliberate.
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    full_sequence = [*prompt_token_ids, *completion_token_ids]
    sampling = {
        "max_new_tokens": SCORE_MAX_NEW_TOKENS,
        "temperature": temperature,
    }
    window_start = len(prompt_token_ids) - 1
    return {
        "input_ids": full_sequence,
        "sampling_params": sampling,
        "return_logprob": True,
        "logprob_start_len": window_start,
    }


def extract_completion_logprobs(
    meta_info: dict[str, Any], num_completion: int
) -> tuple[list[float], list[int]]:
    """Pull the completion logprobs and token ids out of a generate result.

    ``meta_info['input_token_logprobs']`` holds ``(logprob, token_id,
    text_or_None)`` entries. With the window opened at ``len(prompt) - 1`` the
    engine emits the M completion entries first and then one trailing
    sampled-position entry; only the leading ``num_completion`` entries are
    kept.

    Returns:
        ``(log_probs, tokens)`` with ``num_completion`` items each.

    Raises:
        ValueError: if the entries are missing, empty, or fewer than
            ``num_completion``. That points at a wrong logprob window or at
            input logprobs not being produced, so a short result is never
            returned silently.
    """
    raw_entries = meta_info.get("input_token_logprobs")
    too_few = not raw_entries or len(raw_entries) < num_completion
    if too_few:
        got = len(raw_entries) if raw_entries is not None else 0
        raise ValueError(
            f"expected at least {num_completion} completion logprobs, got {got}; "
            "check logprob_start_len alignment / input-logprob support."
        )

    # Drop everything past the completion (the trailing sampled-position entry).
    window = raw_entries[:num_completion]
    log_probs = [float(entry[0]) for entry in window]
    tokens = [int(entry[1]) for entry in window]
    return log_probs, tokens


def compute_log_probs_core(
    sequences: list[dict[str, list[int]]],
    generate_fn: Callable[..., dict[str, Any]],
    temperature: float = DEFAULT_TEMPERATURE,
) -> dict[str, list[list[float]]]:
    """Score each sequence by calling ``generate_fn`` and parsing the result.

    Args:
        sequences: one dict per sequence with ``prompt_token_ids`` and
            ``completion_token_ids``; both lists must be non-empty.
        generate_fn: must have the signature of ``Engine.generate`` and return
            a single result dict (non-streaming) carrying ``meta_info``. It is
            called once per sequence, in order.
        temperature: v1 supports only ``1.0`` (raw log_softmax), matching the
            engine's default ``temp_scaled_logprobs=False`` path.

    Returns:
        ``{"log_probs": [...], "tokens": [...]}`` where entry ``i`` of each list
        holds the per-completion-token logprobs (floats) and token ids (ints)
        of ``sequences[i]``.

    Raises:
        NotImplementedError: if ``temperature`` is not 1.0.
        InvalidSequenceError: if any sequence has an empty prompt or
            completion. The whole batch is validated before the first
            ``generate_fn`` call, so no forward pass runs for a batch that is
            going to be rejected.
        ValueError: if a result carries fewer input logprobs than completion
            tokens.
    """
    if temperature != DEFAULT_TEMPERATURE:
        raise NotImplementedError(
            "compute_log_probs v1 supports temperature=1.0 (raw log_softmax) only; "
            f"got {temperature}. Sampling-temperature scaling is a follow-up."
        )

    # Materialise the (prompt, completion) pairs once; this keeps references
    # only and lets the batch be walked twice.
    pairs = [
        (seq["prompt_token_ids"], seq["completion_token_ids"]) for seq in sequences
    ]

    # Fail fast: reject a bad batch before spending any forward pass on it.
    # Previously a bad sequence at index k was discovered only after k
    # sequences had already been scored, and that work was thrown away.
    for index, (prompt_ids, completion_ids) in enumerate(pairs):
        try:
            validate_sequence(prompt_ids, completion_ids)
        except InvalidSequenceError as err:
            raise InvalidSequenceError(f"sequences[{index}]: {err}") from err

    log_probs_out: list[list[float]] = []
    tokens_out: list[list[int]] = []
    for prompt_ids, completion_ids in pairs:
        kwargs = build_score_kwargs(prompt_ids, completion_ids, temperature)
        result = generate_fn(**kwargs)
        log_probs, tokens = extract_completion_logprobs(
            result["meta_info"], len(completion_ids)
        )
        log_probs_out.append(log_probs)
        tokens_out.append(tokens)
    return {"log_probs": log_probs_out, "tokens": tokens_out}
70 changes: 68 additions & 2 deletions python/tokenspeed/runtime/engine/generation_output_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ def __init__(
self.output_token_logprobs_idx: list[int] | None = (
[] if return_logprob else None
)
# Input/prompt-token logprobs (accumulated across prefill chunks when requested).
self.input_token_logprobs_val: list[float] | None = (
[] if return_logprob else None
)
self.input_token_logprobs_idx: list[int] | None = [] if return_logprob else None
self.input_token_logprobs_sent: bool = False

# --- Streaming bookkeeping (internal) ---
self._surr_offset: int | None = None
Expand Down Expand Up @@ -521,6 +527,20 @@ def post_process_forward_op(
if model_execution_results.output_logprobs is not None
else None
)
# Input/prompt-token logprobs for this forward (pure-extend scoring
# batches only): a flat (values, token_ids) pair over extend requests.
_input_logprobs_pair = model_execution_results.input_token_logprobs
input_logprobs_val = (
_input_logprobs_pair[0].tolist()
if _input_logprobs_pair is not None
else None
)
input_logprobs_idx = (
_input_logprobs_pair[1].tolist()
if _input_logprobs_pair is not None and _input_logprobs_pair[1] is not None
else None
)
ilp_pt = 0
pt = 0
for i, rid in enumerate(forward_op.request_ids):
output_length = model_execution_results.output_lengths[i].item()
Expand All @@ -538,12 +558,41 @@ def post_process_forward_op(
else:
pt += output_length

# Slice this request's input/prompt logprobs and advance the flat
# pointer BEFORE any `continue`, so alignment holds across requests.
req_input_lp_val = None
req_input_lp_idx = None
if input_logprobs_val is not None and i < num_extends:
sl = int(forward_op.extend_logprob_start_lens[i])
if sl >= 0:
plen = int(forward_op.input_lengths[i]) - sl
if plen > 0:
req_input_lp_val = input_logprobs_val[ilp_pt : ilp_pt + plen]
if input_logprobs_idx is not None:
req_input_lp_idx = input_logprobs_idx[
ilp_pt : ilp_pt + plen
]
ilp_pt += plen

if rid not in self.rid_to_state:
# means it's delayed token, do not process
continue

request_state: RequestState = self.rid_to_state[rid]

# Accumulate input/prompt logprobs BEFORE the chunked-prefill guard
# below: when a scored sequence spans multiple prefill chunks, each
# non-final chunk contributes part of the requested window. Skipping
# this on chunk boundaries would drop those tokens and leave
# compute_log_probs with fewer logprobs than completion tokens.
if (
req_input_lp_val is not None
and request_state.input_token_logprobs_val is not None
):
Comment on lines +588 to +591
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve input logprobs across prefill chunks

When a scoring request is chunked, non-final prefill forwards can already carry req_input_lp_val, but this new accumulation runs only after the existing prefill_finished guard. Those chunk results are therefore discarded instead of being stored on RequestState, so long prompt+completion sequences return only the final chunk's logprobs (or none) and compute_log_probs raises/incompletely scores. Accumulate the input logprobs before skipping chunked-prefill output.

Useful? React with 👍 / 👎.

request_state.input_token_logprobs_val.extend(req_input_lp_val)
if req_input_lp_idx is not None:
request_state.input_token_logprobs_idx.extend(req_input_lp_idx)

# Do not output chunking result
if not request_state.prefill_finished:
continue
Expand Down Expand Up @@ -706,6 +755,8 @@ def stream_output(
output_extra_infos: list[dict] = []
output_token_logprobs_val: list[list[float]] = []
output_token_logprobs_idx: list[list[int]] = []
input_token_logprobs_val: list[list[float]] = []
input_token_logprobs_idx: list[list[int]] = []

for i, rs in enumerate(output_states):
# For finished requests, always output (unless already output)
Expand Down Expand Up @@ -785,6 +836,21 @@ def stream_output(
output_token_logprobs_val.append([])
output_token_logprobs_idx.append([])

# Input/prompt logprobs are produced once at prefill; ship them on
# the first output for this request, then mark sent so multi-token
# generations don't resend them every stream step.
if (
rs.return_logprob
and rs.input_token_logprobs_val
and not rs.input_token_logprobs_sent
):
input_token_logprobs_val.append(list(rs.input_token_logprobs_val))
input_token_logprobs_idx.append(list(rs.input_token_logprobs_idx))
rs.input_token_logprobs_sent = True
else:
input_token_logprobs_val.append([])
input_token_logprobs_idx.append([])

# Don't send empty batch to detokenizer
if len(rids_to_send) == 0:
return
Expand All @@ -804,8 +870,8 @@ def stream_output(
completion_tokens=completion_tokens,
cached_tokens=cached_tokens,
spec_verify_ct=spec_verify_ct,
input_token_logprobs_val=[],
input_token_logprobs_idx=[],
input_token_logprobs_val=input_token_logprobs_val,
input_token_logprobs_idx=input_token_logprobs_idx,
output_token_logprobs_val=output_token_logprobs_val,
output_token_logprobs_idx=output_token_logprobs_idx,
input_top_logprobs_val=[],
Expand Down
9 changes: 9 additions & 0 deletions python/tokenspeed/runtime/engine/request_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,18 @@ def handle_generate_request(
if recv_req.bootstrap_port is None:
recv_req.bootstrap_port = self.server_args.disaggregation_bootstrap_port

# Input/prompt-token logprobs are requested only when return_logprob is
# set AND a non-negative logprob_start_len is given; otherwise -1 tells
# the scheduler to skip them (output-only or no logprobs).
logprob_start_len = -1
if recv_req.return_logprob and recv_req.logprob_start_len is not None:
if recv_req.logprob_start_len >= 0:
logprob_start_len = recv_req.logprob_start_len

req_spec = make_spec(
rid=recv_req.rid,
tokens=recv_req.input_ids,
logprob_start_len=logprob_start_len,
)
req_state = RequestState.from_recv_req(
recv_req,
Expand Down
4 changes: 3 additions & 1 deletion python/tokenspeed/runtime/engine/scheduler_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@
_TRUTHY_ENV_VALUES = {"1", "true", "yes", "on"}


def make_spec(rid: str, tokens: list[int]) -> RequestSpec:
def make_spec(rid: str, tokens: list[int], logprob_start_len: int = -1) -> RequestSpec:
    """Create a ``RequestSpec`` and fill in its id, tokens and logprob window start.

    Args:
        rid: stored as ``spec.request_id``.
        tokens: stored as ``spec.tokens`` (the same list object, not a copy).
        logprob_start_len: stored as ``spec.logprob_start_len``; the default of
            -1 marks input/prompt-token logprobs as not requested.

    Returns:
        The populated ``RequestSpec``.
    """
    spec = RequestSpec()
    spec.request_id = rid
    spec.tokens = tokens
    # -1 means input/prompt-token logprobs are not requested.
    spec.logprob_start_len = logprob_start_len
    return spec


Expand Down
Loading
Loading