lightseekorg · HJSang · Jun 1, 2026 · chatgpt-codex-connector · May 30, 2026
diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts
@@ -37,7 +37,8 @@ export default defineConfig({
         text: "Guides",
         items: [
           { text: "Getting Started", link: "/guides/getting-started" },
-          { text: "Launching a Server", link: "/guides/launching" }
+          { text: "Launching a Server", link: "/guides/launching" },
+          { text: "Computing Log-Probabilities", link: "/guides/compute-log-probs" }
         ]
       },
       {

diff --git a/docs/guides/compute-log-probs.md b/docs/guides/compute-log-probs.md
@@ -0,0 +1,68 @@
+# Computing Log-Probabilities (RL Scoring)
+
+`Engine.compute_log_probs` scores `prompt + completion` token sequences under the
+engine's current weights and returns one log-probability per completion token. It
+is the core scoring primitive for online-RL trainers (PPO, GRPO, and any
+KL-penalised objective) — for example to form importance-sampling ratios against
+the policy that generated the rollouts.
+
+## Usage
+
+```python
+from tokenspeed.runtime.entrypoints.engine import Engine
+
+# Scoring runs a pure-extend (prefill-only) forward. On backends that cannot
+# serve a mixed prefill+decode batch eagerly (e.g. the default `mha` backend),
+# launch the engine for scoring with a backend + scheduler config that keeps the
+# request on a pure-extend path:
+engine = Engine(
+    model="<model-path>",
+    attention_backend="flashinfer",
+    enforce_eager=True,
+    disable_overlap_schedule=True,
+)
+
+out = engine.compute_log_probs(
+    sequences=[
+        {"prompt_token_ids": [1, 2, 3, 4], "completion_token_ids": [5, 6, 7]},
+        {"prompt_token_ids": [10, 11],     "completion_token_ids": [12]},
+    ],
+    temperature=1.0,
+)
+
+# out["log_probs"][i][j] == log P(completion_token_ids[i][j] | context)
+# out["tokens"][i]       == completion_token_ids[i]
+out["log_probs"]  # e.g. [[-0.12, -0.47, -0.31], [-2.03]]
+out["tokens"]     # [[5, 6, 7], [12]]
+```
+
+`log_probs[i][j]` is the log-probability of the realised completion token `j` in
+sequence `i`, conditioned on everything before it (prompt + earlier completion
+tokens). Only completion positions are scored; the prompt is context.
+
+## How it works
+
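+It reuses the normal generation path: internally each sequence is sent through a
+forward-only `generate` call (`max_new_tokens=0`, `return_logprob=True`,
+`logprob_start_len=len(prompt) - 1`), and the per-token input logprobs are read
+back from `meta_info["input_token_logprobs"]`. The window starts one token before
+the completion because the log-probability of a token is read from the logits at
+the preceding position; the engine returns the completion logprobs followed by
+one trailing entry, which is dropped. Logits are gathered across tensor-parallel
+ranks before `log_softmax`, exactly as on the sampling path. No engine pause is
+required; scoring requests can be interleaved with normal generation.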
+
+Long sequences are handled across chunked prefill: when a `prompt + completion`
+is split into multiple prefill chunks, the input-logprob window is collected from
+every chunk it overlaps (not just the first), so the full set of completion
+logprobs is returned regardless of `chunked_prefill_size`.
+
+## Limits (current)
+
+- **Temperature:** `temperature=1.0` only (raw `log_softmax`). Other values raise
+  `NotImplementedError`. Sampling-temperature scaling (for off-policy importance
+  sampling) is a planned follow-up.
+- **Speculative decoding:** unavailable — `compute_log_probs` raises if the engine
+  was launched with a speculative algorithm (the generation path disables logprobs
+  in that mode).
+- **Prompt/completion:** both must be non-empty (the first completion token needs
+  prior context to be scored).
+- **Surface:** exposed as the `Engine` Python method. A native HTTP / SMG endpoint
+  is deferred until there is a consumer for it.
@@ -0,0 +1,142 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Pure, GPU-free helpers for the compute_log_probs API (RL-plan Milestone 2).
+
+The engine scores ``prompt + completion`` sequences by reusing the normal
+generation path: a forward-only ``generate`` call with ``return_logprob=True``
+and ``logprob_start_len=len(prompt) - 1`` makes
+``meta_info['input_token_logprobs']`` carry the per-completion-token logprobs
+followed by one trailing sampled-position entry. These helpers build that call
+and parse its result (dropping the trailing entry); ``Engine.compute_log_probs``
+wires them to ``self.generate``.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Callable
+
+DEFAULT_TEMPERATURE = 1.0
+# max_new_tokens used for the forward-only scoring call. Set to 1 if the GPU
+# spike shows max_new_tokens=0 is unsupported: the single generated token then
+# lands in output_token_logprobs, never input_token_logprobs.
+SCORE_MAX_NEW_TOKENS = 0
+
+
+class InvalidSequenceError(ValueError):
+    """Signals an unscorable sequence: its prompt or its completion is empty."""
+
+
+def validate_sequence(
+    prompt_token_ids: list[int], completion_token_ids: list[int]
+) -> None:
+    """Reject a sequence that cannot be scored.
+
+    The prompt is checked before the completion, so an input with both parts
+    empty reports the prompt problem.
+
+    Raises:
+        InvalidSequenceError: if ``prompt_token_ids`` or
+            ``completion_token_ids`` is empty.
+    """
+    problem = None
+    if not prompt_token_ids:
+        problem = (
+            "prompt_token_ids must be non-empty: the first completion token needs "
+            "prior context to be scored."
+        )
+    elif not completion_token_ids:
+        problem = "completion_token_ids must be non-empty: nothing to score."
+
+    if problem is not None:
+        raise InvalidSequenceError(problem)
+
+
+def build_score_kwargs(
+    prompt_token_ids: list[int],
+    completion_token_ids: list[int],
+    temperature: float = DEFAULT_TEMPERATURE,
+) -> dict[str, Any]:
+    """Assemble the kwargs for an internal forward-only ``Engine.generate`` call.
+
+    The sequence is validated first, then ``temperature`` is required to be
+    strictly positive. The stricter ``temperature == 1.0`` gate of the v1 core
+    path lives in ``compute_log_probs_core``; this standalone helper
+    intentionally accepts any positive value.
+
+    Raises:
+        InvalidSequenceError: if the prompt or the completion is empty.
+        ValueError: if ``temperature`` is not greater than zero.
+    """
+    validate_sequence(prompt_token_ids, completion_token_ids)
+    if temperature <= 0:
+        raise ValueError(f"temperature must be > 0, got {temperature}")
+
+    # Prompt followed by completion, as fresh list so the caller's are untouched.
+    scored_ids = [*prompt_token_ids, *completion_token_ids]
+    sampling_params = {
+        "max_new_tokens": SCORE_MAX_NEW_TOKENS,
+        "temperature": temperature,
+    }
+    # A completion token's logprob comes from the logits at the position just
+    # before it, so the window opens one token ahead of the completion. From
+    # there the engine reports one entry per position up to the end: the M
+    # completion logprobs plus a single trailing sampled-position entry
+    # (target token -1), which extract_completion_logprobs discards.
+    # (Verified on B200.)
+    window_start = len(prompt_token_ids) - 1
+    return {
+        "input_ids": scored_ids,
+        "sampling_params": sampling_params,
+        "return_logprob": True,
+        "logprob_start_len": window_start,
+    }
+
+
+def extract_completion_logprobs(
+    meta_info: dict[str, Any], num_completion: int
+) -> tuple[list[float], list[int]]:
+    """Turn ``meta_info['input_token_logprobs']`` into ``(log_probs, tokens)``.
+
+    Entries are ``(logprob, token_id, text_or_None)`` tuples. With the window
+    opened at ``logprob_start_len = len(prompt) - 1`` the engine reports the M
+    completion logprobs and then one trailing sampled-position entry; only the
+    leading ``num_completion`` entries are kept. A shorter (or missing) list
+    means the window was misaligned or input logprobs were never produced, and
+    that is reported as an error instead of returning a misaligned result.
+
+    Raises:
+        ValueError: if the entry list is missing, empty, or shorter than
+            ``num_completion``.
+    """
+    window = meta_info.get("input_token_logprobs")
+    available = 0 if window is None else len(window)
+    if not window or available < num_completion:
+        raise ValueError(
+            f"expected at least {num_completion} completion logprobs, got {available}; "
+            "check logprob_start_len alignment / input-logprob support."
+        )
+
+    kept = window[:num_completion]
+    # Convert all logprobs first, then all token ids (two separate passes).
+    log_probs = list(map(float, (entry[0] for entry in kept)))
+    tokens = list(map(int, (entry[1] for entry in kept)))
+    return log_probs, tokens
+
+
+def compute_log_probs_core(
+    sequences: list[dict[str, list[int]]],
+    generate_fn: Callable[..., dict[str, Any]],
+    temperature: float = DEFAULT_TEMPERATURE,
+) -> dict[str, list[list[float]] | list[list[int]]]:
+    """Score each sequence by calling ``generate_fn`` and parsing the result.
+
+    ``generate_fn`` must have the signature of ``Engine.generate`` and return a
+    single result dict (non-streaming) carrying ``meta_info``. v1 supports only
+    ``temperature == 1.0`` (raw log_softmax), matching the engine's default
+    ``temp_scaled_logprobs=False`` path.
+
+    Args:
+        sequences: one dict per sequence with the keys ``"prompt_token_ids"``
+            and ``"completion_token_ids"``.
+        generate_fn: callable invoked once per sequence with the kwargs built
+            by ``build_score_kwargs``.
+        temperature: must equal ``DEFAULT_TEMPERATURE`` in v1.
+
+    Returns:
+        ``{"log_probs": ..., "tokens": ...}`` where ``log_probs[i]`` is the list
+        of float logprobs and ``tokens[i]`` the list of int token ids for
+        sequence ``i``, in input order.
+
+    Raises:
+        NotImplementedError: if ``temperature`` differs from 1.0. This is
+            checked before any sequence is scored.
+        InvalidSequenceError: if a prompt or completion is empty.
+        ValueError: if the engine result carries fewer input logprobs than
+            completion tokens.
+        KeyError: if a sequence dict lacks one of the two required keys, or the
+            result lacks ``"meta_info"``.
+    """
+    if temperature != DEFAULT_TEMPERATURE:
+        raise NotImplementedError(
+            "compute_log_probs v1 supports temperature=1.0 (raw log_softmax) only; "
+            f"got {temperature}. Sampling-temperature scaling is a follow-up."
+        )
+
+    log_probs_out: list[list[float]] = []
+    tokens_out: list[list[int]] = []
+    for seq in sequences:
+        prompt_ids = seq["prompt_token_ids"]
+        completion_ids = seq["completion_token_ids"]
+        kwargs = build_score_kwargs(prompt_ids, completion_ids, temperature)
+        result = generate_fn(**kwargs)
+        # Keep exactly one logprob per completion token; the helper raises if
+        # the engine returned fewer than that.
+        log_probs, tokens = extract_completion_logprobs(
+            result["meta_info"], len(completion_ids)
+        )
+        log_probs_out.append(log_probs)
+        tokens_out.append(tokens)
+    return {"log_probs": log_probs_out, "tokens": tokens_out}
@@ -107,6 +107,12 @@ def __init__(
         self.output_token_logprobs_idx: list[int] | None = (
             [] if return_logprob else None
         )
+        # Input/prompt-token logprobs (accumulated over prefill chunks when requested).
+        self.input_token_logprobs_val: list[float] | None = (
+            [] if return_logprob else None
+        )
+        self.input_token_logprobs_idx: list[int] | None = [] if return_logprob else None
+        self.input_token_logprobs_sent: bool = False
 
         # --- Streaming bookkeeping (internal) ---
         self._surr_offset: int | None = None
@@ -521,6 +527,20 @@ def post_process_forward_op(
             if model_execution_results.output_logprobs is not None
             else None
         )
+        # Input/prompt-token logprobs for this forward (pure-extend scoring
+        # batches only): a flat (values, token_ids) pair over extend requests.
+        _input_logprobs_pair = model_execution_results.input_token_logprobs
+        input_logprobs_val = (
+            _input_logprobs_pair[0].tolist()
+            if _input_logprobs_pair is not None
+            else None
+        )
+        input_logprobs_idx = (
+            _input_logprobs_pair[1].tolist()
+            if _input_logprobs_pair is not None and _input_logprobs_pair[1] is not None
+            else None
+        )
+        ilp_pt = 0
         pt = 0
         for i, rid in enumerate(forward_op.request_ids):
             output_length = model_execution_results.output_lengths[i].item()
@@ -538,12 +558,41 @@ def post_process_forward_op(
             else:
                 pt += output_length
 
+            # Slice this request's input/prompt logprobs and advance the flat
+            # pointer BEFORE any `continue`, so alignment holds across requests.
+            req_input_lp_val = None
+            req_input_lp_idx = None
+            if input_logprobs_val is not None and i < num_extends:
+                sl = int(forward_op.extend_logprob_start_lens[i])
+                if sl >= 0:
+                    plen = int(forward_op.input_lengths[i]) - sl
+                    if plen > 0:
+                        req_input_lp_val = input_logprobs_val[ilp_pt : ilp_pt + plen]
+                        if input_logprobs_idx is not None:
+                            req_input_lp_idx = input_logprobs_idx[
+                                ilp_pt : ilp_pt + plen
+                            ]
+                        ilp_pt += plen
+
             if rid not in self.rid_to_state:
                 # means it's delayed token, do not process
                 continue
 
             request_state: RequestState = self.rid_to_state[rid]
 
+            # Accumulate input/prompt logprobs BEFORE the chunked-prefill guard
+            # below: when a scored sequence spans multiple prefill chunks, each
+            # non-final chunk contributes part of the requested window. Skipping
+            # this on chunk boundaries would drop those tokens and leave
+            # compute_log_probs with fewer logprobs than completion tokens.
+            if (
+                req_input_lp_val is not None
+                and request_state.input_token_logprobs_val is not None
+            ):
+                request_state.input_token_logprobs_val.extend(req_input_lp_val)
+                if req_input_lp_idx is not None:
+                    request_state.input_token_logprobs_idx.extend(req_input_lp_idx)
+
             # Do not output chunking result
             if not request_state.prefill_finished:
                 continue
@@ -706,6 +755,8 @@ def stream_output(
         output_extra_infos: list[dict] = []
         output_token_logprobs_val: list[list[float]] = []
         output_token_logprobs_idx: list[list[int]] = []
+        input_token_logprobs_val: list[list[float]] = []
+        input_token_logprobs_idx: list[list[int]] = []
 
         for i, rs in enumerate(output_states):
             # For finished requests, always output (unless already output)
@@ -785,6 +836,21 @@ def stream_output(
                 output_token_logprobs_val.append([])
                 output_token_logprobs_idx.append([])
 
+            # Input/prompt logprobs are produced once at prefill; ship them on
+            # the first output for this request, then mark sent so multi-token
+            # generations don't resend them every stream step.
+            if (
+                rs.return_logprob
+                and rs.input_token_logprobs_val
+                and not rs.input_token_logprobs_sent
+            ):
+                input_token_logprobs_val.append(list(rs.input_token_logprobs_val))
+                input_token_logprobs_idx.append(list(rs.input_token_logprobs_idx))
+                rs.input_token_logprobs_sent = True
+            else:
+                input_token_logprobs_val.append([])
+                input_token_logprobs_idx.append([])
+
         # Don't send empty batch to detokenizer
         if len(rids_to_send) == 0:
             return
@@ -804,8 +870,8 @@ def stream_output(
             completion_tokens=completion_tokens,
             cached_tokens=cached_tokens,
             spec_verify_ct=spec_verify_ct,
-            input_token_logprobs_val=[],
-            input_token_logprobs_idx=[],
+            input_token_logprobs_val=input_token_logprobs_val,
+            input_token_logprobs_idx=input_token_logprobs_idx,
             output_token_logprobs_val=output_token_logprobs_val,
             output_token_logprobs_idx=output_token_logprobs_idx,
             input_top_logprobs_val=[],

@@ -192,9 +192,18 @@ def handle_generate_request(
         if recv_req.bootstrap_port is None:
             recv_req.bootstrap_port = self.server_args.disaggregation_bootstrap_port
 
+        # Input/prompt-token logprobs are requested only when return_logprob is
+        # set AND a non-negative logprob_start_len is given; otherwise -1 tells
+        # the scheduler to skip them (output-only or no logprobs).
+        logprob_start_len = -1
+        if recv_req.return_logprob and recv_req.logprob_start_len is not None:
+            if recv_req.logprob_start_len >= 0:
+                logprob_start_len = recv_req.logprob_start_len
+
         req_spec = make_spec(
             rid=recv_req.rid,
             tokens=recv_req.input_ids,
+            logprob_start_len=logprob_start_len,
         )
         req_state = RequestState.from_recv_req(
             recv_req,

@@ -44,10 +44,12 @@
 _TRUTHY_ENV_VALUES = {"1", "true", "yes", "on"}
 
 
-def make_spec(rid: str, tokens: list[int]) -> RequestSpec:
+def make_spec(rid: str, tokens: list[int], logprob_start_len: int = -1) -> RequestSpec:
     spec = RequestSpec()
     spec.request_id = rid
     spec.tokens = tokens
+    # -1 means input/prompt-token logprobs are not requested.
+    spec.logprob_start_len = logprob_start_len
     return spec