From 06d904b9175e6492801f1b3a335b1f607befa317 Mon Sep 17 00:00:00 2001 From: developer-agent Date: Tue, 16 Jun 2026 07:47:50 -0400 Subject: [PATCH] fix: strip content-length from upstream headers in non-streaming response (OQP-1) JSONResponse re-serialises the upstream JSON body using compact separators, producing a body that differs from Ollama's trailing-newline-padded bytes. Starlette only auto-computes content-length when the header is absent, so passing upstream headers caused the stale (off-by-one) value to win. Add _STRIP_RESPONSE_HEADERS constant and filter content-length and transfer-encoding before building the JSONResponse. Add regression test that asserts the returned content-length matches the re-serialised body, not the upstream value. Co-Authored-By: Claude Sonnet 4.6 --- src/ollama_queue_proxy/proxy.py | 12 +++++- tests/test_proxy.py | 74 +++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/src/ollama_queue_proxy/proxy.py b/src/ollama_queue_proxy/proxy.py index 7ad9c15..2f53a5b 100644 --- a/src/ollama_queue_proxy/proxy.py +++ b/src/ollama_queue_proxy/proxy.py @@ -34,6 +34,12 @@ "transfer-encoding", } +# Headers to strip from upstream before building a non-streaming JSONResponse. +# JSONResponse re-serialises the body (dropping any trailing newline Ollama appends), +# so it must compute content-length itself — passing the upstream value causes an +# off-by-one. transfer-encoding is hop-by-hop and invalid on a buffered response. +_STRIP_RESPONSE_HEADERS = {"content-length", "transfer-encoding"} + def extract_model(body: bytes) -> str | None: """Extract the 'model' field from a JSON request body.""" @@ -244,10 +250,14 @@ async def stream_gen(r=resp): ) else: ct = resp.headers.get("content-type", "") + passthrough_headers = { + k: v for k, v in resp.headers.items() + if k.lower() not in _STRIP_RESPONSE_HEADERS + } return JSONResponse( status_code=resp.status_code, content=resp.json() if ct.startswith("application/json") else None, - headers={**dict(resp.headers), **response_headers}, + headers={**passthrough_headers, **response_headers}, ) except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError) as e: diff --git a/tests/test_proxy.py b/tests/test_proxy.py index 4017033..0b1b0d5 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -3,6 +3,10 @@ from __future__ import annotations import json +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +import pytest from ollama_queue_proxy.proxy import extract_model, _MODEL_MANAGEMENT_PATHS @@ -39,3 +43,73 @@ def test_generate_not_in_management_paths(): def test_chat_not_in_management_paths(): assert "/api/chat" not in _MODEL_MANAGEMENT_PATHS + + +# --------------------------------------------------------------------------- +# OQP-1: Content-Length off-by-one on non-streaming chat completions +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_non_streaming_response_content_length_correct(): + """ + dispatch_request must NOT forward the upstream content-length to JSONResponse. + Ollama appends a trailing newline to non-streaming JSON bodies, so the upstream + content-length is 1 byte longer than the re-serialised response body. + The returned response must carry the correct (re-serialised) content-length. + """ + from fastapi import Request + from ollama_queue_proxy.proxy import dispatch_request + from ollama_queue_proxy.hosts import HostManager, OllamaHost + from tests.conftest import make_config + + payload = {"message": {"role": "assistant", "content": "hi"}, "done": True} + # Ollama appends \n — body is 1 byte longer than the JSON-only serialisation + upstream_body = json.dumps(payload).encode() + b"\n" + upstream_content_length = str(len(upstream_body)) # e.g. "52" + + mock_resp = MagicMock(spec=httpx.Response) + mock_resp.status_code = 200 + mock_resp.headers = httpx.Headers({ + "content-type": "application/json", + "content-length": upstream_content_length, + }) + mock_resp.json.return_value = payload + + mock_client = AsyncMock(spec=httpx.AsyncClient) + mock_client.request = AsyncMock(return_value=mock_resp) + + cfg = make_config() + host = OllamaHost(url="http://ollama-test:11434", name="test") + host.healthy = True + hm = HostManager.__new__(HostManager) + hm.hosts = [host] + + scope = { + "type": "http", + "method": "POST", + "path": "/api/chat", + "query_string": b"", + "headers": [(b"content-type", b"application/json")], + } + request = Request(scope) + request.state.request_id = "test-req" + + response = await dispatch_request( + request=request, + body=json.dumps({"model": "llama3", "messages": []}).encode(), + client_id=None, + config=cfg, + host_manager=hm, + client=mock_client, + ) + + # JSONResponse uses compact separators — match that serialisation to get the correct length + expected_body = json.dumps(payload, ensure_ascii=False, allow_nan=False, + indent=None, separators=(",", ":")).encode("utf-8") + assert response.headers["content-length"] == str(len(expected_body)), ( + f"content-length should be {len(expected_body)} (re-serialised body), " + f"not {upstream_content_length} (upstream body with trailing newline)" + ) + assert response.headers["content-length"] != upstream_content_length, ( + "upstream content-length (with trailing newline) must not bleed into response" + )