From 33b9143b6a484fe4d9bfa3dd46460857759bb270 Mon Sep 17 00:00:00 2001
From: Chris Busillo <info@shinycomputers.com>
Date: Wed, 13 May 2026 17:24:00 -0400
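Subject: [PATCH] Address image harness review findings

harness.py:
- FakeResponsesServer: replace _next_response() with
  _record_request_and_next_response(), which appends the request and
  picks the scripted response inside one lock acquisition. The response
  index now comes from a dedicated _next_response_index counter rather
  than len(self.requests), which was read in a separate lock
  acquisition from the append.
- assert_expectations: add an optional "scope" key to responses
  assertions ("body" by default, or "input" to check only the request
  body's "input" field). An unsupported scope is reported as an
  expectation failure.
- assert_expectations: add a "contains_all" list assertion; each
  missing needle is reported as its own failure.
- run_scenario: run_with_env() now returns each command as an argument
  list. summarize() receives the last command's argument list, and
  summary["scenario_commands"] still stores space-joined strings.

scenarios/image-history-replay.json:
- Scope both request assertions to "input" and additionally require
  "65536 bytes" in the second request.
- Reindent the file from two to four spaces (no other content change).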

---
 tools/code-exec-harness/harness.py            |  55 ++++++---
 .../scenarios/image-history-replay.json       | 107 +++++++++---------
 2 files changed, 96 insertions(+), 66 deletions(-)

diff --git a/tools/code-exec-harness/harness.py b/tools/code-exec-harness/harness.py
index e9e0cef3a5ef..df83df24d7c3 100644
--- a/tools/code-exec-harness/harness.py
+++ b/tools/code-exec-harness/harness.py
@@ -42,6 +42,7 @@ def __init__(self, fixture: dict[str, Any], artifacts: Path) -> None:
         self.artifacts = artifacts
         self.requests: list[dict[str, Any]] = []
         self._responses = list(fixture.get("responses", []))
+        self._next_response_index = 0
         self._lock = threading.Lock()
         self._httpd: socketserver.ThreadingTCPServer | None = None
         self._thread: threading.Thread | None = None
@@ -75,9 +76,11 @@ def __exit__(self, _exc_type: object, _exc: object, _tb: object) -> None:
             self._thread.join(timeout=5)
         self.write_artifacts()
 
-    def _next_response(self) -> dict[str, Any]:
+    def _record_request_and_next_response(self, request: dict[str, Any]) -> dict[str, Any]:
         with self._lock:
-            index = len(self.requests)
+            index = self._next_response_index
+            self._next_response_index += 1
+            self.requests.append(request)
             if index >= len(self._responses):
                 return {"status": 500, "body": {"error": f"missing fake response for request {index + 1}"}}
             return self._responses[index]
@@ -90,9 +93,9 @@ def _handle_post(self, handler: http.server.BaseHTTPRequestHandler) -> None:
             body = json.loads(raw_body) if raw_body else None
         except json.JSONDecodeError:
             body = {"_invalid_json": raw_body}
-        response = self._next_response()
-        with self._lock:
-            self.requests.append({"method": "POST", "path": parsed.path, "body": body})
+        response = self._record_request_and_next_response(
+            {"method": "POST", "path": parsed.path, "body": body}
+        )
 
         if not parsed.path.endswith("/responses"):
             self._send_json(handler, 404, {"error": f"unexpected path {parsed.path}"})
@@ -739,6 +742,16 @@ def request_at(summary: dict[str, Any], index: int) -> dict[str, Any]:
     return request
 
 
+def request_assertion_target(request: dict[str, Any], assertion: dict[str, Any]) -> Any:
+    body = request.get("body")
+    scope = str(assertion.get("scope", "body"))
+    if scope == "body":
+        return body
+    if scope == "input":
+        return body.get("input") if isinstance(body, dict) else None
+    raise HarnessError(f"unsupported responses assertion scope: {scope}")
+
+
 def assert_expectations(summary: dict[str, Any], scenario: dict[str, Any]) -> list[str]:
     failures: list[str] = []
     expect = scenario.get("expect", {})
@@ -769,19 +782,28 @@ def assert_expectations(summary: dict[str, Any], scenario: dict[str, Any]) -> li
         except HarnessError as exc:
             failures.append(str(exc))
             continue
-        body = request.get("body")
+        try:
+            target = request_assertion_target(request, assertion)
+        except HarnessError as exc:
+            failures.append(str(exc))
+            continue
         if "image_payload_bytes" in assertion:
-            actual = data_image_payload_bytes(body)
+            actual = data_image_payload_bytes(target)
             expected = int(assertion["image_payload_bytes"])
             if actual != expected:
                 failures.append(
                     f"responses request {assertion.get('request', 0)} image payload bytes expected {expected}, got {actual}"
                 )
-        if "contains" in assertion and not contains_text(body, str(assertion["contains"])):
+        if "contains" in assertion and not contains_text(target, str(assertion["contains"])):
             failures.append(
                 f"responses request {assertion.get('request', 0)} did not contain {assertion['contains']!r}"
             )
-        if "not_contains" in assertion and contains_text(body, str(assertion["not_contains"])):
+        for needle in assertion.get("contains_all", []):
+            if not contains_text(target, str(needle)):
+                failures.append(
+                    f"responses request {assertion.get('request', 0)} did not contain {needle!r}"
+                )
+        if "not_contains" in assertion and contains_text(target, str(assertion["not_contains"])):
             failures.append(
                 f"responses request {assertion.get('request', 0)} unexpectedly contained {assertion['not_contains']!r}"
             )
@@ -830,7 +852,9 @@ def run_scenario(path: Path, args: argparse.Namespace) -> int:
         else None
     )
 
-    def run_with_env(fake_server: FakeResponsesServer | None) -> tuple[int, list[dict[str, Any]], list[str]]:
+    def run_with_env(
+        fake_server: FakeResponsesServer | None,
+    ) -> tuple[int, list[dict[str, Any]], list[list[str]]]:
         run_env = env.copy()
         if fake_server is not None:
             run_env["OPENAI_BASE_URL"] = fake_server.base_url
@@ -840,13 +864,13 @@ def run_with_env(fake_server: FakeResponsesServer | None) -> tuple[int, list[dic
         turn_prompts = scenario.get("turns")
         if isinstance(turn_prompts, list) and turn_prompts:
             all_events: list[dict[str, Any]] = []
-            commands: list[str] = []
+            commands: list[list[str]] = []
             session_id: str | None = None
             last_returncode = 0
             for index, turn in enumerate(turn_prompts, start=1):
                 prompt = str(turn.get("prompt", "") if isinstance(turn, dict) else turn)
                 command = build_command_for_prompt(scenario, args, paths, prompt, session_id)
-                commands.append(" ".join(command))
+                commands.append(command)
                 returncode, events = run_exec_capture(command, scenario, paths, run_env, f"turn-{index}")
                 all_events.extend(events)
                 last_returncode = returncode
@@ -860,7 +884,7 @@ def run_with_env(fake_server: FakeResponsesServer | None) -> tuple[int, list[dic
 
         command = build_command(scenario, args, paths)
         returncode, events = run_exec_capture(command, scenario, paths, run_env, "turn-1")
-        return returncode, events, [" ".join(command)]
+        return returncode, events, [command]
 
     put_json(paths.artifacts / "manifest.json", {
         "scenario": str(path),
@@ -879,9 +903,10 @@ def run_with_env(fake_server: FakeResponsesServer | None) -> tuple[int, list[dic
             returncode, events, commands = run_with_env(fake_server)
             responses_requests = list(fake_server.requests)
 
-    summary = summarize(events, paths, returncode, commands)
+    summary_command: list[str] = commands[-1] if commands else []
+    summary = summarize(events, paths, returncode, summary_command)
     summary["commands"] = summary.get("commands", [])
-    summary["scenario_commands"] = commands
+    summary["scenario_commands"] = [" ".join(command) for command in commands]
     summary["responses_requests"] = responses_requests
     failures = assert_expectations(summary, scenario)
     summary["expectation_failures"] = failures
diff --git a/tools/code-exec-harness/scenarios/image-history-replay.json b/tools/code-exec-harness/scenarios/image-history-replay.json
index 2a5be6124f99..d6fc49e9bb7b 100644
--- a/tools/code-exec-harness/scenarios/image-history-replay.json
+++ b/tools/code-exec-harness/scenarios/image-history-replay.json
@@ -1,56 +1,61 @@
 {
-  "name": "image-history-replay",
-  "model": "gpt-5.1-codex",
-  "files": {
-    "README.md": "# Image replay harness fixture\n"
-  },
-  "turns": [
-    {
-      "prompt": "Generate an image fixture. Do not run tools."
+    "name": "image-history-replay",
+    "model": "gpt-5.1-codex",
+    "files": {
+        "README.md": "# Image replay harness fixture\n"
     },
-    {
-      "prompt": "Continue without needing the previous image bytes."
-    }
-  ],
-  "responses_api": {
-    "responses": [
-      {
-        "response_id": "resp-image",
-        "events": [
-          {
-            "item": {
-              "type": "image_generation_call",
-              "id": "ig_1",
-              "status": "completed",
-              "result": {
-                "$repeat": "A",
-                "count": 65536
-              }
+    "turns": [
+        {
+            "prompt": "Generate an image fixture. Do not run tools."
+        },
+        {
+            "prompt": "Continue without needing the previous image bytes."
+        }
+    ],
+    "responses_api": {
+        "responses": [
+            {
+                "response_id": "resp-image",
+                "events": [
+                    {
+                        "item": {
+                            "type": "image_generation_call",
+                            "id": "ig_1",
+                            "status": "completed",
+                            "result": {
+                                "$repeat": "A",
+                                "count": 65536
+                            }
+                        }
+                    }
+                ]
+            },
+            {
+                "response_id": "resp-next"
             }
-          }
         ]
-      },
-      {
-        "response_id": "resp-next"
-      }
-    ]
-  },
-  "expect": {
-    "returncode": 0,
-    "responses_request_count": 2,
-    "responses": [
-      {
-        "request": 0,
-        "image_payload_bytes": 0
-      },
-      {
-        "request": 1,
-        "image_payload_bytes": 0,
-        "contains": "image generation result omitted",
-        "not_contains": "data:image/"
-      }
-    ]
-  },
-  "max_seconds": 30,
-  "timeout_seconds": 90
+    },
+    "expect": {
+        "returncode": 0,
+        "responses_request_count": 2,
+        "responses": [
+            {
+                "request": 0,
+                "scope": "input",
+                "image_payload_bytes": 0
+            },
+            {
+                "request": 1,
+                "scope": "input",
+                "image_payload_bytes": 0,
+                "contains": "image generation result omitted",
+                "contains_all": [
+                    "65536 bytes"
+                ],
+                "not_contains": "data:image/"
+            }
+        ]
+    },
+    "max_seconds": 30,
+    "timeout_seconds": 90
 }