Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pkg/templates/python/openai-computer-use/.env.example
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# Copy this file to .env and fill in your API key
# Copy this file to .env and fill in your API keys
OPENAI_API_KEY=your_openai_api_key_here
KERNEL_API_KEY=your_kernel_api_key_here
23 changes: 20 additions & 3 deletions pkg/templates/python/openai-computer-use/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,24 @@
# Kernel Python Sample App - OpenAI Computer Use

This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI.
This is a Kernel application that demonstrates using the Computer Use Agent (CUA) from OpenAI with Kernel's native browser control API.

It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation.
It uses Kernel's computer control endpoints (screenshot, click, type, scroll, batch, etc.) instead of Playwright, and includes a `batch_computer_actions` tool that executes multiple actions in a single API call for lower latency.

See the [docs](https://www.kernel.sh/docs/quickstart) for more information.
## Local testing

You can test against a remote Kernel browser without deploying:

```bash
cp .env.example .env
# Fill in OPENAI_API_KEY and KERNEL_API_KEY in .env
uv run test_local.py
```

## Deploy to Kernel

```bash
kernel deploy main.py --env-file .env
kernel invoke python-openai-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles"}'
```

See the [docs](https://www.kernel.sh/docs/quickstart) for more information.
122 changes: 105 additions & 17 deletions pkg/templates/python/openai-computer-use/agent/agent.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,82 @@
from computers import Computer
import json
from typing import Callable
from computers.kernel_computer import KernelComputer
from utils import (
create_response,
show_image,
pp,
sanitize_message,
check_blocklisted_url,
)
import json
from typing import Callable

# Name of the function tool the model calls to run several computer actions in
# one request. It is used as the tool's "name" in BATCH_TOOL and is what
# handle_item compares incoming function-call names against.
BATCH_FUNC_NAME = "batch_computer_actions"

class Agent:
"""
A sample agent class that can be used to interact with a computer.
# Prompt text passed as `instructions` on the model request made in
# run_full_turn. It tells the model when to prefer the batch tool over
# single computer actions. This is runtime text: edit with care.
BATCH_INSTRUCTIONS = """You have two ways to perform actions:
1. The standard computer tool — use for single actions when you need screenshot feedback after each step.
2. batch_computer_actions — use to execute multiple actions at once when you can predict the outcome.

ALWAYS prefer batch_computer_actions when performing predictable sequences like:
- Clicking a text field, typing text, and pressing Enter
- Typing a URL and pressing Enter
- Any sequence where you don't need to see intermediate results"""

# Function-tool definition advertised to the model next to the built-in
# computer tool. The model submits an ordered list of actions; they are run
# back-to-back and a single screenshot is returned afterwards.
BATCH_TOOL = {
    "type": "function",
    "name": BATCH_FUNC_NAME,
    "description": (
        "Execute multiple computer actions in sequence without waiting for "
        "screenshots between them. Use this when you can predict the outcome of a "
        "sequence of actions without needing intermediate visual feedback. After all "
        "actions execute, a single screenshot is taken and returned.\n\n"
        "PREFER this over individual computer actions when:\n"
        "- Typing text followed by pressing Enter\n"
        "- Clicking a field and then typing into it\n"
        "- Any sequence where intermediate screenshots are not needed"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "actions": {
                "type": "array",
                "description": "Ordered list of actions to execute",
                "items": {
                    "type": "object",
                    # Only "type" is required; which of the remaining fields
                    # apply depends on the action type.
                    "properties": {
                        "type": {
                            "type": "string",
                            "enum": [
                                "click",
                                "double_click",
                                "type",
                                "keypress",
                                "scroll",
                                "move",
                                "drag",
                                "wait",
                            ],
                        },
                        "x": {"type": "number"},
                        "y": {"type": "number"},
                        "text": {"type": "string"},
                        "keys": {"type": "array", "items": {"type": "string"}},
                        "button": {"type": "string"},
                        "scroll_x": {"type": "number"},
                        "scroll_y": {"type": "number"},
                        # "drag" is offered in the enum above, so its argument
                        # has to be described too. The shape mirrors
                        # Computer.drag(path: List[Dict[str, int]]).
                        # NOTE(review): assumes batch_actions accepts the same
                        # path shape as drag() -- confirm against KernelComputer.
                        "path": {
                            "type": "array",
                            "description": "Ordered drag waypoints as {x, y} points",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "x": {"type": "number"},
                                    "y": {"type": "number"},
                                },
                                "required": ["x", "y"],
                            },
                        },
                    },
                    "required": ["type"],
                },
            },
        },
        "required": ["actions"],
    },
    # Non-strict: actions carry different optional fields per type, so the
    # schema cannot mark every property as required.
    "strict": False,
}

(See simple_cua_loop.py for a simple example without an agent.)
"""

class Agent:
"""An agent that uses OpenAI CUA with Kernel's native computer control API."""

def __init__(
self,
model="computer-use-preview",
computer: Computer = None,
computer: KernelComputer = None,
tools: list[dict] = [],
acknowledge_safety_check_callback: Callable = lambda message: False,
):
self.model = model
self.computer = computer
self.tools = tools
self.tools = list(tools)
self.print_steps = True
self.debug = False
self.show_images = False
Expand All @@ -41,6 +91,7 @@ def __init__(
"display_height": dimensions[1],
"environment": computer.get_environment(),
},
BATCH_TOOL,
{
"type": "function",
"name": "back",
Expand Down Expand Up @@ -75,6 +126,28 @@ def debug_print(self, *args):
if self.debug:
pp(*args)

def _execute_computer_action(self, action_type, action_args):
    """Run one computer-tool action on ``self.computer``.

    Args:
        action_type: Action name taken from the model's computer call.
        action_args: Keyword arguments forwarded unchanged to the matching
            method on ``self.computer``.

    A ``"screenshot"`` action is a no-op here. Any action type outside the
    supported set is reported with a printed warning and otherwise ignored.
    """
    # Nothing to execute for a screenshot request.
    if action_type == "screenshot":
        return

    supported = (
        "click",
        "double_click",
        "type",
        "keypress",
        "scroll",
        "move",
        "drag",
        "wait",
    )
    if action_type not in supported:
        print(f"Warning: unknown action type: {action_type}")
        return

    # Each supported action maps to the same-named method on the computer.
    handler = getattr(self.computer, action_type)
    handler(**action_args)

def handle_item(self, item):
"""Handle each item; may cause a computer action + screenshot."""
if item["type"] == "message":
Expand All @@ -86,14 +159,17 @@ def handle_item(self, item):
if self.print_steps:
print(f"{name}({args})")

if hasattr(self.computer, name): # if function exists on computer, call it
if name == BATCH_FUNC_NAME:
return self._handle_batch_call(item["call_id"], args)

if hasattr(self.computer, name):
method = getattr(self.computer, name)
method(**args)
return [
{
"type": "function_call_output",
"call_id": item["call_id"],
"output": "success", # hard-coded output for demo
"output": "success",
}
]

Expand All @@ -104,14 +180,12 @@ def handle_item(self, item):
if self.print_steps:
print(f"{action_type}({action_args})")

method = getattr(self.computer, action_type)
method(**action_args)
self._execute_computer_action(action_type, action_args)

screenshot_base64 = self.computer.screenshot()
if self.show_images:
show_image(screenshot_base64)

# if user doesn't ack all safety checks exit with error
pending_checks = item.get("pending_safety_checks", [])
for check in pending_checks:
message = check["message"]
Expand All @@ -130,7 +204,6 @@ def handle_item(self, item):
},
}

# additional URL safety checks for browser environments
if self.computer.get_environment() == "browser":
current_url = self.computer.get_current_url()
check_blocklisted_url(current_url)
Expand All @@ -139,6 +212,21 @@ def handle_item(self, item):
return [call_output]
return []

def _handle_batch_call(self, call_id, args):
    """Execute a ``batch_computer_actions`` call and build its tool output.

    Args:
        call_id: ``call_id`` of the function call being answered.
        args: Parsed function arguments; ``args["actions"]`` is the ordered
            list of actions (a missing key is treated as an empty list).

    Returns:
        A one-element list holding the ``function_call_output`` item. Its
        ``output`` is a JSON string with a status text entry and the
        post-batch screenshot as a PNG data URL.
    """
    actions = args.get("actions", [])
    self.computer.batch_actions(actions)

    screenshot_base64 = self.computer.screenshot()
    # Mirror the single-action path in handle_item: honor show_images here too.
    if self.show_images:
        show_image(screenshot_base64)

    # A batch can navigate (e.g. type a URL and press Enter), so apply the
    # same blocklist check that handle_item runs after a single computer call.
    if self.computer.get_environment() == "browser":
        check_blocklisted_url(self.computer.get_current_url())

    # NOTE(review): the screenshot travels inside a JSON-encoded string;
    # confirm the model actually consumes it as an image in this form.
    payload = [
        {"type": "text", "text": "Actions executed successfully."},
        {"type": "image_url", "image_url": f"data:image/png;base64,{screenshot_base64}"},
    ]
    return [
        {
            "type": "function_call_output",
            "call_id": call_id,
            "output": json.dumps(payload),
        }
    ]

def run_full_turn(
self, input_items, print_steps=True, debug=False, show_images=False
):
Expand All @@ -147,7 +235,6 @@ def run_full_turn(
self.show_images = show_images
new_items = []

# keep looping until we get a final response
while new_items[-1].get("role") != "assistant" if new_items else True:
self.debug_print([sanitize_message(msg) for msg in input_items + new_items])

Expand All @@ -156,6 +243,7 @@ def run_full_turn(
input=input_items + new_items,
tools=self.tools,
truncation="auto",
instructions=BATCH_INSTRUCTIONS,
)
self.debug_print(response)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
from . import default
from . import contrib
from .kernel_computer import KernelComputer
from .computer import Computer
from .config import computers_config

__all__ = [
"default",
"contrib",
"KernelComputer",
"Computer",
"computers_config",
]
14 changes: 11 additions & 3 deletions pkg/templates/python/openai-computer-use/computers/computer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import Protocol, List, Literal, Dict
from typing import Protocol, List, Literal, Dict, Any


class Computer(Protocol):
"""Defines the 'shape' (methods/properties) our loop expects."""
"""Defines the shape (methods/properties) the agent loop expects."""

def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: ...

Expand All @@ -26,4 +26,12 @@ def keypress(self, keys: List[str]) -> None: ...

def drag(self, path: List[Dict[str, int]]) -> None: ...

def get_current_url() -> str: ...
def batch_actions(self, actions: List[Dict[str, Any]]) -> None: ...

def goto(self, url: str) -> None: ...

def back(self) -> None: ...

def forward(self) -> None: ...

def get_current_url(self) -> str: ...
6 changes: 2 additions & 4 deletions pkg/templates/python/openai-computer-use/computers/config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from .default import *
from .contrib import *
from .kernel_computer import KernelComputer

computers_config = {
"local-playwright": LocalPlaywrightBrowser,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated config.py is now dead code

Low Severity

config.py was updated in this PR to reference KernelComputer, but computers/__init__.py no longer imports or exports computers_config. No other file references it either, making this entire file dead code.

Fix in Cursor Fix in Web

"kernel": KernelPlaywrightBrowser,
"kernel": KernelComputer,
}

This file was deleted.

This file was deleted.

Loading