Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 191 additions & 28 deletions pdm.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions src/askui/tools/android/agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from PIL import Image

from askui.tools.android.uiautomator_hierarchy import UIElementCollection

ANDROID_KEY = Literal[ # pylint: disable=C0103
"HOME",
"BACK",
Expand Down Expand Up @@ -493,3 +495,10 @@ def pull(self, remote_path: str, local_path: str) -> None:
Pulls a file from the device.
"""
raise NotImplementedError

@abstractmethod
def get_ui_elements(self) -> UIElementCollection:
    """
    Gets the UI elements.

    Abstract hook: this base implementation only raises, so concrete agent OS
    implementations must override it.

    Returns:
        UIElementCollection: The UI elements reported by the implementation.

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError
34 changes: 27 additions & 7 deletions src/askui/tools/android/agent_os_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from askui.models.shared.tool_tags import ToolTags
from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
from askui.tools.android.uiautomator_hierarchy import UIElementCollection
from askui.utils.image_utils import scale_coordinates, scale_image_to_fit


Expand Down Expand Up @@ -36,33 +37,38 @@ def screenshot(self) -> Image.Image:
self._target_resolution,
)

def _scale_coordinates(
    self,
    x: int,
    y: int,
    from_agent: bool = True,
) -> Tuple[int, int]:
    """
    Map a point between the two resolutions handled by this facade.

    The conversion itself is delegated to `scale_coordinates`, called with the
    real screen resolution, the target resolution and `inverse=from_agent`.

    Args:
        x (int): Horizontal coordinate of the point.
        y (int): Vertical coordinate of the point.
        from_agent (bool, optional): Forwarded as `inverse`. The default
            (`True`) is what `tap`, `swipe` and `drag_and_drop` use for
            incoming coordinates; `get_ui_elements` passes `False` for
            coordinates that come from the device.

    Returns:
        Tuple[int, int]: The converted `(x, y)` point.
    """
    if self._real_screen_resolution is None:
        # Resolved lazily from one unscaled screenshot, then reused.
        self._real_screen_resolution = self._agent_os.screenshot().size

    return scale_coordinates(
        (x, y),
        self._real_screen_resolution,
        self._target_resolution,
        inverse=from_agent,
    )

def tap(self, x: int, y: int) -> None:
    """
    Tap at a point after converting it with `_scale_coordinates`.

    Args:
        x (int): Horizontal coordinate as seen by the caller.
        y (int): Vertical coordinate as seen by the caller.
    """
    # Single conversion only: the former `_scale_coordinates_back` helper was
    # renamed, so calling it here as well would fail.
    device_x, device_y = self._scale_coordinates(x, y)
    self._agent_os.tap(device_x, device_y)

def swipe(
    self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
) -> None:
    """
    Swipe between two points after converting both with `_scale_coordinates`.

    Args:
        x1 (int): Horizontal coordinate of the start point.
        y1 (int): Vertical coordinate of the start point.
        x2 (int): Horizontal coordinate of the end point.
        y2 (int): Vertical coordinate of the end point.
        duration_in_ms (int, optional): Forwarded unchanged to the wrapped
            agent OS. Defaults to `1000`.
    """
    # Each endpoint is converted exactly once; the stale calls to the removed
    # `_scale_coordinates_back` helper are gone.
    start = self._scale_coordinates(x1, y1)
    end = self._scale_coordinates(x2, y2)
    self._agent_os.swipe(*start, *end, duration_in_ms)

def drag_and_drop(
    self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
) -> None:
    """
    Drag from one point to another after converting both endpoints.

    Args:
        x1 (int): Horizontal coordinate of the drag start.
        y1 (int): Vertical coordinate of the drag start.
        x2 (int): Horizontal coordinate of the drop position.
        y2 (int): Vertical coordinate of the drop position.
        duration_in_ms (int, optional): Forwarded unchanged to the wrapped
            agent OS. Defaults to `1000`.
    """
    # Each endpoint is converted exactly once; the stale calls to the removed
    # `_scale_coordinates_back` helper are gone.
    start = self._scale_coordinates(x1, y1)
    end = self._scale_coordinates(x2, y2)
    self._agent_os.drag_and_drop(*start, *end, duration_in_ms)

def type(self, text: str) -> None:
Expand Down Expand Up @@ -121,3 +127,17 @@ def push(self, local_path: str, remote_path: str) -> None:

def pull(self, remote_path: str, local_path: str) -> None:
    """Forward `pull` to the wrapped agent OS with both paths unchanged."""
    self._agent_os.pull(remote_path, local_path)

def get_ui_elements(self) -> UIElementCollection:
    """
    Fetch the UI elements from the wrapped agent OS and rescale their centers.

    Every element that reports a center gets it replaced, in place, by the
    result of `_scale_coordinates(..., from_agent=False)`. Elements whose
    center is `None` are left untouched.

    Returns:
        UIElementCollection: The same collection object returned by the
            wrapped agent OS, with centers rewritten.
    """
    collection = self._agent_os.get_ui_elements()
    for ui_element in collection:
        center = ui_element.center
        if center is not None:
            ui_element.set_center(
                self._scale_coordinates(
                    x=center[0],
                    y=center[1],
                    from_agent=False,
                )
            )
    return collection
32 changes: 32 additions & 0 deletions src/askui/tools/android/ppadb_agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
UnknownAndroidDisplay,
)
from askui.tools.android.android_agent_os_error import AndroidAgentOsError
from askui.tools.android.uiautomator_hierarchy import UIElementCollection
from askui.utils.annotated_image import AnnotatedImage


Expand All @@ -34,6 +35,7 @@ class PpadbAgentOs(AndroidAgentOs):
"""

_REPORTER_ROLE_NAME: str = "AndroidAgentOS"
_UIAUTOMATOR_DUMP_PATH: str = "/data/local/tmp/askui_window_dump.xml"

def __init__(
self, reporter: Reporter = NULL_REPORTER, device_identifier: str | int = 0
Expand Down Expand Up @@ -482,3 +484,33 @@ def pull(self, remote_path: str, local_path: str) -> None:
self._REPORTER_ROLE_NAME,
f"pull(remote_path='{remote_path}', local_path='{local_path}')",
)

def get_ui_elements(self) -> UIElementCollection:
    """
    Dump the current screen with `uiautomator dump` and parse the result.

    The hierarchy is written to `_UIAUTOMATOR_DUMP_PATH` on the device, read
    back with `cat`, and handed to `UIElementCollection.build_from_xml_dump`.

    Returns:
        UIElementCollection: Elements parsed from the dump, or an empty
            collection when the file content is empty or whitespace only.

    Raises:
        AndroidAgentOsError: When the dump command's output does not contain
            "dumped" (case-insensitive), i.e. it did not report success.

    Notes:
        `uiautomator dump` is unreliable while the screen is animating
        (transitions, loaders, pulsing highlights, etc.). Retry once the UI
        has settled.
    """
    self._check_if_device_is_selected()
    assert self._device is not None

    dump_response = self.shell(f"uiautomator dump {self._UIAUTOMATOR_DUMP_PATH}")
    dump_reported_success = "dumped" in dump_response.lower()
    if not dump_reported_success:
        msg = f"Failed to dump UI hierarchy: {dump_response}"
        raise AndroidAgentOsError(msg)

    xml_text = self.shell(f"cat {self._UIAUTOMATOR_DUMP_PATH}")
    if xml_text and xml_text.strip():
        return UIElementCollection.build_from_xml_dump(xml_text)
    return UIElementCollection([])
158 changes: 158 additions & 0 deletions src/askui/tools/android/uiautomator_hierarchy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""
Parse UIAutomator hierarchy dump XML from Android (normalized shell output).
"""

from __future__ import annotations

import re
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Iterator, Mapping

# Matches a bare "&" that does not start a recognised XML entity reference:
# the named entities amp/lt/gt/apos/quot, a decimal reference (&#NNN;) or a
# hexadecimal reference (&#xHHH;). Used to escape such ampersands before the
# dump is handed to the XML parser.
_RE_INVALID_AMP = re.compile(r"&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9a-fA-F]+);)")  # noqa: E501
# Bounds look like "[x1,y1][x2,y2]". Coordinates may be negative for views
# that extend past the screen edge, hence the optional minus sign.
_RE_BOUNDS = re.compile(r"\[(-?\d+),(-?\d+)\]\[(-?\d+),(-?\d+)\]")

_XML_START_MARKERS = ("<?xml", "<hierarchy")


@dataclass
class UIElement:
    """
    Parsed UI element from UIAutomator dump.

    The fields mirror the node attributes of the dump (`text`, `resource-id`,
    `content-desc`, `class`, `bounds`, `clickable`, `enabled`, `package`).
    `_center` holds an explicit override installed via `set_center`; it stays
    `None` until then, so merely reading `center` never changes an element's
    equality or `repr`.
    """

    text: str
    resource_id: str
    content_desc: str
    class_name: str
    bounds: str
    clickable: bool
    enabled: bool
    package: str
    _center: tuple[int, int] | None = None

    @property
    def center(self) -> tuple[int, int] | None:
        """Return (x, y) center of bounds, or None if bounds invalid.

        An override installed with `set_center` takes precedence over the
        value derived from `bounds`.
        """
        if self._center is not None:
            return self._center
        match = _RE_BOUNDS.match(self.bounds)
        if match is None:
            return None
        x1, y1, x2, y2 = map(int, match.groups())
        # Deliberately not stored in `_center`: caching there would make two
        # otherwise identical elements compare unequal after one was read.
        return ((x1 + x2) // 2, (y1 + y2) // 2)

    def __str__(self) -> str:
        """Short description for list output, fields joined by " | "."""
        parts: list[str] = [f"clickable={self.clickable}"]
        center = self.center
        if center:
            parts.append(f"center=(x={center[0]}, y={center[1]})")
        if self.text:
            parts.append(f'text="{self.text}"')
        if self.resource_id:
            parts.append(f'resource-id="{self.resource_id}"')
        if self.content_desc:
            parts.append(f'content-desc="{self.content_desc}"')
        if self.class_name:
            # Only the last segment of the fully qualified class name.
            parts.append(f"class={self.class_name.split('.')[-1]}")
        return " | ".join(parts)

    def set_center(self, center: tuple[int, int]) -> None:
        """Override the center reported by `center` (e.g. after rescaling)."""
        self._center = center

    @staticmethod
    def _read_flag(source: Mapping[str, object], key: str, default: bool) -> bool:
        """Read a boolean attribute given as a real bool or as "true"/"false".

        Returns `default` when `key` is absent. Any value that is neither a
        bool nor the text "true" (case-insensitive) yields `False`.
        """
        if key not in source:
            return default
        value = source[key]
        if isinstance(value, bool):
            return value
        return str(value).strip().lower() == "true"

    @classmethod
    def from_xml_attrib(cls, attrib: Mapping[str, str]) -> UIElement | None:
        """Build from XML node attributes, or None if there are no bounds."""
        bounds = attrib.get("bounds", "").strip()
        if not bounds:
            return None
        return cls(
            text=attrib.get("text", ""),
            resource_id=attrib.get("resource-id", ""),
            content_desc=attrib.get("content-desc", ""),
            class_name=attrib.get("class", ""),
            bounds=bounds,
            clickable=cls._read_flag(attrib, "clickable", False),
            enabled=cls._read_flag(attrib, "enabled", True),
            package=attrib.get("package", ""),
        )

    @classmethod
    def from_json(cls, json_content: Mapping[str, str | bool]) -> UIElement:
        """Build a UIElement from a string-keyed mapping (e.g. JSON object).

        `clickable` and `enabled` may be real booleans or the strings
        "true"/"false". Missing keys fall back to empty strings,
        `clickable=False` and `enabled=True`. Unlike `from_xml_attrib`, an
        element is returned even when `bounds` is missing.
        """
        return cls(
            text=str(json_content.get("text", "")),
            resource_id=str(json_content.get("resource-id", "")),
            content_desc=str(json_content.get("content-desc", "")),
            class_name=str(json_content.get("class", "")),
            # Stripped for parity with `from_xml_attrib`, so padded bounds
            # still yield a center.
            bounds=str(json_content.get("bounds", "")).strip(),
            clickable=cls._read_flag(json_content, "clickable", False),
            enabled=cls._read_flag(json_content, "enabled", True),
            package=str(json_content.get("package", "")),
        )


class UIElementCollection:
    """Ordered container of `UIElement` objects backed by a private list."""

    def __init__(self, elements: list[UIElement]) -> None:
        # Shallow copy: later changes to the caller's list do not leak in.
        self._elements = [*elements]

    def get_all(self) -> list[UIElement]:
        """Return a new list holding all elements in order."""
        return self._elements.copy()

    def __iter__(self) -> Iterator[UIElement]:
        return iter(self._elements)

    def __len__(self) -> int:
        return len(self._elements)

    def __str__(self) -> str:
        """Render one line per element, using each element's `str()`."""
        return "\n".join(map(str, self._elements))

    @staticmethod
    def _normalize_dump_string(raw: str) -> str:
        """
        Turn raw shell output into a string the XML parser can accept.

        Steps, in order: strip surrounding whitespace and a leading BOM, drop
        everything before the first XML start marker, drop everything after the
        last `</hierarchy>`, remove control characters other than newline and
        tab, and escape ampersands that do not start a valid entity.
        """
        text = raw.strip().lstrip("\ufeff")

        marker_positions = [
            position
            for position in map(text.find, _XML_START_MARKERS)
            if position >= 0
        ]
        if marker_positions:
            text = text[min(marker_positions) :]

        closing_tag = "</hierarchy>"
        closing_at = text.rfind(closing_tag)
        if closing_at != -1:
            text = text[: closing_at + len(closing_tag)]

        kept = [ch for ch in text if ch in "\n\t" or ord(ch) >= 32]
        return _RE_INVALID_AMP.sub("&amp;", "".join(kept))

    @staticmethod
    def build_from_xml_dump(xml_content: str) -> UIElementCollection:
        """
        Build a UIElementCollection from a UIAutomator dump XML string.

        Returns an empty collection when the normalized input is empty or
        cannot be parsed. Nodes for which `UIElement.from_xml_attrib` returns
        `None` are skipped.
        """
        normalized = UIElementCollection._normalize_dump_string(xml_content)
        if not normalized:
            return UIElementCollection([])
        try:
            root = ET.fromstring(normalized)
        except ET.ParseError:
            return UIElementCollection([])

        # `iter()` yields the root and every descendant in document order,
        # i.e. the same order as a parent-first recursive walk.
        candidates = (UIElement.from_xml_attrib(node.attrib) for node in root.iter())
        return UIElementCollection(
            [element for element in candidates if element is not None]
        )
4 changes: 4 additions & 0 deletions src/askui/tools/store/android/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
AndroidAgent.
"""

from askui.tools.store.android.get_uiautomator_hierarchy_tool import (
AndroidGetUIAutomatorHierarchyTool,
)
from askui.tools.store.android.save_screenshot_tool import AndroidSaveScreenshotTool

__all__ = [
"AndroidSaveScreenshotTool",
"AndroidGetUIAutomatorHierarchyTool",
]
74 changes: 74 additions & 0 deletions src/askui/tools/store/android/get_uiautomator_hierarchy_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from askui.models.shared import AndroidBaseTool, ToolTags
from askui.tools.android.agent_os_facade import AndroidAgentOsFacade


class AndroidGetUIAutomatorHierarchyTool(AndroidBaseTool):
    """
    Tool that hands the model a flat, textual view of the Android accessibility
    hierarchy of the connected device (taken from a UIAutomator window dump).

    The result contains one line per view. A line always starts with
    `clickable`, followed by the tap `center` derived from the view's bounds,
    and then, only when non-empty, the quoted `text`, `resource-id`,
    `content-desc` and a short `class` name (last segment of the fully qualified
    class). Views without parseable bounds are left out.

    Use it instead of screenshots when capturing fails or is unavailable, or
    when explicit structure (ids, descriptions, centers) is more useful than
    visual inference. Returned centers and labels should be preferred over
    guessed coordinates.

    Fields are separated by ` | `, for example:
    `clickable=True | center=(x=120, y=340) | text="OK" | class=Button`.

    Args:
        agent_os (AndroidAgentOsFacade | None, optional): The Android agent OS facade.
            If omitted, the agent supplies the connected device implementation at
            runtime.

    Examples:
        ```python
        from askui import AndroidAgent
        from askui.tools.store.android import AndroidGetUIAutomatorHierarchyTool

        with AndroidAgent() as agent:
            agent.act(
                "List tappable elements on the screen using the accessibility tree",
                tools=[AndroidGetUIAutomatorHierarchyTool()],
            )
        ```

        ```python
        from askui import AndroidAgent
        from askui.tools.store.android import AndroidGetUIAutomatorHierarchyTool

        with AndroidAgent(act_tools=[AndroidGetUIAutomatorHierarchyTool()]) as agent:
            agent.act("What buttons and links are visible on this screen?")
        ```
    """

    def __init__(self, agent_os: AndroidAgentOsFacade | None = None) -> None:
        # Model-facing description; the wording is part of the tool contract.
        tool_description = (
            "UIAutomator accessibility snapshot for the current Android screen"
            " (window dump). Returns one text line per view: clickable, tap center"
            " from bounds (`center=(x=..., y=...)`), and when set: text,"
            " resource-id,"
            " content-desc, short view class—fields joined by ` | `. Skips views"
            " without valid bounds. Use instead of screenshots when capture is"
            " unreliable or you need ids, descriptions, and tap centers for"
            " structured reasoning; avoid guessing raw coordinates."
        )
        super().__init__(
            name="get_uiautomator_hierarchy_tool",
            description=tool_description,
            required_tags=[ToolTags.SCALED_AGENT_OS.value],
            agent_os=agent_os,
        )

    def __call__(self) -> str:
        """
        Fetch the UI elements and render them as a single string for the model.

        Returns:
            str: The prefix `UIAutomator hierarchy was retrieved: ` followed by
                the string form of the element collection (one element per
                line; see the class docstring for the field format).
        """
        ui_elements = self.agent_os.get_ui_elements()
        return f"UIAutomator hierarchy was retrieved: {ui_elements!s}"
4 changes: 1 addition & 3 deletions src/askui/utils/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,9 +306,7 @@ def scale_coordinates(
offset = _calc_center_offset(scaling_results.size, target_size)
result = _scale_coordinates(coordinates, offset, scaling_results.factor, inverse)
if check_coordinates_in_bounds:
_check_coordinates_in_bounds(
result, original_size if inverse else scaling_results.size
)
_check_coordinates_in_bounds(result, original_size if inverse else target_size)
return result


Expand Down
Loading