Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 191 additions & 28 deletions pdm.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions src/askui/tools/android/agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from PIL import Image

from askui.tools.android.uiautomator_hierarchy import UIElementCollection

ANDROID_KEY = Literal[ # pylint: disable=C0103
"HOME",
"BACK",
Expand Down Expand Up @@ -493,3 +495,10 @@ def pull(self, remote_path: str, local_path: str) -> None:
Pulls a file from the device.
"""
raise NotImplementedError

@abstractmethod
def get_ui_elements(self) -> UIElementCollection:
"""
Return the UI elements as a `UIElementCollection`.

This is the abstract declaration: the base implementation only raises
`NotImplementedError`, so concrete agent OS classes must override it.

Returns:
    UIElementCollection: The collection of UI elements.

Raises:
    NotImplementedError: Always, in this base implementation.
"""
raise NotImplementedError
34 changes: 27 additions & 7 deletions src/askui/tools/android/agent_os_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from askui.models.shared.tool_tags import ToolTags
from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
from askui.tools.android.uiautomator_hierarchy import UIElementCollection
from askui.utils.image_utils import scale_coordinates, scale_image_to_fit


Expand Down Expand Up @@ -36,33 +37,38 @@ def screenshot(self) -> Image.Image:
self._target_resolution,
)

def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]:
def _scale_coordinates(
self,
x: int,
y: int,
from_agent: bool = True,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does `from_agent` mean?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

“from_agent” indicates whether the coordinates were provided by the agent.

) -> Tuple[int, int]:
if self._real_screen_resolution is None:
self._real_screen_resolution = self._agent_os.screenshot().size

return scale_coordinates(
(x, y),
self._real_screen_resolution,
self._target_resolution,
inverse=True,
inverse=from_agent,
)

def tap(self, x: int, y: int) -> None:
x, y = self._scale_coordinates_back(x, y)
x, y = self._scale_coordinates(x, y)
self._agent_os.tap(x, y)

def swipe(
self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
) -> None:
x1, y1 = self._scale_coordinates_back(x1, y1)
x2, y2 = self._scale_coordinates_back(x2, y2)
x1, y1 = self._scale_coordinates(x1, y1)
x2, y2 = self._scale_coordinates(x2, y2)
self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms)

def drag_and_drop(
self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
) -> None:
x1, y1 = self._scale_coordinates_back(x1, y1)
x2, y2 = self._scale_coordinates_back(x2, y2)
x1, y1 = self._scale_coordinates(x1, y1)
x2, y2 = self._scale_coordinates(x2, y2)
self._agent_os.drag_and_drop(x1, y1, x2, y2, duration_in_ms)

def type(self, text: str) -> None:
Expand Down Expand Up @@ -121,3 +127,17 @@ def push(self, local_path: str, remote_path: str) -> None:

def pull(self, remote_path: str, local_path: str) -> None:
"""Delegate to the wrapped agent OS's `pull`, passing both paths through unchanged."""
self._agent_os.pull(remote_path, local_path)

def get_ui_elements(self) -> UIElementCollection:
    """
    Return the wrapped agent OS's UI elements with rescaled center points.

    The collection is fetched from the wrapped agent OS and every element
    that has a center is updated in place via `set_center`, using
    `_scale_coordinates(..., from_agent=False)`. `from_agent=False` marks
    the coordinates as coming from the device rather than from the agent,
    so they are scaled in the opposite direction to `tap`/`swipe` inputs.

    Returns:
        UIElementCollection: The same collection object returned by the
        wrapped agent OS, with element centers rescaled. Elements whose
        center is `None` are left untouched.
    """
    ui_element_collection = self._agent_os.get_ui_elements()
    for element in ui_element_collection:
        if element.center is None:
            # No center to rescale for this element.
            continue
        element.set_center(
            self._scale_coordinates(
                x=element.center[0],
                y=element.center[1],
                from_agent=False,
            )
        )
    return ui_element_collection
103 changes: 60 additions & 43 deletions src/askui/tools/android/ppadb_agent_os.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
UnknownAndroidDisplay,
)
from askui.tools.android.android_agent_os_error import AndroidAgentOsError
from askui.tools.android.uiautomator_hierarchy import UIElementCollection
from askui.utils.annotated_image import AnnotatedImage


Expand All @@ -34,6 +35,7 @@ class PpadbAgentOs(AndroidAgentOs):
"""

_REPORTER_ROLE_NAME: str = "AndroidAgentOS"
_UIAUTOMATOR_DUMP_PATH: str = "/data/local/tmp/askui_window_dump.xml"

def __init__(
self, reporter: Reporter = NULL_REPORTER, device_identifier: str | int = 0
Expand Down Expand Up @@ -72,8 +74,8 @@ def connect(self) -> None:
self.set_device_by_serial_number(self._device_identifier)
else:
self.set_device_by_index(self._device_identifier)
assert self._device is not None
self._device.wait_boot_complete()
device: AndroidDevice = self._get_selected_device()
device.wait_boot_complete()

def disconnect(self) -> None:
self._client = None
Expand All @@ -92,10 +94,9 @@ def _set_display(self, display: AndroidDisplay) -> None:
)

def get_connected_displays(self) -> list[AndroidDisplay]:
self._check_if_device_is_selected()
assert self._device is not None
device: AndroidDevice = self._get_selected_device()
displays: list[AndroidDisplay] = []
output: str = self._device.shell(
output: str = device.shell(
"dumpsys SurfaceFlinger --display-id",
)

Expand Down Expand Up @@ -202,11 +203,10 @@ def set_device_by_serial_number(self, device_sn: str) -> None:
raise AndroidAgentOsError(msg)

def _screenshot_without_reporting(self) -> Image.Image:
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
connection_to_device = self._device.create_connection()
connection_to_device = device.create_connection()
unique_display_id_flag = self._selected_display.get_display_unique_id_flag()
connection_to_device.send(
f"shell:/system/bin/screencap -p {unique_display_id_flag}"
Expand All @@ -222,28 +222,26 @@ def screenshot(self) -> Image.Image:
return screenshot

def shell(self, command: str) -> str:
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
response: str = self._device.shell(command)
response: str = device.shell(command)
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
f"shell(command='{command}') -> '{response}'",
)
return response

def tap(self, x: int, y: int) -> None:
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
display_flag = self._selected_display.get_display_id_flag()
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
f"tap(x={x}, y={y})",
AnnotatedImage(self._screenshot_without_reporting, [(x, y)]),
)
self._device.shell(f"input {display_flag} tap {x} {y}")
device.shell(f"input {display_flag} tap {x} {y}")
self._mouse_position = (x, y)
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
Expand All @@ -259,9 +257,8 @@ def swipe(
y2: int,
duration_in_ms: int = 1000,
) -> None:
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
display_flag = self._selected_display.get_display_id_flag()
self._reporter.add_message(
Expand All @@ -272,9 +269,7 @@ def swipe(
),
AnnotatedImage(self._screenshot_without_reporting, [(x1, y1)]),
)
self._device.shell(
f"input {display_flag} swipe {x1} {y1} {x2} {y2} {duration_in_ms}"
)
device.shell(f"input {display_flag} swipe {x1} {y1} {x2} {y2} {duration_in_ms}")
self._mouse_position = (x2, y2)
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
Expand All @@ -290,9 +285,8 @@ def drag_and_drop(
y2: int,
duration_in_ms: int = 1000,
) -> None:
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
display_flag = self._selected_display.get_display_id_flag()
self._reporter.add_message(
Expand All @@ -301,7 +295,7 @@ def drag_and_drop(
AnnotatedImage(self._screenshot_without_reporting, [(x1, y1)]),
)

self._device.shell(
device.shell(
f"input {display_flag} draganddrop {x1} {y1} {x2} {y2} {duration_in_ms}"
)
self._mouse_position = (x2, y2)
Expand All @@ -319,9 +313,8 @@ def type(self, text: str) -> None:
+ "or special characters which are not supported by the device"
)
raise AndroidAgentOsError(error_msg_nonprintable)
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
display_flag = self._selected_display.get_display_id_flag()
escaped_text = shlex.quote(text)
Expand All @@ -331,7 +324,7 @@ def type(self, text: str) -> None:
f"Typing text: '{text}'",
AnnotatedImage(self._screenshot_without_reporting),
)
self._device.shell(f"input {display_flag} text {shell_safe_text}")
device.shell(f"input {display_flag} text {shell_safe_text}")

self._reporter.add_message(
self._REPORTER_ROLE_NAME,
Expand All @@ -343,17 +336,16 @@ def key_tap(self, key: ANDROID_KEY) -> None:
if key not in get_args(ANDROID_KEY):
error_msg_invalid_key: str = f"Invalid key: {key}"
raise AndroidAgentOsError(error_msg_invalid_key)
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
display_flag = self._selected_display.get_display_id_flag()
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
f"Tapping key: '{key}'",
AnnotatedImage(self._screenshot_without_reporting),
)
self._device.shell(f"input {display_flag} keyevent {key}")
device.shell(f"input {display_flag} keyevent {key}")
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
f"After tapping key: '{key}'",
Expand All @@ -372,17 +364,16 @@ def key_combination(
raise AndroidAgentOsError(error_msg_too_few)

keys_string = " ".join(keys)
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
display_flag = self._selected_display.get_display_id_flag()
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
f"Performing key combination: '{keys_string}'",
AnnotatedImage(self._screenshot_without_reporting),
)
self._device.shell(
device.shell(
f"input {display_flag} keycombination -t {duration_in_ms} {keys_string}"
)
self._reporter.add_message(
Expand All @@ -391,7 +382,7 @@ def key_combination(
AnnotatedImage(self._screenshot_without_reporting),
)

def _check_if_device_is_selected(self) -> None:
def _get_selected_device(self) -> AndroidDevice:
devices: list[AndroidDevice] = self._get_connected_devices()

if not self._device:
Expand All @@ -400,7 +391,7 @@ def _check_if_device_is_selected(self) -> None:

for device in devices:
if device.serial == self._device.serial:
return
return self._device
msg = f"Device {self._device.serial} not found in connected devices"
raise AndroidAgentOsError(msg)

Expand Down Expand Up @@ -441,30 +432,28 @@ def get_selected_device_infos(self) -> tuple[str, AndroidDisplay]:
"""
Get the selected device infos.
"""
self._check_if_device_is_selected()
device: AndroidDevice = self._get_selected_device()
self._check_if_display_is_selected()
assert self._device is not None
assert self._selected_display is not None
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
(
"get_selected_device_infos() -> "
f"Selected device serial number: '{self._device.serial}' "
f"Selected device serial number: '{device.serial}' "
f"and selected display: '{self._selected_display}'"
),
)
return (self._device.serial, self._selected_display)
return (device.serial, self._selected_display)

def push(self, local_path: str, remote_path: str) -> None:
"""
Push a file to the device.
"""
self._check_if_device_is_selected()
assert self._device is not None
device: AndroidDevice = self._get_selected_device()
if not Path.exists(Path(local_path)):
msg = f"Local path {local_path} does not exist"
raise FileNotFoundError(msg)
self._device.push(local_path, remote_path)
device.push(local_path, remote_path)
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
f"push(local_path='{local_path}', remote_path='{remote_path}')",
Expand All @@ -474,11 +463,39 @@ def pull(self, remote_path: str, local_path: str) -> None:
"""
Pull a file from the device.
"""
self._check_if_device_is_selected()
assert self._device is not None
device: AndroidDevice = self._get_selected_device()
Path.mkdir(Path.absolute(Path(local_path).parent), exist_ok=True)
self._device.pull(remote_path, local_path)
device.pull(remote_path, local_path)
self._reporter.add_message(
self._REPORTER_ROLE_NAME,
f"pull(remote_path='{remote_path}', local_path='{local_path}')",
)

def get_ui_elements(self) -> UIElementCollection:
"""
Return UI elements from a `uiautomator dump` of the current screen.

Returns:
UIElementCollection: Parsed hierarchy from the dump, or empty if the dump
has no usable content.

Raises:
AndroidAgentOsError: When the dump command does not report success (often
while animations are visible on screen).

Notes:
`uiautomator dump` is unreliable while the screen shows animation
(transitions, loaders, pulsing highlights, etc.). Retry after motion has
stopped and the UI has settled.
"""
self._get_selected_device()
dump_cmd = f"uiautomator dump {self._UIAUTOMATOR_DUMP_PATH}"
dump_response = self.shell(dump_cmd)
if "dumped" not in dump_response.lower():
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does "dumped" mean?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

“dumped” is included in the response of a successful UI dump.

msg = f"Failed to dump UI hierarchy: {dump_response}"
raise AndroidAgentOsError(msg)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have to terminate the Agent Loop, or is this error recoverable by the Agent?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don’t think so, since I assume the agent can auto-recover and use different methods, such as taking a screenshot or using the shell.


raw = self.shell(f"cat {self._UIAUTOMATOR_DUMP_PATH}")
if not raw or not raw.strip():
return UIElementCollection([])
return UIElementCollection.build_from_xml_dump(raw)
Loading
Loading