diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index 66fae37a0..18b5b56a0 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -5,10 +5,10 @@ from . import utils from .base import AbstractActionSet -from .functions import ( # check,; uncheck, +from .functions import ( clear, click, - dblclick, + dblclick, # check,; uncheck, drag_and_drop, fill, focus, @@ -246,6 +246,12 @@ goto, # GOTO, SEARCH send_msg_to_user, # TERMINATE ], + # example action set + "example": [ + scroll, + keyboard_type, + mouse_click, + ], } diff --git a/browsergym/example_benchmark/README.md b/browsergym/example_benchmark/README.md new file mode 100644 index 000000000..78e7576d7 --- /dev/null +++ b/browsergym/example_benchmark/README.md @@ -0,0 +1,11 @@ +# Example benchmark for BrowserGym + +This package provides `browsergym.example`, which is an example benchmark for BrowserGym. + +## Setup + +1. Install the package +```sh +pip install -e browsergym/example_benchmark +``` +2. Do any additional setup required by the benchmark. 
\ No newline at end of file diff --git a/browsergym/example_benchmark/pyproject.toml b/browsergym/example_benchmark/pyproject.toml new file mode 100644 index 000000000..ad150a427 --- /dev/null +++ b/browsergym/example_benchmark/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = ["hatchling", "hatch-requirements-txt"] +build-backend = "hatchling.build" + +[project] +name = "browsergym-example" +description = "Example benchmark for BrowserGym" +authors = [ + {name = "AUTHOR NAME"}, +] +readme = "README.md" +requires-python = ">3.10" +license = {text = "Apache-2.0"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Apache Software License", +] +dynamic = ["dependencies", "version"] + +[project.urls] +homepage = "https://github.com/ServiceNow/BrowserGym" + +[tool.hatch.version] +path = "../core/src/browsergym/core/__init__.py" + +[tool.hatch.metadata.hooks.requirements_txt] +files = ["requirements.txt"] + +[tool.hatch.build.targets.wheel] +packages = ["src/browsergym"] diff --git a/browsergym/example_benchmark/requirements.txt b/browsergym/example_benchmark/requirements.txt new file mode 100644 index 000000000..692b4d126 --- /dev/null +++ b/browsergym/example_benchmark/requirements.txt @@ -0,0 +1,4 @@ +browsergym-core==0.13.3 +# any other dependencies for your benchmark +# e.g. torch==1.13.0 +# or your own benchmark package as a backend \ No newline at end of file diff --git a/browsergym/example_benchmark/src/browsergym/example/__init__.py b/browsergym/example_benchmark/src/browsergym/example/__init__.py new file mode 100644 index 000000000..73ccfee8c --- /dev/null +++ b/browsergym/example_benchmark/src/browsergym/example/__init__.py @@ -0,0 +1,29 @@ +import os + +from browsergym.core.registration import register_task + +from . 
class ExampleTask(AbstractBrowserTask):
    """
    Example task for browsergym.

    Base class of the example benchmark. Concrete tasks subclass it and set
    the `subdomain` and `goal_url` class attributes; the base class only
    declares them, so it is not meant to be registered directly.
    """

    # Suffix of the task id built by `get_task_id()`; set by subclasses.
    subdomain: str
    # URL the page must be on for the task to succeed; set by subclasses.
    goal_url: str

    @classmethod
    def get_task_id(cls):
        """Return the task identifier, `example.<subdomain>`."""
        return f"example.{cls.subdomain}"

    def __init__(
        self,
        seed: int,
        start_url: str = "https://google.com",
    ) -> None:
        """
        Args:
            seed: random seed, forwarded to the parent class.
            start_url: URL the page is sent to when the task is set up.
        """
        super().__init__(seed)
        self.start_url = start_url
        # Filled in by setup(); validate() reads it, so it must always exist.
        self.goal = None

    def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
        """
        Perform any operation to allow the task to start. Return the goal as a string and any information that's relevant.
        """
        page.goto(self.start_url)

        # can also login or perform any other operation
        # to prepare the task

        goal = f"Go to the {self.goal_url} and send a message to the chat."
        # Keep the goal on the instance: validate() looks for it in the chat.
        self.goal = goal
        info = {"goal_url": self.goal_url}
        return goal, info

    def validate(self, page, chat_messages):
        """Check if the task was completed successfully and return a reward.

        i.e. reward, done, message, info

        Args:
            page: the active playwright page.
            chat_messages: the chat messages.

        Returns:
            reward: float, the reward obtained since last call to validate().
            done: boolean flag, indicates if the task has finished or not (be it success or fail).
            message: string, a new user message for the chat.
            info: dictionary, custom information from the task.
        """
        # wait for the page to load if needed
        page.wait_for_load_state("networkidle")
        # Without a goal (setup() not called yet) there is nothing to match.
        if self.goal is not None and page.url == self.goal_url:
            # check if the chat messages contain the goal
            for message in chat_messages:
                # NOTE(review): messages are handled both as plain strings and
                # as dicts carrying the text under a "message" key -- confirm
                # which shape the environment actually passes.
                text = message.get("message", "") if isinstance(message, dict) else message
                if isinstance(text, str) and self.goal in text:
                    return 1.0, True, "Task completed successfully", {}
        return 0.0, False, "Task not completed", {}

    def cheat(self, page: playwright.sync_api.Page, chat_messages: list[str]) -> None:
        """
        Solve the task using a pre-defined solution (optional).
        """
        raise NotImplementedError("Cheat method not implemented for this task.")

    def teardown(self) -> None:
        """
        Tear down the task and clean up any resource / data created by the task (optional).
        """
        # typically shut down the browser...
        pass


class WikipediaTask(ExampleTask):
    """
    Wikipedia task for browsergym.

    The goal URL can be overridden with the WIKIPEDIA_URL environment variable.
    """

    subdomain = "wikipedia"
    goal_url = os.environ.get("WIKIPEDIA_URL", "https://en.wikipedia.org/wiki/Main_Page")


class AmazonTask(ExampleTask):
    """
    Amazon task for browsergym.

    The goal URL can be overridden with the AMAZON_URL environment variable.
    """

    subdomain = "amazon"
    goal_url = os.environ.get("AMAZON_URL", "https://www.amazon.com/")
+example.amazon,False \ No newline at end of file diff --git a/browsergym/experiments/src/browsergym/experiments/loop.py b/browsergym/experiments/src/browsergym/experiments/loop.py index 0ebb9e94c..e8bf43a50 100644 --- a/browsergym/experiments/src/browsergym/experiments/loop.py +++ b/browsergym/experiments/src/browsergym/experiments/loop.py @@ -942,6 +942,8 @@ def _get_env_name(task_name: str): import browsergym.assistantbench elif task_name.startswith("weblinx"): import weblinx_browsergym + + # NOTE: do not forget to import your benchmark package here (e.g. `import browsergym.example`) so that its tasks get registered. return f"browsergym/{task_name}"