From f5c1ce1641c1d79d6b4b29979dcc8058079bf339 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Tue, 29 Apr 2025 11:44:54 -0400 Subject: [PATCH 1/2] adding example benchmark --- .../src/browsergym/core/action/highlevel.py | 10 +- browsergym/example_benchmark/README.md | 11 +++ browsergym/example_benchmark/pyproject.toml | 34 +++++++ browsergym/example_benchmark/requirements.txt | 4 + .../src/browsergym/example/__init__.py | 29 ++++++ .../src/browsergym/example/example_task.py | 95 +++++++++++++++++++ .../experiments/benchmark/configs.py | 26 ++++- .../benchmark/metadata/example.csv | 3 + 8 files changed, 206 insertions(+), 6 deletions(-) create mode 100644 browsergym/example_benchmark/README.md create mode 100644 browsergym/example_benchmark/pyproject.toml create mode 100644 browsergym/example_benchmark/requirements.txt create mode 100644 browsergym/example_benchmark/src/browsergym/example/__init__.py create mode 100644 browsergym/example_benchmark/src/browsergym/example/example_task.py create mode 100644 browsergym/experiments/src/browsergym/experiments/benchmark/metadata/example.csv diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index da2c539cf..21639f0e0 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -5,10 +5,10 @@ from . 
import utils from .base import AbstractActionSet -from .functions import ( # check,; uncheck, +from .functions import ( clear, click, - dblclick, + dblclick, # check,; uncheck, drag_and_drop, fill, focus, @@ -245,6 +245,12 @@ goto, # GOTO, SEARCH send_msg_to_user, # TERMINATE ], + # example action set + "example": [ + scroll, + keyboard_type, + mouse_click, + ], } diff --git a/browsergym/example_benchmark/README.md b/browsergym/example_benchmark/README.md new file mode 100644 index 000000000..78e7576d7 --- /dev/null +++ b/browsergym/example_benchmark/README.md @@ -0,0 +1,11 @@ +# Example benchmark for BrowserGym + +This package provides `browsergym.example`, which is an example benchmark for BrowserGym. + +## Setup + +1. Install the package +```sh +pip install -e browsergym/example_benchmark +``` +2. Do any additional setup required by the benchmark. \ No newline at end of file diff --git a/browsergym/example_benchmark/pyproject.toml b/browsergym/example_benchmark/pyproject.toml new file mode 100644 index 000000000..ad150a427 --- /dev/null +++ b/browsergym/example_benchmark/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +requires = ["hatchling", "hatch-requirements-txt"] +build-backend = "hatchling.build" + +[project] +name = "browsergym-example" +description = "Example benchmark for BrowserGym" +authors = [ + {name = "AUTHOR NAME"}, +] +readme = "README.md" +requires-python = ">3.10" +license = {text = "Apache-2.0"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Apache Software License", +] +dynamic = ["dependencies", "version"] + +[project.urls] +homepage = "https://github.com/ServiceNow/BrowserGym" + +[tool.hatch.version] +path = "../core/src/browsergym/core/__init__.py" + +[tool.hatch.metadata.hooks.requirements_txt] +files = 
["requirements.txt"]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/browsergym"]
diff --git a/browsergym/example_benchmark/requirements.txt b/browsergym/example_benchmark/requirements.txt
new file mode 100644
index 000000000..692b4d126
--- /dev/null
+++ b/browsergym/example_benchmark/requirements.txt
@@ -0,0 +1,4 @@
+browsergym-core==0.13.3
+# any other dependencies for your benchmark
+# e.g. torch==1.13.0
+# or your own benchmark package as a backend
\ No newline at end of file
diff --git a/browsergym/example_benchmark/src/browsergym/example/__init__.py b/browsergym/example_benchmark/src/browsergym/example/__init__.py
new file mode 100644
index 000000000..73ccfee8c
--- /dev/null
+++ b/browsergym/example_benchmark/src/browsergym/example/__init__.py
@@ -0,0 +1,29 @@
+import os
+
+from browsergym.core.registration import register_task
+
+from . import example_task
+
+ALL_TASKS = [
+    example_task.WikipediaTask,
+    example_task.AmazonTask,
+]
+
+
+# register each task class under its task id, with a fixed seed and a START_URL-configurable start page
+for task in ALL_TASKS:
+    register_task(
+        task.get_task_id(),
+        task,
+        task_kwargs={
+            "seed": 0,
+            "start_url": os.getenv("START_URL", "https://google.com"),
+        },
+    )
+
+# register_task exposes each task as a gym environment
+# once this package is imported, the tasks can be created with:
+# import gymnasium as gym
+# import browsergym.example  # importing the package runs the registration loop above
+# env = gym.make("browsergym/example.wikipedia")
+# env = gym.make("browsergym/example.amazon")
diff --git a/browsergym/example_benchmark/src/browsergym/example/example_task.py b/browsergym/example_benchmark/src/browsergym/example/example_task.py
new file mode 100644
index 000000000..41498af98
--- /dev/null
+++ b/browsergym/example_benchmark/src/browsergym/example/example_task.py
@@ -0,0 +1,95 @@
+import os
+from typing import Optional, Tuple
+
+import playwright.sync_api
+
+from browsergym.core.task import AbstractBrowserTask
+
+
+class ExampleTask(AbstractBrowserTask):
+    """
+    Example task for browsergym
+    Subclasses must define the `subdomain` (used in the task id) and `goal_url` class attributes.
+    """
+
+    @classmethod
+    def get_task_id(cls):
+        return 
f"example.{cls.subdomain}"
+
+    def __init__(
+        self,
+        seed: int,
+        start_url: str = "https://google.com",
+    ) -> None:
+        super().__init__(seed)
+        self.start_url = start_url
+
+    def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]:
+        """
+        Open the start page; return the goal (also stored on `self.goal`) and an info dict.
+        """
+        page.goto(self.start_url)
+
+        # can also login or perform any other operation
+        # to prepare the task
+
+        # keep the goal on the instance: validate() looks for it in the chat messages
+        self.goal = f"Go to the {self.goal_url} and send a message to the chat."
+        return self.goal, {"goal_url": self.goal_url}
+
+    def validate(self, page, chat_messages):
+        """Check if the task was completed successfully and return a reward.
+
+        Success means the page is at `goal_url` and a chat message contains the goal text.
+
+        Args:
+            page: the active playwright page.
+            chat_messages: the chat messages, as plain strings or dicts with a "message" entry.
+        Returns:
+            reward: float, the reward obtained since last call to validate().
+            done: boolean flag, indicates if the task has finished or not (be it success or fail).
+            message: string, a new user message for the chat.
+            info: dictionary, custom information from the task.
+        """
+        # wait for the page to load if needed
+        page.wait_for_load_state("networkidle")
+        # check if the page is the goal page
+        if page.url == self.goal_url:
+            # chat entries may be dicts (text under "message") or plain strings
+            texts = (m.get("message", "") if isinstance(m, dict) else m for m in chat_messages)
+            if any(self.goal in text for text in texts):
+                return 1.0, True, "Task completed successfully", {}
+        return 0.0, False, "Task not completed", {}
+
+    def cheat(self, page: playwright.sync_api.Page, chat_messages: list[str]) -> None:
+        """
+        Solve the task using a pre-defined solution (optional).
+        """
+        raise NotImplementedError("Cheat method not implemented for this task.")
+
+    def teardown(self) -> None:
+        """
+        Tear down the task and clean up any resource / data created by the task (optional).
+        """
+        # typically shut down the browser...
+        pass
+
+
+class WikipediaTask(ExampleTask):
+    """
+    Wikipedia task for browsergym
+    Goal page: the WIKIPEDIA_URL environment variable, or the English Wikipedia main page if unset.
+    """
+
+    subdomain = "wikipedia"
+    goal_url = os.environ.get("WIKIPEDIA_URL", "https://en.wikipedia.org/wiki/Main_Page")
+
+
+class AmazonTask(ExampleTask):
+    """
+    Amazon task for browsergym
+    Goal page: the AMAZON_URL environment variable, or https://www.amazon.com/ if unset.
+    """
+
+    subdomain = "amazon"
+    goal_url = os.environ.get("AMAZON_URL", "https://www.amazon.com/")
diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py
index c9c5994e0..35a4bb5e8 100644
--- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py
+++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py
@@ -1,9 +1,6 @@
 import numpy as np
 
-from browsergym.experiments.benchmark.metadata.utils import (
-    task_list_from_metadata,
-    task_metadata,
-)
+from browsergym.experiments.benchmark.metadata.utils import task_list_from_metadata, task_metadata
 from browsergym.experiments.benchmark.utils import (
     make_env_args_list_from_fixed_seeds,
     make_env_args_list_from_repeat_tasks,
@@ -87,6 +84,13 @@
         retry_with_force=True,
         demo_mode="off",
     ),
+    "example": HighLevelActionSetArgs(
+        subsets=["example"],
+        multiaction=False,
+        strict=False,
+        retry_with_force=True,
+        demo_mode="off",
+    ),
 }
 
 # all benchmarks are callables designed for lazy loading, i.e. 
`bench = DEFAULT_BENCHMARKS["miniwob_all"]()` @@ -261,4 +265,18 @@ ), task_metadata=task_metadata("weblinx"), ), + "example": lambda: Benchmark( + name="example", + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["example"], + is_multi_tab=False, + supports_parallel_seeds=True, + backends=["example"], + env_args_list=make_env_args_list_from_repeat_tasks( + task_list=task_list_from_metadata(metadata=task_metadata("example")), + max_steps=10, + n_repeats=1, # 1 seed per task in the example benchmark + seeds_rng=np.random.RandomState(42), + ), + task_metadata=task_metadata("example"), + ), } diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/example.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/example.csv new file mode 100644 index 000000000..432bb318b --- /dev/null +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/example.csv @@ -0,0 +1,3 @@ +task_name,any_criterion +example.wikipedia,True +example.amazon,False \ No newline at end of file From 1bff5c459a1df005b0dede9c76fa0d0af6681a91 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal <66757799+amanjaiswal73892@users.noreply.github.com> Date: Mon, 21 Jul 2025 13:59:48 -0400 Subject: [PATCH 2/2] Add reminder to import benchmark in _get_env_name function --- browsergym/experiments/src/browsergym/experiments/loop.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/browsergym/experiments/src/browsergym/experiments/loop.py b/browsergym/experiments/src/browsergym/experiments/loop.py index 22c5d924d..d956e1bb4 100644 --- a/browsergym/experiments/src/browsergym/experiments/loop.py +++ b/browsergym/experiments/src/browsergym/experiments/loop.py @@ -935,6 +935,8 @@ def _get_env_name(task_name: str): import browsergym.assistantbench elif task_name.startswith("weblinx"): import weblinx_browsergym + + ## Do not forget to import your benchmark here.. return f"browsergym/{task_name}"