diff --git a/prompts/agent.system.tool.olostep.md b/prompts/agent.system.tool.olostep.md
new file mode 100644
index 000000000..73a83850f
--- /dev/null
+++ b/prompts/agent.system.tool.olostep.md
@@ -0,0 +1,55 @@
+### olostep:
+web scraping and search tool via Olostep API
+scrape: extract markdown/html/text from any url
+search: AI-powered web search with answers
+google: structured Google search results
+map: discover all urls on a website
+
+**Example usages**:
+~~~json
+{
+    "thoughts": ["I need webpage content"],
+    "headline": "Scraping webpage",
+    "tool_name": "olostep",
+    "tool_method": "scrape",
+    "tool_args": {
+        "url": "https://example.com/article"
+    }
+}
+~~~
+
+~~~json
+{
+    "thoughts": ["I need to search for information"],
+    "headline": "Searching web",
+    "tool_name": "olostep",
+    "tool_method": "search",
+    "tool_args": {
+        "query": "What is the capital of France?"
+    }
+}
+~~~
+
+~~~json
+{
+    "thoughts": ["I need Google search results"],
+    "headline": "Google search",
+    "tool_name": "olostep",
+    "tool_method": "google",
+    "tool_args": {
+        "query": "best python libraries 2024"
+    }
+}
+~~~
+
+~~~json
+{
+    "thoughts": ["I need to find all pages on this site"],
+    "headline": "Mapping website",
+    "tool_name": "olostep",
+    "tool_method": "map",
+    "tool_args": {
+        "url": "https://docs.example.com"
+    }
+}
+~~~
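Note: the prompt examples above only show the required argument for each method. The tool implementation later in this patch also reads optional `scrape` arguments (`formats`, `wait`, `parser`, `country`). A minimal sketch of a richer call, written as the Python dict the tool would receive in `self.args` (all values are illustrative):

~~~python
import json

# Hypothetical arguments for olostep's "scrape" method; the optional keys
# mirror what python/tools/olostep.py reads from self.args.
scrape_args = {
    "url": "https://example.com/article",
    "formats": "markdown,text",  # comma-separated string, split by the tool
    "wait": 2000,                # milliseconds to wait before scraping JS-heavy pages
    "country": "US",             # optional geo hint forwarded to the API
}

# The tool normalizes "formats" into a list before calling the helper:
formats = [f.strip() for f in scrape_args["formats"].split(",")]
print(formats)  # ['markdown', 'text']
print(json.dumps(scrape_args, indent=2))
~~~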
diff --git a/python/helpers/olostep_api.py b/python/helpers/olostep_api.py
new file mode 100644
index 000000000..8dd621de5
--- /dev/null
+++ b/python/helpers/olostep_api.py
@@ -0,0 +1,195 @@
+"""
+Olostep API Helper Module for Agent Zero
+
+This module provides functions to interact with the Olostep API for:
+- Web scraping (get markdown, HTML, text, or structured JSON from any URL)
+- Web search and answers (get AI-powered answers with sources)
+- Website mapping (get all URLs on a website)
+
+API Documentation: https://docs.olostep.com
+"""
+
+import aiohttp
+import urllib.parse
+from typing import Optional, Any
+import models
+
+
+OLOSTEP_API_BASE = "https://api.olostep.com"
+
+
+def get_api_key() -> str:
+    """
+    Get the Olostep API key using Agent Zero's standard API key mechanism.
+    The key can be configured through:
+    - The Web UI: Settings > API Keys > Olostep
+    - Environment variable: OLOSTEP_API_KEY or API_KEY_OLOSTEP
+    """
+    return models.get_api_key("olostep")
+
+
+def _get_headers() -> dict:
+    """Get the headers for Olostep API requests."""
+    api_key = get_api_key()
+    if not api_key or api_key == "None":
+        raise ValueError(
+            "Olostep API key not found. Please configure it in Settings > API Keys > Olostep."
+        )
+    return {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+
+
+async def scrape_url(
+    url: str,
+    formats: list[str] = ["markdown"],
+    wait_before_scraping: int = 0,
+    remove_css_selectors: str = "default",
+    parser_id: Optional[str] = None,
+    country: Optional[str] = None,
+) -> dict[str, Any]:
+    """
+    Scrape a URL and return its content in specified formats.
+
+    Args:
+        url: The URL to scrape
+        formats: Output formats - "markdown", "html", "text", "json", "screenshot"
+        wait_before_scraping: Milliseconds to wait before scraping (for JS-heavy sites)
+        remove_css_selectors: "default", "none", or specific selectors to remove
+        parser_id: Optional parser ID for structured JSON extraction (e.g., "@olostep/google-search")
+        country: Optional country code for geo-specific scraping (e.g., "US", "GB")
+
+    Returns:
+        Dictionary containing the scraped content with keys like:
+        - markdown_content, html_content, text_content, json_content
+        - Various hosted URLs for the content
+        - links_on_page, page_metadata
+    """
+    endpoint = f"{OLOSTEP_API_BASE}/v1/scrapes"
+
+    payload = {
+        "url_to_scrape": url,
+        "formats": formats,
+    }
+
+    if wait_before_scraping > 0:
+        payload["wait_before_scraping"] = wait_before_scraping
+
+    if remove_css_selectors:
+        payload["remove_css_selectors"] = remove_css_selectors
+
+    if parser_id:
+        payload["parser"] = {"id": parser_id}
+
+    if country:
+        payload["country"] = country
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(endpoint, json=payload, headers=_get_headers()) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                raise Exception(f"Olostep API error ({response.status}): {error_text}")
+            return await response.json()
+
+
+async def get_answer(
+    task: str,
+    json_format: Optional[dict] = None,
+) -> dict[str, Any]:
+    """
+    Search the web and get an AI-powered answer with sources.
+
+    Args:
+        task: The question or task to answer (e.g., "What is the latest book by J.K. Rowling?")
+        json_format: Optional JSON schema to structure the response
+
+    Returns:
+        Dictionary containing:
+        - result.json_content: The answer in JSON format
+        - result.json_hosted_url: Hosted URL for the answer
+        - sources: List of sources used
+    """
+    endpoint = f"{OLOSTEP_API_BASE}/v1/answers"
+
+    payload = {"task": task}
+
+    if json_format:
+        payload["json_format"] = json_format
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(endpoint, json=payload, headers=_get_headers()) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                raise Exception(f"Olostep API error ({response.status}): {error_text}")
+            return await response.json()
+
+
+async def map_website(
+    url: str,
+    include_urls: Optional[list[str]] = None,
+    exclude_urls: Optional[list[str]] = None,
+    top_n: Optional[int] = None,
+) -> dict[str, Any]:
+    """
+    Get all URLs on a website.
+
+    Args:
+        url: The website URL to map
+        include_urls: Glob patterns for URLs to include (e.g., ["/blog/**"])
+        exclude_urls: Glob patterns for URLs to exclude
+        top_n: Limit the number of URLs returned
+
+    Returns:
+        Dictionary containing:
+        - urls: List of discovered URLs
+        - cursor: Pagination cursor if more results available
+    """
+    endpoint = f"{OLOSTEP_API_BASE}/v1/maps"
+
+    payload = {"url": url}
+
+    if include_urls:
+        payload["include_urls"] = include_urls
+
+    if exclude_urls:
+        payload["exclude_urls"] = exclude_urls
+
+    if top_n:
+        payload["top_n"] = top_n
+
+    async with aiohttp.ClientSession() as session:
+        async with session.post(endpoint, json=payload, headers=_get_headers()) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                raise Exception(f"Olostep API error ({response.status}): {error_text}")
+            return await response.json()
+
+
+async def google_search(
+    query: str,
+    country: str = "us",
+    language: str = "en",
+    num_results: int = 10,
+) -> dict[str, Any]:
+    """
+    Perform a Google search and get structured results.
+
+    Args:
+        query: The search query
+        country: Country code (e.g., "us", "uk", "de")
+        language: Language code (e.g., "en", "es", "de")
+        num_results: Number of results to return
+
+    Returns:
+        Dictionary containing structured search results with:
+        - organic_results: List of search results with title, url, description
+        - ai_overview: AI-generated overview if available
+        - related_searches, people_also_ask, etc.
+    """
+    # Build the Google search URL; the query is URL-encoded so spaces and
+    # special characters do not break the request
+    search_url = (
+        f"https://www.google.com/search?q={urllib.parse.quote_plus(query)}"
+        f"&gl={country}&hl={language}&num={num_results}"
+    )
+
+    return await scrape_url(
+        url=search_url,
+        formats=["json"],
+        parser_id="@olostep/google-search",
+    )
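For orientation, here is a minimal sketch of exercising two of the helpers above on their own. It assumes an Olostep API key is already configured (the helper raises otherwise), that the script runs from the repository root so the `python.helpers` import resolves, and that the response keys match the docstrings; treat the exact field access as illustrative rather than guaranteed.

~~~python
import asyncio

from python.helpers import olostep_api  # module added by this patch


async def demo() -> None:
    # Ask a question and show the structured answer plus its sources.
    answer = await olostep_api.get_answer("What is the capital of France?")
    print(answer.get("result", {}).get("json_content"))
    print(answer.get("sources", []))

    # Map a documentation site, keeping only blog-like URLs (glob pattern).
    site_map = await olostep_api.map_website(
        "https://docs.example.com",
        include_urls=["/blog/**"],
        top_n=25,
    )
    for discovered in site_map.get("urls", []):
        print(discovered)


if __name__ == "__main__":
    asyncio.run(demo())
~~~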
diff --git a/python/helpers/settings.py b/python/helpers/settings.py
index 9e71b7956..a750b828f 100644
--- a/python/helpers/settings.py
+++ b/python/helpers/settings.py
@@ -592,6 +592,11 @@ def convert_out(settings: Settings) -> SettingsOutput:
             _get_api_key_field(settings, pid_lower, provider["label"])
         )
 
+    # Add Olostep API key for web scraping and search
+    api_keys_fields.append(
+        _get_api_key_field(settings, "olostep", "Olostep (Web Scraping & Search)")
+    )
+
     api_keys_section: SettingsSection = {
         "id": "api_keys",
         "title": "API Keys",
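The helper docstring earlier in the patch states the key can also come from the OLOSTEP_API_KEY or API_KEY_OLOSTEP environment variables. Below is a rough, hypothetical stand-in for that documented fallback; the real lookup is whatever `models.get_api_key("olostep")` does, which this patch does not touch.

~~~python
import os
from typing import Optional


def resolve_olostep_key_from_env() -> Optional[str]:
    # Hypothetical illustration of the documented fallback order; the actual
    # resolution lives in models.get_api_key and may differ (e.g., it also
    # reads the value saved via Settings > API Keys > Olostep).
    return os.environ.get("OLOSTEP_API_KEY") or os.environ.get("API_KEY_OLOSTEP")


if __name__ == "__main__":
    key = resolve_olostep_key_from_env()
    print("key configured" if key else "no Olostep key in environment")
~~~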
diff --git a/python/tools/olostep.py b/python/tools/olostep.py
new file mode 100644
index 000000000..840f8ed25
--- /dev/null
+++ b/python/tools/olostep.py
@@ -0,0 +1,281 @@
+"""
+Olostep Tool for Agent Zero
+
+This tool provides web scraping, search, and data extraction capabilities
+through the Olostep API. It allows agents to:
+- Scrape any URL and get clean markdown, HTML, or structured data
+- Search the web and get AI-powered answers with sources
+- Map websites to discover all URLs
+- Perform Google searches with structured results
+"""
+
+from python.helpers.tool import Tool, Response
+from python.helpers.print_style import PrintStyle
+from python.helpers import olostep_api
+from python.helpers.errors import handle_error
+
+
+class Olostep(Tool):
+    """
+    Olostep integration tool for Agent Zero.
+
+    Supported methods:
+    - scrape: Extract content from any URL (markdown, HTML, text, JSON)
+    - search: Search the web and get AI-powered answers
+    - google: Perform Google searches with structured results
+    - map: Discover all URLs on a website
+    """
+
+    async def execute(self, **kwargs) -> Response:
+        method = self.method or "scrape"
+
+        try:
+            if method == "scrape":
+                return await self._scrape()
+            elif method == "search" or method == "answer":
+                return await self._search()
+            elif method == "google":
+                return await self._google_search()
+            elif method == "map":
+                return await self._map_website()
+            else:
+                return Response(
+                    message=f"Unknown method '{method}'. Available methods: scrape, search, google, map",
+                    break_loop=False,
+                )
+        except Exception as e:
+            handle_error(e)
+            return Response(
+                message=f"Olostep error: {str(e)}",
+                break_loop=False,
+            )
+
+    async def _scrape(self) -> Response:
+        """Scrape a URL and return its content."""
+        url = self.args.get("url", "")
+        if not url:
+            return Response(
+                message="Error: 'url' argument is required for scraping",
+                break_loop=False,
+            )
+
+        # Parse formats - default to markdown
+        formats_arg = self.args.get("formats", "markdown")
+        if isinstance(formats_arg, str):
+            formats = [f.strip() for f in formats_arg.split(",")]
+        else:
+            formats = formats_arg
+
+        # Optional parameters
+        wait_ms = int(self.args.get("wait", 0))
+        parser_id = self.args.get("parser", None)
+        country = self.args.get("country", None)
+
+        self.set_progress(f"Scraping {url}...")
+
+        result = await olostep_api.scrape_url(
+            url=url,
+            formats=formats,
+            wait_before_scraping=wait_ms,
+            parser_id=parser_id,
+            country=country,
+        )
+
+        # Extract the relevant content from the result
+        output_parts = []
+        result_data = result.get("result", {})
+
+        if result_data.get("markdown_content"):
+            output_parts.append(f"## Markdown Content\n\n{result_data['markdown_content']}")
+
+        if result_data.get("text_content"):
+            output_parts.append(f"## Text Content\n\n{result_data['text_content']}")
+
+        if result_data.get("json_content"):
+            import json
+            json_str = json.dumps(result_data['json_content'], indent=2) if isinstance(result_data['json_content'], (dict, list)) else str(result_data['json_content'])
+            output_parts.append(f"## JSON Content\n\n```json\n{json_str}\n```")
+
+        if result_data.get("html_content") and "markdown" not in formats:
+            # Only include HTML if markdown wasn't requested (to avoid duplication)
+            output_parts.append(f"## HTML Content\n\n{result_data['html_content'][:5000]}...")
+
+        # Include metadata
+        metadata = result_data.get("page_metadata", {})
+        if metadata:
+            output_parts.append(f"\n## Page Metadata\n- Title: {metadata.get('title', 'N/A')}\n- Status: {metadata.get('status_code', 'N/A')}")
+
+        # Include links if available
+        links = result_data.get("links_on_page", [])
+        if links and len(links) > 0:
+            links_preview = links[:20]  # Limit to first 20 links
+            links_str = "\n".join([f"- {link}" for link in links_preview])
+            if len(links) > 20:
+                links_str += f"\n... and {len(links) - 20} more links"
+            output_parts.append(f"\n## Links on Page\n{links_str}")
+
+        output = "\n\n".join(output_parts) if output_parts else "No content extracted"
+
+        return Response(message=output, break_loop=False)
+
+    async def _search(self) -> Response:
+        """Search the web and get an AI-powered answer."""
+        query = self.args.get("query", "") or self.args.get("task", "")
+        if not query:
+            return Response(
+                message="Error: 'query' argument is required for search",
+                break_loop=False,
+            )
+
+        # Optional JSON format for structured responses
+        json_format = self.args.get("json_format", None)
+        if json_format and isinstance(json_format, str):
+            import json
+            try:
+                json_format = json.loads(json_format)
+            except Exception:
+                json_format = None
+
+        self.set_progress(f"Searching: {query}...")
+
+        result = await olostep_api.get_answer(
+            task=query,
+            json_format=json_format,
+        )
+
+        # Format the response
+        output_parts = []
+        result_data = result.get("result", {})
+
+        if result_data.get("json_content"):
+            import json
+            content = result_data['json_content']
+            if isinstance(content, str):
+                try:
+                    content = json.loads(content)
+                except Exception:
+                    pass
+            if isinstance(content, (dict, list)):
+                output_parts.append(f"## Answer\n\n```json\n{json.dumps(content, indent=2)}\n```")
+            else:
+                output_parts.append(f"## Answer\n\n{content}")
+
+        # Include sources if available
+        sources = result.get("sources", [])
+        if sources:
+            sources_str = "\n".join([f"- {s}" for s in sources[:10]])
+            output_parts.append(f"\n## Sources\n{sources_str}")
+
+        output = "\n\n".join(output_parts) if output_parts else "No answer found"
+
+        return Response(message=output, break_loop=False)
+
+    async def _google_search(self) -> Response:
+        """Perform a Google search with structured results."""
+        query = self.args.get("query", "")
+        if not query:
+            return Response(
+                message="Error: 'query' argument is required for Google search",
+                break_loop=False,
+            )
+
+        country = self.args.get("country", "us")
+        language = self.args.get("language", "en")
+        num_results = int(self.args.get("num_results", 10))
+
+        self.set_progress(f"Google search: {query}...")
+
+        result = await olostep_api.google_search(
+            query=query,
+            country=country,
+            language=language,
+            num_results=num_results,
+        )
+
+        # Format the response
+        result_data = result.get("result", {})
+        json_content = result_data.get("json_content", {})
+
+        if isinstance(json_content, str):
+            import json
+            try:
+                json_content = json.loads(json_content)
+            except Exception:
+                return Response(message=f"Search results:\n{json_content}", break_loop=False)
+
+        output_parts = []
+
+        # AI Overview if available
+        ai_overview = json_content.get("ai_overview", "")
+        if ai_overview:
+            output_parts.append(f"## AI Overview\n{ai_overview}")
+
+        # Organic results
+        organic_results = json_content.get("organic_results", [])
+        if organic_results:
+            results_str = ""
+            for i, r in enumerate(organic_results[:10], 1):
+                title = r.get("title", "No title")
+                url = r.get("url", "")
+                description = r.get("description", "")
+                results_str += f"{i}. **{title}**\n   {url}\n   {description}\n\n"
+            output_parts.append(f"## Search Results\n{results_str}")
+
+        # Related searches
+        related = json_content.get("related_searches", [])
+        if related:
+            related_str = ", ".join(related[:5])
+            output_parts.append(f"## Related Searches\n{related_str}")
+
+        output = "\n\n".join(output_parts) if output_parts else "No results found"
+
+        return Response(message=output, break_loop=False)
+
+    async def _map_website(self) -> Response:
+        """Map a website to discover all URLs."""
+        url = self.args.get("url", "")
+        if not url:
+            return Response(
+                message="Error: 'url' argument is required for mapping",
+                break_loop=False,
+            )
+
+        include_urls = self.args.get("include_urls", None)
+        exclude_urls = self.args.get("exclude_urls", None)
+        top_n = self.args.get("top_n", None)
+
+        if include_urls and isinstance(include_urls, str):
+            include_urls = [p.strip() for p in include_urls.split(",")]
+
+        if exclude_urls and isinstance(exclude_urls, str):
+            exclude_urls = [p.strip() for p in exclude_urls.split(",")]
+
+        if top_n:
+            top_n = int(top_n)
+
+        self.set_progress(f"Mapping website: {url}...")
+
+        result = await olostep_api.map_website(
+            url=url,
+            include_urls=include_urls,
+            exclude_urls=exclude_urls,
+            top_n=top_n,
+        )
+
+        urls = result.get("urls", [])
+        cursor = result.get("cursor", None)
+
+        output_parts = [f"## Website Map: {url}\n"]
+        output_parts.append(f"Found {len(urls)} URLs:\n")
+
+        # Show first 50 URLs
+        for u in urls[:50]:
+            output_parts.append(f"- {u}")
+
+        if len(urls) > 50:
+            output_parts.append(f"\n... and {len(urls) - 50} more URLs")
+
+        if cursor:
+            output_parts.append(f"\n(More results available, use cursor: {cursor})")
+
+        return Response(message="\n".join(output_parts), break_loop=False)
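To make the output contract of `_scrape` concrete, here is a small, self-contained sketch that reproduces its section-building on a made-up response payload. The field names follow the helper docstrings; the data itself is invented purely for illustration.

~~~python
# Illustrative only: mimics how _scrape assembles its markdown reply from a
# scrape result. The payload below is fabricated sample data, not real output.
sample_result = {
    "result": {
        "markdown_content": "# Example Article\n\nSome scraped text...",
        "page_metadata": {"title": "Example Article", "status_code": 200},
        "links_on_page": ["https://example.com/a", "https://example.com/b"],
    }
}

data = sample_result.get("result", {})
parts = []
if data.get("markdown_content"):
    parts.append(f"## Markdown Content\n\n{data['markdown_content']}")
metadata = data.get("page_metadata", {})
if metadata:
    parts.append(
        f"\n## Page Metadata\n- Title: {metadata.get('title', 'N/A')}"
        f"\n- Status: {metadata.get('status_code', 'N/A')}"
    )
links = data.get("links_on_page", [])
if links:
    parts.append("\n## Links on Page\n" + "\n".join(f"- {link}" for link in links[:20]))

print("\n\n".join(parts))
~~~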
diff --git a/requirements.txt b/requirements.txt
index 07be99756..57abe50db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -46,4 +46,5 @@ imapclient>=3.0.1
 html2text>=2024.2.26
 beautifulsoup4>=4.12.3
 exchangelib>=5.4.3
-pywinpty==3.0.2; sys_platform == "win32"
\ No newline at end of file
+pywinpty==3.0.2; sys_platform == "win32"
+aiohttp>=3.9.0
\ No newline at end of file
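Finally, a rough manual smoke test for the whole addition. It assumes the repository layout implied by the imports in this patch, a configured Olostep key, and that `aiohttp` from the updated requirements is installed; the script name and the scraped page are arbitrary.

~~~python
# smoke_test_olostep.py - hypothetical manual check, run from the repo root.
import asyncio
import os
import sys

from python.helpers import olostep_api  # helper added by this patch


async def main() -> int:
    # The helper also accepts a key configured via the Settings UI; this quick
    # check only looks at the environment variables named in its docstring.
    if not (os.environ.get("OLOSTEP_API_KEY") or os.environ.get("API_KEY_OLOSTEP")):
        print("Set OLOSTEP_API_KEY (or configure the key in Settings) first.")
        return 1
    result = await olostep_api.scrape_url("https://example.com", formats=["markdown"])
    markdown = result.get("result", {}).get("markdown_content", "")
    print(f"scraped {len(markdown)} characters of markdown")
    return 0 if markdown else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
~~~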