diff --git a/workflows/pyproject.toml b/workflows/pyproject.toml index 9637d124..1590a7c5 100644 --- a/workflows/pyproject.toml +++ b/workflows/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "workflow-use" -version = "0.2.7" +version = "0.2.8" authors = [{ name = "Gregor Zunic" }] description = "Create, edit, run deterministic workflows" readme = "README.md" @@ -13,7 +13,7 @@ classifiers = [ dependencies = [ "aiofiles>=24.1.0", - "browser-use>=0.9.0", + "browser-use>=0.9.4", "fastapi>=0.115.12", "fastmcp>=2.3.4", "typer>=0.15.3", diff --git a/workflows/tests/test_element_finder.py b/workflows/tests/test_element_finder.py index 04254178..63d944c7 100644 --- a/workflows/tests/test_element_finder.py +++ b/workflows/tests/test_element_finder.py @@ -5,7 +5,8 @@ and returns element indices (not Playwright element handles). """ -from unittest.mock import Mock, AsyncMock +from unittest.mock import AsyncMock, Mock + from workflow_use.workflow.element_finder import ElementFinder diff --git a/workflows/tests/test_go_back_fix.py b/workflows/tests/test_go_back_fix.py index d2306a0d..e130d87f 100644 --- a/workflows/tests/test_go_back_fix.py +++ b/workflows/tests/test_go_back_fix.py @@ -3,10 +3,11 @@ """ import asyncio -import yaml from pathlib import Path -from browser_use import Browser + +import yaml from browser_use.llm import ChatAnthropic + from workflow_use.workflow.service import Workflow @@ -48,7 +49,7 @@ async def test_go_back_fix(): # Run the workflow result = await workflow.run({}) - print(f'\nโœ… Workflow completed!') + print('\nโœ… Workflow completed!') print(f'๐Ÿ“Š Result: {result}') # Clean up diff --git a/workflows/tests/test_selector_generator_fix.py b/workflows/tests/test_selector_generator_fix.py index 0d9e7022..018abbff 100644 --- a/workflows/tests/test_selector_generator_fix.py +++ b/workflows/tests/test_selector_generator_fix.py @@ -2,9 +2,10 @@ Test that CapturingController now has selector_generator attribute. 
""" +from browser_use.llm import ChatBrowserUse + from workflow_use.healing.selector_generator import SelectorGenerator from workflow_use.healing.service import HealingService -from browser_use.llm import ChatBrowserUse def test_selector_generator_initialization(): diff --git a/workflows/test_wait_times.py b/workflows/tests/test_wait_times.py similarity index 99% rename from workflows/test_wait_times.py rename to workflows/tests/test_wait_times.py index 682725ef..423ba707 100644 --- a/workflows/test_wait_times.py +++ b/workflows/tests/test_wait_times.py @@ -31,8 +31,6 @@ from pathlib import Path from unittest.mock import AsyncMock, Mock, patch -import yaml - from workflow_use.schema.views import WorkflowDefinitionSchema @@ -244,6 +242,7 @@ async def test_actual_timing(): print('๐Ÿงช Testing actual execution timing...\n') from browser_use.agent.views import ActionResult + from workflow_use.workflow.service import Workflow # Create a workflow with known wait times @@ -339,8 +338,8 @@ async def mock_sleep(duration): mock_llm = Mock() # Import and patch - from workflow_use.workflow.service import Workflow from workflow_use.workflow.semantic_executor import SemanticWorkflowExecutor + from workflow_use.workflow.service import Workflow with patch('asyncio.sleep', mock_sleep): with patch.object( diff --git a/workflows/tests/test_workflow_execution.py b/workflows/tests/test_workflow_execution.py index cea4eb10..472c6456 100644 --- a/workflows/tests/test_workflow_execution.py +++ b/workflows/tests/test_workflow_execution.py @@ -4,8 +4,7 @@ Tests the fixes for go_back/go_forward (empty action models) and deterministic execution. 
""" -from unittest.mock import Mock, AsyncMock, patch -from workflow_use.schema.views import WorkflowDefinitionSchema, NavigationStep, ClickStep +from workflow_use.schema.views import NavigationStep class TestWorkflowExecution: diff --git a/workflows/uv.lock b/workflows/uv.lock index 19de0746..4db61b5f 100644 --- a/workflows/uv.lock +++ b/workflows/uv.lock @@ -194,7 +194,7 @@ wheels = [ [[package]] name = "browser-use" -version = "0.9.1" +version = "0.9.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -203,6 +203,8 @@ dependencies = [ { name = "authlib" }, { name = "bubus" }, { name = "cdp-use" }, + { name = "click" }, + { name = "cloudpickle" }, { name = "google-api-core" }, { name = "google-api-python-client" }, { name = "google-auth" }, @@ -210,6 +212,7 @@ dependencies = [ { name = "google-genai" }, { name = "groq" }, { name = "httpx" }, + { name = "inquirerpy" }, { name = "markdownify" }, { name = "mcp" }, { name = "ollama" }, @@ -225,13 +228,14 @@ dependencies = [ { name = "python-dotenv" }, { name = "reportlab" }, { name = "requests" }, + { name = "rich" }, { name = "screeninfo", marker = "platform_system != 'darwin'" }, { name = "typing-extensions" }, { name = "uuid7" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/52/e4/7a878e36de58d84eef2d1c2cc2dcd67f15efc985d5cb66a0de2c56690b6a/browser_use-0.9.1.tar.gz", hash = "sha256:c1529cfe487e0dc627512eeab0554c4afa771cacf64632ddb484c63985c3bf50", size = 404187, upload-time = "2025-10-24T09:55:47.067Z" } +sdist = { url = "https://files.pythonhosted.org/packages/db/5c/06f8be2b91c366e11c9c99fbfd24aad1f3f8d8e9a637b7d27c46e9993173/browser_use-0.9.5.tar.gz", hash = "sha256:f8285fe253b149d01769a7084883b4cf4db351e2f38e26302c157bcbf14a703f", size = 435288, upload-time = "2025-11-01T02:21:17.995Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/34/1a/1b4b8ff7cbc0a002515d5a2bc96ae827b2e481ccbacce147ef443b0ebeb0/browser_use-0.9.1-py3-none-any.whl", hash 
= "sha256:4700c40a4cf8397f33797ef1b980b13a86f1f1ca4eb66bbe2d58e5d212e7838a", size = 487156, upload-time = "2025-10-24T09:55:45.477Z" }, + { url = "https://files.pythonhosted.org/packages/a1/5c/0748d8b3e8ac69f2c2581734ad0b7dc0acd28d544908d7283c59c3a65aa5/browser_use-0.9.5-py3-none-any.whl", hash = "sha256:4a2e92847204d1ded269026a99cb0cc0e60e38bd2751fa3f58aedd78f00b4e67", size = 522739, upload-time = "2025-11-01T02:21:16.382Z" }, ] [[package]] @@ -426,6 +430,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" }, ] +[[package]] +name = "cloudpickle" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -866,6 +879,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "inquirerpy" +version = "0.3.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pfzy" }, + { name = 
"prompt-toolkit" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/73/7570847b9da026e07053da3bbe2ac7ea6cde6bb2cbd3c7a5a950fa0ae40b/InquirerPy-0.3.4.tar.gz", hash = "sha256:89d2ada0111f337483cb41ae31073108b2ec1e618a49d7110b0d7ade89fc197e", size = 44431, upload-time = "2022-06-27T23:11:20.598Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ce/ff/3b59672c47c6284e8005b42e84ceba13864aa0f39f067c973d1af02f5d91/InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4", size = 67677, upload-time = "2022-06-27T23:11:17.723Z" }, +] + [[package]] name = "jiter" version = "0.10.0" @@ -1283,6 +1309,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/39/c2/646d2e93e0af70f4e5359d870a63584dacbc324b54d73e6b3267920ff117/pandas-2.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bb3be958022198531eb7ec2008cfc78c5b1eed51af8600c6c5d9160d89d8d249", size = 13231847, upload-time = "2025-06-05T03:27:51.465Z" }, ] +[[package]] +name = "pfzy" +version = "0.3.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d9/5a/32b50c077c86bfccc7bed4881c5a2b823518f5450a30e639db5d3711952e/pfzy-0.3.4.tar.gz", hash = "sha256:717ea765dd10b63618e7298b2d98efd819e0b30cd5905c9707223dceeb94b3f1", size = 8396, upload-time = "2022-01-28T02:26:17.946Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/d7/8ff98376b1acc4503253b685ea09981697385ce344d4e3935c2af49e044d/pfzy-0.3.4-py3-none-any.whl", hash = "sha256:5f50d5b2b3207fa72e7ec0ef08372ef652685470974a107d0d4999fc5a903a96", size = 8537, upload-time = "2022-01-28T02:26:16.047Z" }, +] + [[package]] name = "pillow" version = "11.3.0" @@ -1396,6 +1431,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/e2/c158366e621562ef224f132e75c1d1c1fce6b078a19f7d8060451a12d4b9/posthog-3.25.0-py2.py3-none-any.whl", hash = 
"sha256:85db78c13d1ecb11aed06fad53759c4e8fb3633442c2f3d0336bc0ce8a585d30", size = 89115, upload-time = "2025-04-15T21:15:43.934Z" }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.3.2" @@ -4763,6 +4810,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/4b/4cef6ce21a2aaca9d852a6e84ef4f135d99fcd74fa75105e2fc0c8308acd/uvicorn-0.34.2-py3-none-any.whl", hash = "sha256:deb49af569084536d269fe0a6d67e3754f104cf03aba7c11c40f01aadf33c403", size = 62483, upload-time = "2025-04-19T06:02:48.42Z" }, ] +[[package]] +name = "wcwidth" +version = "0.2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/30/6b0809f4510673dc723187aeaf24c7f5459922d01e2f794277a3dfb90345/wcwidth-0.2.14.tar.gz", hash = "sha256:4d478375d31bc5395a3c55c40ccdf3354688364cd61c4f6adacaa9215d0b3605", size = 102293, upload-time = "2025-09-22T16:29:53.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" }, +] + [[package]] name = 
"websockets" version = "15.0.1" @@ -4807,7 +4863,7 @@ wheels = [ [[package]] name = "workflow-use" -version = "0.2.7" +version = "0.2.8" source = { editable = "." } dependencies = [ { name = "aiofiles" }, @@ -4831,7 +4887,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "aiofiles", specifier = ">=24.1.0" }, - { name = "browser-use", specifier = ">=0.9.0" }, + { name = "browser-use", specifier = ">=0.9.4" }, { name = "faiss-cpu", specifier = "==1.10.0" }, { name = "fastapi", specifier = ">=0.115.12" }, { name = "fastmcp", specifier = ">=2.3.4" }, diff --git a/workflows/workflow_use/healing/service.py b/workflows/workflow_use/healing/service.py index fe98d11a..29120abb 100644 --- a/workflows/workflow_use/healing/service.py +++ b/workflows/workflow_use/healing/service.py @@ -28,11 +28,30 @@ def __init__( enable_variable_extraction: bool = True, use_deterministic_conversion: bool = False, enable_ai_validation: bool = False, + # NEW: Pattern-based variable identification (no LLM, $0 cost!) 
+ enable_pattern_variable_identification: bool = True, + pattern_variable_confidence: float = 0.5, + # NEW: YAML cleanup options + cleanup_yaml: bool = True, + remove_descriptions: bool = True, + remove_verification_checks: bool = True, + remove_expected_outcomes: bool = True, ): self.llm = llm self.enable_variable_extraction = enable_variable_extraction self.use_deterministic_conversion = use_deterministic_conversion self.enable_ai_validation = enable_ai_validation + + # Pattern-based variable identification settings + self.enable_pattern_variable_identification = enable_pattern_variable_identification + self.pattern_variable_confidence = pattern_variable_confidence + + # YAML cleanup settings + self.cleanup_yaml = cleanup_yaml + self.remove_descriptions = remove_descriptions + self.remove_verification_checks = remove_verification_checks + self.remove_expected_outcomes = remove_expected_outcomes + self.variable_extractor = VariableExtractor(llm=llm) if enable_variable_extraction else None self.deterministic_converter = DeterministicWorkflowConverter(llm=llm) if use_deterministic_conversion else None self.selector_generator = SelectorGenerator() # Initialize multi-strategy selector generator @@ -44,6 +63,98 @@ def __init__( def _remove_none_fields_from_dict(self, d: dict) -> dict: return {k: v for k, v in d.items() if v is not None} + def _post_process_workflow(self, workflow_definition: WorkflowDefinitionSchema) -> WorkflowDefinitionSchema: + """ + Post-process the generated workflow: + 1. Apply pattern-based variable identification + 2. 
Clean up verbose YAML fields + """ + try: + workflow_dict = workflow_definition.model_dump() + + # Step 1: Pattern-based variable identification + if self.enable_pattern_variable_identification: + try: + print('\n๐Ÿ” Applying pattern-based variable identification...') + print(f' Confidence threshold: {self.pattern_variable_confidence}') + + # Import the identifier directly to avoid package issues + import sys + import importlib.util + from pathlib import Path + + # Get the path to variable_identifier.py + var_id_path = Path(__file__).parent.parent / 'workflow' / 'variable_identifier.py' + + if var_id_path.exists(): + # Load the module directly + spec = importlib.util.spec_from_file_location('variable_identifier', var_id_path) + var_id_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(var_id_module) + + # Use the identifier + workflow_dict = var_id_module.identify_variables_in_workflow( + workflow_dict, min_confidence=self.pattern_variable_confidence, use_llm=False + ) + + var_count = workflow_dict.get('metadata', {}).get('identified_variable_count', 0) + if var_count > 0: + print(f' โœ… Identified {var_count} variables!') + if workflow_dict.get('input_schema'): + print(' Variables:') + for var in workflow_dict['input_schema'][:5]: # Show first 5 + default_info = f' (default: {var.get("default", "N/A")})' if 'default' in var else '' + print(f' โ€ข {var["name"]}: {var["type"]}{default_info}') + if len(workflow_dict['input_schema']) > 5: + print(f' ... 
and {len(workflow_dict["input_schema"]) - 5} more') + else: + print(' โ„น๏ธ No variables identified (confidence too low or no input steps)') + else: + print(f' โš ๏ธ variable_identifier.py not found at {var_id_path}') + + except Exception as e: + print(f' โš ๏ธ Warning: Variable identification failed: {e}') + import traceback + + traceback.print_exc() + + # Step 2: Clean up YAML + if self.cleanup_yaml: + try: + print('\n๐Ÿงน Cleaning up YAML (removing verbose fields)...') + fields_removed = 0 + + for step in workflow_dict.get('steps', []): + if self.remove_descriptions and 'description' in step: + del step['description'] + fields_removed += 1 + if self.remove_verification_checks and 'verification_checks' in step: + del step['verification_checks'] + fields_removed += 1 + if self.remove_expected_outcomes and 'expected_outcome' in step: + del step['expected_outcome'] + fields_removed += 1 + + print(f' โœ… Removed {fields_removed} verbose fields') + + except Exception as e: + print(f' โš ๏ธ Warning: YAML cleanup failed: {e}') + import traceback + + traceback.print_exc() + + # Recreate workflow from processed dict + print('\n๐Ÿ”„ Reconstructing workflow...') + return WorkflowDefinitionSchema(**workflow_dict) + + except Exception as e: + print(f'\nโŒ Post-processing failed completely: {e}') + import traceback + + traceback.print_exc() + print(' Returning original workflow...') + return workflow_definition + def _history_to_workflow_definition(self, history_list: AgentHistoryList) -> list[UserMessage]: # history @@ -593,4 +704,7 @@ async def act(self, action, browser_session, *args, **kwargs): print(f'\nโš ๏ธ Validation failed: {e}') print('Continuing with original workflow...') + # Post-process: Apply variable identification and YAML cleanup + workflow_definition = self._post_process_workflow(workflow_definition) + return workflow_definition diff --git a/workflows/workflow_use/recorder/semantic_converter.py b/workflows/workflow_use/recorder/semantic_converter.py 
index 1d6630a8..24a707e2 100644 --- a/workflows/workflow_use/recorder/semantic_converter.py +++ b/workflows/workflow_use/recorder/semantic_converter.py @@ -8,8 +8,16 @@ class SemanticWorkflowConverter: """Converts recorded workflows to use semantic targeting instead of CSS selectors.""" - def __init__(self): - pass + def __init__(self, enable_variable_identification: bool = True, variable_config: Optional[Dict[str, Any]] = None): + """ + Initialize the semantic workflow converter. + + Args: + enable_variable_identification: Whether to automatically identify and parameterize variables + variable_config: Optional configuration for variable identification (min_confidence, etc.) + """ + self.enable_variable_identification = enable_variable_identification + self.variable_config = variable_config or {} def convert_workflow_to_semantic(self, workflow_data: Dict[str, Any]) -> Dict[str, Any]: """Convert a recorded workflow to use semantic targeting.""" @@ -28,6 +36,10 @@ def convert_workflow_to_semantic(self, workflow_data: Dict[str, Any]) -> Dict[st if 'steps' in semantic_workflow: semantic_workflow['steps'] = self._convert_steps_to_semantic(semantic_workflow['steps']) + # Identify and parameterize variables + if self.enable_variable_identification: + semantic_workflow = self._identify_and_parameterize_variables(semantic_workflow) + return semantic_workflow def _convert_steps_to_semantic(self, steps: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -235,6 +247,36 @@ def _extract_semantic_target_text(self, step: Dict[str, Any]) -> Optional[str]: return None + def _identify_and_parameterize_variables(self, workflow_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Identify and parameterize variables in the workflow. + + This method imports and uses the VariableIdentifier to automatically detect + input values that should be parameterized as variables. 
+ """ + try: + from workflow_use.workflow.variable_identifier import identify_variables_in_workflow + + min_confidence = self.variable_config.get('min_confidence', 0.6) + use_llm = self.variable_config.get('use_llm', False) + + logger.info('Identifying variables in workflow...') + workflow_with_vars = identify_variables_in_workflow(workflow_data, min_confidence, use_llm) + + if workflow_with_vars.get('metadata', {}).get('identified_variable_count', 0) > 0: + logger.info(f'Successfully identified {workflow_with_vars["metadata"]["identified_variable_count"]} variables') + else: + logger.info('No variables identified in workflow') + + return workflow_with_vars + + except ImportError as e: + logger.warning(f'Variable identification not available: {e}') + return workflow_data + except Exception as e: + logger.error(f'Error during variable identification: {e}', exc_info=True) + return workflow_data + def convert_recorded_workflow_to_semantic(workflow_data: Dict[str, Any]) -> Dict[str, Any]: """Convenience function to convert a recorded workflow to semantic targeting.""" diff --git a/workflows/workflow_use/workflow/tests/test_variable_identifier.py b/workflows/workflow_use/workflow/tests/test_variable_identifier.py new file mode 100644 index 00000000..10a92539 --- /dev/null +++ b/workflows/workflow_use/workflow/tests/test_variable_identifier.py @@ -0,0 +1,284 @@ +""" +Tests for Variable Identifier + +Tests the automatic identification and parameterization of variables in workflows. 
+""" + +import pytest +from workflow_use.workflow.variable_identifier import ( + VariableIdentifier, + VariableType, + identify_variables_in_workflow, +) +from workflow_use.workflow.variable_config import VariableConfigPresets + + +class TestVariableIdentifier: + """Test the VariableIdentifier class.""" + + def test_pattern_detection_email(self): + """Test email pattern detection.""" + identifier = VariableIdentifier() + context = {'name': 'email', 'label': 'Email Address'} + + candidate = identifier.identify_variable('john.doe@example.com', context) + + assert candidate is not None + assert candidate.variable_type == VariableType.EMAIL + assert candidate.confidence >= 0.85 + assert 'email' in candidate.variable_name.lower() + + def test_pattern_detection_phone(self): + """Test phone number pattern detection.""" + identifier = VariableIdentifier() + context = {'name': 'phone', 'placeholder': 'Enter phone number'} + + candidate = identifier.identify_variable('555-123-4567', context) + + assert candidate is not None + assert candidate.variable_type == VariableType.PHONE + assert candidate.confidence >= 0.85 + + def test_pattern_detection_ssn(self): + """Test SSN pattern detection.""" + identifier = VariableIdentifier() + context = {'name': 'ssn'} + + candidate = identifier.identify_variable('123-45-6789', context) + + assert candidate is not None + assert candidate.variable_type == VariableType.SSN + assert candidate.confidence >= 0.85 + + def test_context_based_detection(self): + """Test context-based variable detection.""" + identifier = VariableIdentifier() + + # First name field + context = {'name': 'firstName', 'label': 'First Name'} + candidate = identifier.identify_variable('John', context) + + assert candidate is not None + assert 'name' in candidate.variable_name.lower() + assert candidate.confidence >= 0.6 + + def test_static_value_rejection(self): + """Test that static values are not parameterized.""" + identifier = VariableIdentifier() + context = {} + + 
# Common static values should be rejected + for value in ['submit', 'yes', 'no', 'true', 'false', '']: + candidate = identifier.identify_variable(value, context) + assert candidate is None + + def test_variable_name_normalization(self): + """Test variable name normalization.""" + identifier = VariableIdentifier() + + # Test various input formats + test_cases = [ + ('firstName', 'firstname'), + ('first-name', 'first_name'), + ('input-email-field', 'email_field'), + ('123test', 'field_123test'), + ] + + for input_name, expected_pattern in test_cases: + normalized = identifier._normalize_variable_name(input_name) + assert expected_pattern in normalized or normalized == expected_pattern + + def test_unique_variable_names(self): + """Test that duplicate variable names get unique suffixes.""" + identifier = VariableIdentifier() + context = {'name': 'email'} + + # First email + name1 = identifier._ensure_unique_variable_name('email') + assert name1 == 'email' + + # Second email + name2 = identifier._ensure_unique_variable_name('email') + assert name2 == 'email_2' + + # Third email + name3 = identifier._ensure_unique_variable_name('email') + assert name3 == 'email_3' + + def test_workflow_identification(self): + """Test end-to-end workflow variable identification.""" + workflow_data = { + 'name': 'Test Form', + 'steps': [ + { + 'type': 'navigation', + 'url': 'https://example.com', + }, + { + 'type': 'input', + 'value': 'john.doe@example.com', + 'cssSelector': 'input[name="email"]', + 'semanticInfo': {'name': 'email', 'labelText': 'Email Address'}, + }, + { + 'type': 'input', + 'value': 'JohnDoe123', + 'cssSelector': 'input[name="username"]', + 'semanticInfo': {'name': 'username', 'placeholder': 'Enter username'}, + }, + { + 'type': 'input', + 'value': 'submit', # Should NOT be parameterized + 'cssSelector': 'button[type="submit"]', + }, + ], + } + + result = identify_variables_in_workflow(workflow_data, min_confidence=0.6) + + # Check that variables were identified + 
assert 'input_schema' in result + assert len(result['input_schema']) >= 2 # email and username + + # Check that email was replaced with placeholder + email_step = result['steps'][1] + assert email_step['value'] == '{email}' + + # Check that username was replaced + username_step = result['steps'][2] + assert '{' in username_step['value'] # Should be a variable placeholder + + # Check that submit button was NOT parameterized + submit_step = result['steps'][3] + assert submit_step['value'] == 'submit' + + def test_input_schema_generation(self): + """Test that input schema is generated correctly.""" + workflow_data = { + 'name': 'Registration Form', + 'steps': [ + { + 'type': 'input', + 'value': 'john@example.com', + 'cssSelector': 'input[id="email"]', + 'semanticInfo': {'id': 'email'}, + }, + { + 'type': 'input', + 'value': '555-1234', + 'cssSelector': 'input[name="phone"]', + 'semanticInfo': {'name': 'phone'}, + }, + ], + } + + result = identify_variables_in_workflow(workflow_data) + + schema = result.get('input_schema', []) + assert len(schema) >= 2 + + # Check schema structure + email_schema = next((s for s in schema if 'email' in s['name']), None) + assert email_schema is not None + assert email_schema['type'] == 'string' + assert email_schema['required'] is True + + def test_already_parameterized_values(self): + """Test that already parameterized values are not re-processed.""" + workflow_data = { + 'name': 'Test', + 'steps': [ + { + 'type': 'input', + 'value': '{email}', # Already a variable + 'cssSelector': 'input[name="email"]', + } + ], + } + + result = identify_variables_in_workflow(workflow_data) + + # Should not add duplicate variables + assert result['steps'][0]['value'] == '{email}' + + def test_dynamic_value_detection(self): + """Test heuristic detection of dynamic-looking values.""" + identifier = VariableIdentifier(min_confidence=0.4) + context = {} + + # Values that look dynamic + dynamic_values = [ + 'MyPassword123!', + 'User_2024_Name', + 
'ComplexValue@123', + ] + + for value in dynamic_values: + candidate = identifier.identify_variable(value, context) + assert candidate is not None, f'Failed to detect dynamic value: {value}' + + def test_config_presets(self): + """Test configuration presets.""" + strict_config = VariableConfigPresets.strict() + assert strict_config.min_confidence == 0.85 + + aggressive_config = VariableConfigPresets.aggressive() + assert aggressive_config.min_confidence == 0.4 + + form_config = VariableConfigPresets.form_filling() + assert 'address' in form_config.always_parameterize + + +class TestIntegrationWithSemanticConverter: + """Test integration with the semantic converter.""" + + def test_semantic_converter_with_variables(self): + """Test that semantic converter can identify variables.""" + from workflow_use.recorder.semantic_converter import SemanticWorkflowConverter + + workflow_data = { + 'name': 'Test Workflow', + 'steps': [ + { + 'type': 'input', + 'value': 'test@example.com', + 'cssSelector': 'input[name="email"]', + 'semanticInfo': {'name': 'email'}, + 'targetText': 'Email', + } + ], + } + + converter = SemanticWorkflowConverter(enable_variable_identification=True, variable_config={'min_confidence': 0.6}) + + result = converter.convert_workflow_to_semantic(workflow_data) + + # Check that variables were identified + assert 'input_schema' in result or result['steps'][0]['value'] == '{email}' + + def test_semantic_converter_disable_variables(self): + """Test that variable identification can be disabled.""" + from workflow_use.recorder.semantic_converter import SemanticWorkflowConverter + + workflow_data = { + 'name': 'Test Workflow', + 'steps': [ + { + 'type': 'input', + 'value': 'test@example.com', + 'cssSelector': 'input[name="email"]', + 'targetText': 'Email', + } + ], + } + + converter = SemanticWorkflowConverter(enable_variable_identification=False) + + result = converter.convert_workflow_to_semantic(workflow_data) + + # Value should remain unchanged + assert 
@dataclass
class VariableIdentificationConfig:
    """Settings bundle for variable identification.

    This class only stores values; the code that consumes these settings is
    not part of this module. Every mutable default is built through
    ``default_factory`` so instances never share sets or dicts.
    """

    # Confidence threshold, intended range 0.0 to 1.0 (not validated here).
    # Intended meaning: candidates scoring below this are not parameterized.
    min_confidence: float = 0.6

    # Enable LLM-based identification for ambiguous cases.
    use_llm: bool = False

    # Patterns to always parameterize (intended to be matched case-insensitively).
    always_parameterize: Set[str] = field(
        default_factory=lambda: {
            'email',
            'password',
            'username',
            'phone',
            'ssn',
            'social_security',
            'credit_card',
            'card_number',
            'cvv',
            'routing',
            'account',
        }
    )

    # Patterns to never parameterize (static values such as button captions).
    never_parameterize: Set[str] = field(
        default_factory=lambda: {
            'submit',
            'cancel',
            'ok',
            'yes',
            'no',
            'true',
            'false',
            'search',
            'login',
            'signup',
            'signin',
        }
    )

    # Custom regex patterns for domain-specific fields.
    # Format: {field_name_pattern: variable_type}
    custom_patterns: Dict[str, str] = field(default_factory=dict)

    # Minimum value length to consider for parameterization.
    min_value_length: int = 2

    # Maximum value length (very long values might not be variables).
    max_value_length: int = 500

    # Whether to use context hints (labels, placeholders) for detection.
    use_context_hints: bool = True

    # Whether to generate default values in the schema.
    include_defaults: bool = True

    # Whether to mark all identified variables as required.
    mark_as_required: bool = True

    # Custom variable name mappings.
    # Format: {detected_name: preferred_name}
    variable_name_mappings: Dict[str, str] = field(
        default_factory=lambda: {
            'fname': 'first_name',
            'lname': 'last_name',
            'tel': 'phone_number',
            'mob': 'mobile_number',
            'dob': 'date_of_birth',
        }
    )

    # Whether related field names should be grouped
    # (e.g., firstName, middleName, lastName -> name_first, name_middle, name_last).
    group_related_fields: bool = True

    # Free-form domain-specific settings. The value type is ``object`` (any
    # value); the previous annotation used the builtin function ``any``,
    # which is not a type.
    domain_config: Dict[str, object] = field(default_factory=dict)


# Preset configurations for different use cases
class VariableConfigPresets:
    """Factory methods returning ready-made configurations.

    Every method builds and returns a fresh ``VariableIdentificationConfig``,
    so callers may mutate the result without affecting later calls.
    """

    @staticmethod
    def strict() -> VariableIdentificationConfig:
        """
        Strict configuration - only parameterize high-confidence matches.
        Use for workflows where false positives are costly.
        """
        return VariableIdentificationConfig(
            min_confidence=0.85,
            use_llm=False,
            use_context_hints=True,
            mark_as_required=True,
            include_defaults=False,
        )

    @staticmethod
    def balanced() -> VariableIdentificationConfig:
        """
        Balanced configuration - reasonable confidence threshold.
        Default recommended setting.
        """
        return VariableIdentificationConfig(
            min_confidence=0.6,
            use_llm=False,
            use_context_hints=True,
            mark_as_required=True,
            include_defaults=True,
        )

    @staticmethod
    def aggressive() -> VariableIdentificationConfig:
        """
        Aggressive configuration - parameterize more values.
        Use when you want maximum flexibility in workflows.
        """
        return VariableIdentificationConfig(
            min_confidence=0.4,
            use_llm=False,
            use_context_hints=True,
            mark_as_required=False,
            include_defaults=True,
        )

    @staticmethod
    def ai_assisted() -> VariableIdentificationConfig:
        """
        AI-assisted configuration - use LLM for ambiguous cases.
        Best accuracy but requires LLM access.
        """
        return VariableIdentificationConfig(
            min_confidence=0.5,
            use_llm=True,
            use_context_hints=True,
            mark_as_required=True,
            include_defaults=True,
        )

    @staticmethod
    def form_filling() -> VariableIdentificationConfig:
        """
        Optimized for form-filling workflows.
        Extends ``always_parameterize`` with common form field names.
        """
        config = VariableIdentificationConfig(
            min_confidence=0.65,
            use_context_hints=True,
            mark_as_required=True,
            include_defaults=True,
        )

        # Add common form field patterns on top of the defaults.
        config.always_parameterize.update(
            {
                'address',
                'city',
                'state',
                'zip',
                'postal',
                'country',
                'firstname',
                'lastname',
                'middlename',
                'dateofbirth',
                'gender',
                'company',
                'occupation',
            }
        )

        return config

    @staticmethod
    def ecommerce() -> VariableIdentificationConfig:
        """
        Optimized for e-commerce workflows.
        Extends ``always_parameterize`` with shopping/checkout field names.
        """
        config = VariableIdentificationConfig(
            min_confidence=0.6,
            use_context_hints=True,
            mark_as_required=False,  # Many fields optional in e-commerce
            include_defaults=True,
        )

        config.always_parameterize.update(
            {
                'product',
                'quantity',
                'size',
                'color',
                'shipping',
                'billing',
                'coupon',
                'promo',
            }
        )

        return config


# Global configuration instance (replaced wholesale by set_config/load_preset)
_global_config = VariableIdentificationConfig()


def get_config() -> VariableIdentificationConfig:
    """Return the current global configuration instance (not a copy)."""
    return _global_config


def set_config(config: VariableIdentificationConfig) -> None:
    """Replace the global configuration instance with ``config``."""
    global _global_config
    _global_config = config


def load_preset(preset_name: str) -> None:
    """
    Build a preset configuration and install it as the global configuration.

    Args:
        preset_name: One of 'strict', 'balanced', 'aggressive', 'ai_assisted',
                     'form_filling', 'ecommerce'

    Raises:
        ValueError: If ``preset_name`` is not one of the names above.
    """
    presets = {
        'strict': VariableConfigPresets.strict,
        'balanced': VariableConfigPresets.balanced,
        'aggressive': VariableConfigPresets.aggressive,
        'ai_assisted': VariableConfigPresets.ai_assisted,
        'form_filling': VariableConfigPresets.form_filling,
        'ecommerce': VariableConfigPresets.ecommerce,
    }

    if preset_name not in presets:
        raise ValueError(f"Unknown preset '{preset_name}'. Available: {list(presets.keys())}")

    # Call the factory so the global config is always a fresh instance.
    set_config(presets[preset_name]())
logger = logging.getLogger(__name__)


class VariableType(str, Enum):
    """Types of variables that can be identified."""

    STRING = 'string'
    EMAIL = 'email'
    PHONE = 'phone'
    NUMBER = 'number'
    BOOLEAN = 'boolean'
    URL = 'url'
    DATE = 'date'
    CREDIT_CARD = 'credit_card'
    SSN = 'ssn'
    ZIP_CODE = 'zip_code'
    PASSWORD = 'password'


@dataclass
class VariableCandidate:
    """A candidate value that could be parameterized as a variable."""

    value: str  # The literal value found in the workflow step
    variable_name: str
    variable_type: VariableType
    confidence: float  # 0.0 to 1.0
    context: Dict[str, Any]  # Context from the step (label, placeholder, etc.)
    suggested_default: Optional[str] = None
    description: Optional[str] = None
    required: bool = True


class VariableIdentifier:
    """
    Identifies input values that should be parameterized as variables.

    A value is checked in three stages, and the first stage that matches wins:
    1. Pattern-based detection: the value itself matches a regex (confidence 0.95)
    2. Context-based detection: a keyword appears in the step's labels,
       placeholder, name, id or description (confidence 0.85)
    3. Heuristic detection: the value "looks dynamic" (confidence 0.5)
    """

    # Regex patterns for common data types. Checked in insertion order, so an
    # earlier entry wins when several patterns match the same value.
    PATTERNS = {
        VariableType.EMAIL: r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
        VariableType.PHONE: r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$',
        VariableType.URL: r'^https?://[^\s]+$',
        VariableType.ZIP_CODE: r'^\d{5}(-\d{4})?$',
        VariableType.SSN: r'^\d{3}-?\d{2}-?\d{4}$',
        VariableType.CREDIT_CARD: r'^\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}$',
        VariableType.DATE: r'^\d{1,2}[/-]\d{1,2}[/-]\d{2,4}$|^\d{4}[/-]\d{1,2}[/-]\d{1,2}$',
        VariableType.NUMBER: r'^\d+(\.\d+)?$',
    }

    # Keywords that suggest a field should be a variable. Matched as plain
    # substrings of the combined context hints, in insertion order.
    VARIABLE_KEYWORDS = {
        'email': VariableType.EMAIL,
        'phone': VariableType.PHONE,
        'telephone': VariableType.PHONE,
        'mobile': VariableType.PHONE,
        'url': VariableType.URL,
        'website': VariableType.URL,
        'address': VariableType.STRING,
        'name': VariableType.STRING,
        'firstname': VariableType.STRING,
        'first_name': VariableType.STRING,
        'lastname': VariableType.STRING,
        'last_name': VariableType.STRING,
        'username': VariableType.STRING,
        'password': VariableType.PASSWORD,
        'ssn': VariableType.SSN,
        'social': VariableType.SSN,
        'security': VariableType.SSN,
        'zip': VariableType.ZIP_CODE,
        'postal': VariableType.ZIP_CODE,
        'card': VariableType.CREDIT_CARD,
        'credit': VariableType.CREDIT_CARD,
        'date': VariableType.DATE,
        'dob': VariableType.DATE,
        'birth': VariableType.DATE,
        'age': VariableType.NUMBER,
        'amount': VariableType.NUMBER,
        'quantity': VariableType.NUMBER,
        'price': VariableType.NUMBER,
    }

    # Values that should NOT be parameterized (common static values)
    STATIC_VALUES = {
        '',
        ' ',
        'true',
        'false',
        'yes',
        'no',
        'on',
        'off',
        '1',
        '0',
        'submit',
        'cancel',
        'ok',
    }

    # Context keys whose text is searched for VARIABLE_KEYWORDS.
    _CONTEXT_HINT_KEYS = ('target_text', 'placeholder', 'label', 'name', 'id', 'description')

    # semanticInfo key -> context key. Spelled out so the mapping is visible;
    # the resulting keys are the same ones the step context has always used.
    # NOTE(review): 'arialabel' and 'content' are stored but nothing in this
    # class reads them for detection or naming - confirm whether they should
    # also act as hints.
    _SEMANTIC_KEY_MAP = {
        'labelText': 'label',
        'placeholder': 'placeholder',
        'ariaLabel': 'arialabel',
        'name': 'name',
        'id': 'id',
        'textContent': 'content',
    }

    def __init__(self, min_confidence: float = 0.6, use_llm: bool = False):
        """
        Initialize the variable identifier.

        Args:
            min_confidence: Minimum confidence score (0-1) to accept a variable candidate
            use_llm: Whether to use LLM for ambiguous cases (stored only; no
                code in this class reads it yet)
        """
        self.min_confidence = min_confidence
        self.use_llm = use_llm
        self._seen_variables: Set[str] = set()  # Names already used in the current workflow

    def identify_variables_in_workflow(self, workflow_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Analyze a workflow and replace literal input values with placeholders.

        The input dictionary is not modified: the top-level dict, the steps
        list, every step dict and the metadata dict are copied before any
        write. Values nested deeper are shared with the input but never
        written to.

        Args:
            workflow_data: The workflow dictionary

        Returns:
            A new workflow dict. When at least one variable is identified, the
            matching step values become ``{variable_name}``, schema entries for
            the new variables are appended to any existing ``input_schema``
            list, and two ``metadata`` keys describe the run.
        """
        workflow = dict(workflow_data)

        raw_steps = workflow.get('steps')
        if not isinstance(raw_steps, list):
            # Nothing to analyze (missing or malformed steps).
            return workflow

        # Copy each step so placeholder substitution never touches the caller's data.
        steps = [dict(step) if isinstance(step, dict) else step for step in raw_steps]
        workflow['steps'] = steps

        existing_schema = workflow.get('input_schema')
        existing_entries: List[Any] = list(existing_schema) if isinstance(existing_schema, list) else []

        # Start every workflow with a clean name registry so results do not
        # depend on earlier calls, then reserve names that are already taken
        # by the existing schema or by placeholders present in the steps.
        self._seen_variables = {
            entry['name'] for entry in existing_entries if isinstance(entry, dict) and isinstance(entry.get('name'), str)
        }
        for step in steps:
            if isinstance(step, dict) and self._is_variable_placeholder(step.get('value')):
                self._seen_variables.add(str(step['value'])[1:-1])

        identified_vars: Dict[str, VariableCandidate] = {}

        for index, step in enumerate(steps):
            if not isinstance(step, dict) or step.get('type') != 'input' or 'value' not in step:
                continue

            value = step['value']

            # Only string literals can be analyzed; placeholders are already variables.
            if not isinstance(value, str) or self._is_variable_placeholder(value):
                continue

            context = self._extract_step_context(step)
            candidate = self.identify_variable(value, context)
            if candidate is None or candidate.confidence < self.min_confidence:
                continue

            var_name = self._ensure_unique_variable_name(candidate.variable_name)
            candidate.variable_name = var_name

            # Replace value with placeholder
            step['value'] = f'{{{var_name}}}'
            identified_vars[var_name] = candidate

            logger.info(
                "Identified variable '%s' (type: %s, confidence: %.2f) in step %d",
                var_name,
                candidate.variable_type.value,
                candidate.confidence,
                index,
            )

        if identified_vars:
            # Keep pre-existing schema entries; they may back placeholders that
            # were already in the workflow.
            workflow['input_schema'] = existing_entries + self._generate_input_schema(identified_vars)

            # Add metadata about variable identification (on a copy).
            metadata = workflow.get('metadata')
            metadata = dict(metadata) if isinstance(metadata, dict) else {}
            metadata['variables_auto_identified'] = True
            metadata['identified_variable_count'] = len(identified_vars)
            workflow['metadata'] = metadata

        return workflow

    def identify_variable(self, value: str, context: Dict[str, Any]) -> Optional[VariableCandidate]:
        """
        Identify if a value should be a variable.

        Args:
            value: The input value to analyze
            context: Context information (labels, placeholders, field names, etc.)

        Returns:
            VariableCandidate if the value should be parameterized, None otherwise.
            The candidate is returned regardless of ``min_confidence``; the
            caller applies the threshold.
        """
        # Non-string values (e.g. numbers loaded from YAML) cannot be analyzed.
        if not isinstance(value, str):
            return None

        # Skip empty or very short values
        if not value or len(value.strip()) < 2:
            return None

        # Skip common static values
        if value.lower().strip() in self.STATIC_VALUES:
            return None

        # Stage 1: Pattern-based detection (high confidence)
        pattern_result = self._detect_by_pattern(value)
        if pattern_result:
            var_type, confidence = pattern_result
            var_name = self._generate_variable_name(var_type, context)
            return VariableCandidate(
                value=value,
                variable_name=var_name,
                variable_type=var_type,
                confidence=confidence,
                context=context,
                suggested_default=value if confidence < 0.95 else None,
                description=self._generate_description(var_name, var_type, context),
                required=True,
            )

        # Stage 2: Context-based detection (medium confidence)
        context_result = self._detect_by_context(value, context)
        if context_result:
            var_type, confidence, var_name = context_result
            return VariableCandidate(
                value=value,
                variable_name=var_name,
                variable_type=var_type,
                confidence=confidence,
                context=context,
                suggested_default=value,
                description=self._generate_description(var_name, var_type, context),
                required=True,
            )

        # Stage 3: Heuristic detection (lower confidence)
        # If the value looks dynamic (contains mixed case, numbers, special chars)
        if self._looks_dynamic(value):
            var_name = self._generate_variable_name(VariableType.STRING, context)
            return VariableCandidate(
                value=value,
                variable_name=var_name,
                variable_type=VariableType.STRING,
                confidence=0.5,  # Below the default min_confidence of 0.6
                context=context,
                suggested_default=value,
                description=self._generate_description(var_name, VariableType.STRING, context),
                required=True,
            )

        return None

    def _detect_by_pattern(self, value: str) -> Optional[Tuple[VariableType, float]]:
        """Return ``(type, 0.95)`` for the first PATTERNS regex matching the stripped value, else None."""
        stripped = value.strip()
        for var_type, pattern in self.PATTERNS.items():
            if re.match(pattern, stripped):
                confidence = 0.95  # High confidence for pattern matches
                logger.debug("Pattern match: '%s' detected as %s", value, var_type.value)
                return (var_type, confidence)
        return None

    def _detect_by_context(self, value: str, context: Dict[str, Any]) -> Optional[Tuple[VariableType, float, str]]:
        """
        Detect variable type from the step context (labels, placeholders, etc.).

        ``value`` is accepted for signature symmetry but is not inspected.

        Returns:
            ``(type, 0.85, variable_name)`` for the first VARIABLE_KEYWORDS key
            found as a substring of the lower-cased hints, else None.
        """
        # Collect text hints, ignoring missing, empty and non-string values
        # (a context key may be present with a None value).
        hints = []
        for key in self._CONTEXT_HINT_KEYS:
            hint = context.get(key)
            if isinstance(hint, str) and hint:
                hints.append(hint.lower())

        combined_hints = ' '.join(hints)

        for keyword, var_type in self.VARIABLE_KEYWORDS.items():
            if keyword in combined_hints:
                # Generate variable name from context (with type-based fallback)
                var_name = self._generate_variable_name(var_type, context)
                confidence = 0.85  # Good confidence for context matches
                logger.debug("Context match: keyword '%s' -> %s (name: %s)", keyword, var_type.value, var_name)
                return (var_type, confidence, var_name)

        # No separate "name field" fallback is needed: every such phrase
        # ("full name", "surname", ...) contains the keyword 'name', which the
        # loop above already matches.
        return None

    def _looks_dynamic(self, value: str) -> bool:
        """
        Heuristic to determine if a value looks like dynamic user input
        rather than a static value.

        Returns True when at least two of these hold: mixed case, contains a
        digit, contains one of ``@#$%&*_-``, longer than 10 characters.
        """
        # Skip very common words
        common_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with'}
        if value.lower() in common_words:
            return False

        # Dynamic indicators
        has_mixed_case = value != value.lower() and value != value.upper()
        has_numbers = any(c.isdigit() for c in value)
        has_special = any(c in value for c in ['@', '#', '$', '%', '&', '*', '_', '-'])
        is_long = len(value) > 10

        # If it has multiple dynamic indicators, likely a variable
        score = sum([has_mixed_case, has_numbers, has_special, is_long])
        return score >= 2

    def _generate_variable_name(self, var_type: VariableType, context: Dict[str, Any]) -> str:
        """Return a name derived from the context, or a per-type default name."""
        # Try to extract from context first
        var_name = self._generate_variable_name_from_context(context, var_type)
        if var_name:
            return var_name

        # Fallback to type-based names
        type_names = {
            VariableType.EMAIL: 'email',
            VariableType.PHONE: 'phone_number',
            VariableType.URL: 'url',
            VariableType.SSN: 'social_security_number',
            VariableType.ZIP_CODE: 'zip_code',
            VariableType.CREDIT_CARD: 'credit_card_number',
            VariableType.DATE: 'date',
            VariableType.NUMBER: 'number',
            VariableType.PASSWORD: 'password',
            VariableType.STRING: 'value',
        }
        return type_names.get(var_type, 'value')

    def _generate_variable_name_from_context(self, context: Dict[str, Any], var_type: VariableType) -> Optional[str]:
        """
        Extract a variable name from the step context.

        ``var_type`` is currently unused. Returns None when no context entry
        normalizes to a name longer than one character.
        """
        # Priority: name attribute > id > label > placeholder > target_text
        for key in ['name', 'id', 'label', 'placeholder', 'target_text']:
            raw_name = context.get(key)
            if not isinstance(raw_name, str) or not raw_name:
                continue
            # Clean and normalize
            var_name = self._normalize_variable_name(raw_name)
            if var_name and len(var_name) > 1:
                return var_name

        return None

    def _normalize_variable_name(self, name: str) -> str:
        """Normalize a string to a lower-case ``[a-z0-9_]`` identifier (may return '')."""
        # Convert to lowercase
        name = name.lower()

        # Remove common prefixes (each is tried once, in this order)
        for prefix in ['input-', 'field-', 'txt-', 'input_', 'field_']:
            if name.startswith(prefix):
                name = name[len(prefix) :]

        # Replace special chars with underscore
        name = re.sub(r'[^a-z0-9_]', '_', name)

        # Remove multiple consecutive underscores
        name = re.sub(r'_+', '_', name)

        # Remove leading/trailing underscores
        name = name.strip('_')

        # Ensure it doesn't start with a number
        if name and name[0].isdigit():
            name = 'field_' + name

        return name

    def _ensure_unique_variable_name(self, base_name: str) -> str:
        """Return ``base_name``, or ``base_name_N`` (N >= 2) if already used, and record it."""
        if base_name not in self._seen_variables:
            self._seen_variables.add(base_name)
            return base_name

        # Add numeric suffix
        counter = 2
        while f'{base_name}_{counter}' in self._seen_variables:
            counter += 1

        unique_name = f'{base_name}_{counter}'
        self._seen_variables.add(unique_name)
        return unique_name

    def _extract_step_context(self, step: Dict[str, Any]) -> Dict[str, Any]:
        """
        Collect naming/detection hints from a workflow step.

        Sources, in order (later ones overwrite earlier keys): ``semanticInfo``,
        ``target_text``, ``description``, then ``name``/``id`` attribute
        selectors found in ``cssSelector``.
        """
        context: Dict[str, Any] = {}

        # Extract from semantic info (skipped when absent or not a dict)
        semantic = step.get('semanticInfo')
        if isinstance(semantic, dict):
            for source_key, context_key in self._SEMANTIC_KEY_MAP.items():
                if source_key in semantic:
                    context[context_key] = semantic[source_key]

        # Extract from step fields
        if 'target_text' in step:
            context['target_text'] = step['target_text']

        # Descriptions often contain the field label ("First Name", "Last Name", etc.)
        if 'description' in step:
            context['description'] = step['description']

        # Extract from CSS selector
        css = step.get('cssSelector')
        if isinstance(css, str) and css:
            # Try to extract name or id from attribute selectors like [name="x"]
            name_match = re.search(r'\[name=["\']([^"\']+)["\']\]', css)
            if name_match:
                context['name'] = name_match.group(1)
            id_match = re.search(r'\[id=["\']([^"\']+)["\']\]', css)
            if id_match:
                context['id'] = id_match.group(1)

        return context

    def _generate_description(self, var_name: str, var_type: VariableType, context: Dict[str, Any]) -> str:
        """Return the label, else 'Enter <placeholder>', else a title-cased name with the type."""
        # Try to use context for description
        label = context.get('label')
        if isinstance(label, str) and label:
            return label
        placeholder = context.get('placeholder')
        if isinstance(placeholder, str) and placeholder:
            return f'Enter {placeholder.lower()}'

        # Fallback to generated description
        readable_name = var_name.replace('_', ' ').title()
        return f'{readable_name} ({var_type.value})'

    def _generate_input_schema(self, variables: Dict[str, VariableCandidate]) -> List[Dict[str, Any]]:
        """
        Build one schema entry per identified variable.

        Each entry has ``name``, ``type``, ``required`` and ``default``, plus
        ``format`` for the detailed types and ``description`` when available.
        """
        # Map our detailed types to WorkflowInputSchemaDefinition types
        # Schema only supports: 'string', 'number', 'bool'
        type_mapping = {
            VariableType.STRING: 'string',
            VariableType.EMAIL: 'string',
            VariableType.PHONE: 'string',
            VariableType.URL: 'string',
            VariableType.SSN: 'string',
            VariableType.ZIP_CODE: 'string',
            VariableType.CREDIT_CARD: 'string',
            VariableType.DATE: 'string',
            VariableType.PASSWORD: 'string',
            VariableType.NUMBER: 'number',
            VariableType.BOOLEAN: 'bool',
        }

        # Format info for detailed types (used in validation/display)
        format_mapping = {
            VariableType.EMAIL: 'email',
            VariableType.PHONE: 'phone',
            VariableType.URL: 'url',
            VariableType.SSN: 'ssn',
            VariableType.ZIP_CODE: 'zip-code',
            VariableType.CREDIT_CARD: 'credit-card',
            VariableType.DATE: 'date',
            VariableType.PASSWORD: 'password',
        }

        schema = []
        for var_name, candidate in variables.items():
            entry: Dict[str, Any] = {
                'name': var_name,
                'type': type_mapping.get(candidate.variable_type, 'string'),
                'required': candidate.required,
            }

            # Add format if we have detailed type info
            if candidate.variable_type in format_mapping:
                entry['format'] = format_mapping[candidate.variable_type]

            # Add description
            if candidate.description:
                entry['description'] = candidate.description

            # Always add a default (the suggested default, else the original
            # value) so the workflow can run without user input if desired.
            # NOTE(review): this also writes recorded passwords, SSNs and card
            # numbers into the schema as defaults - confirm that is acceptable.
            entry['default'] = candidate.suggested_default or candidate.value

            schema.append(entry)

        return schema

    def _is_variable_placeholder(self, value: str) -> bool:
        """Check if a value is already a variable placeholder like {variable_name}."""
        return bool(re.match(r'^\{[a-zA-Z_][a-zA-Z0-9_]*\}$', str(value)))


# Convenience function for direct usage
def identify_variables_in_workflow(
    workflow_data: Dict[str, Any], min_confidence: float = 0.6, use_llm: bool = False
) -> Dict[str, Any]:
    """
    Identify and parameterize variables in a workflow.

    Creates a fresh ``VariableIdentifier`` for the call, so no naming state is
    shared between calls.

    Args:
        workflow_data: The workflow dictionary
        min_confidence: Minimum confidence threshold (0-1)
        use_llm: Whether to use LLM for ambiguous cases

    Returns:
        Modified copy of the workflow with variables identified and replaced
    """
    identifier = VariableIdentifier(min_confidence=min_confidence, use_llm=use_llm)
    return identifier.identify_variables_in_workflow(workflow_data)