|
| 1 | +""" |
| 2 | +This example demonstrates how to create an agent that extracts and redacts Personally Identifiable |
| 3 | +Information (PII) from text. It showcases: |
| 4 | +
|
| 5 | +1. Handling sensitive information with clear categorization |
| 6 | +2. Structured output with both redacted text and extracted PII |
| 7 | +3. Enum usage for PII categories |
| 8 | +4. Comprehensive PII detection and redaction |
| 9 | +""" |
| 10 | + |
| 11 | +import asyncio |
| 12 | +from enum import Enum |
| 13 | + |
| 14 | +from pydantic import BaseModel, Field |
| 15 | + |
| 16 | +import workflowai |
| 17 | +from workflowai import Model |
| 18 | + |
| 19 | + |
class PIIType(str, Enum):
    """Categories of Personal Identifiable Information."""

    # Full names, first names, last names
    NAME = "NAME"
    # Email addresses
    EMAIL = "EMAIL"
    # Phone numbers, fax numbers
    PHONE = "PHONE"
    # Physical addresses, postal codes
    ADDRESS = "ADDRESS"
    # Social Security Numbers, National IDs
    SSN = "SSN"
    # Date of birth, age
    DOB = "DOB"
    # Credit card numbers, bank accounts
    FINANCIAL = "FINANCIAL"
    # Driver's license, professional licenses
    LICENSE = "LICENSE"
    # Personal URLs, social media profiles
    URL = "URL"
    # Other types of PII not covered above
    OTHER = "OTHER"
| 32 | + |
| 33 | + |
class PIIExtraction(BaseModel):
    """Represents an extracted piece of PII with its type."""

    # The PII exactly as it appears in the analyzed text.
    text: str = Field(description="The extracted PII text")
    # Category of the finding; see PIIType for what each member covers.
    type: PIIType = Field(description="The category of PII")
    # Offsets follow Python slice semantics, so original[start_index:end_index] == text.
    # Stating the convention in the description removes the inclusive/exclusive ambiguity.
    start_index: int = Field(
        description="0-based index of the first character of the PII in the original text",
    )
    end_index: int = Field(
        description="0-based index one past the last character of the PII in the original text (exclusive)",
    )
| 40 | + |
| 41 | + |
class PIIInput(BaseModel):
    """Input model for PII extraction."""

    # Free-form text handed to the agent. The example sentence contains a name, an email
    # address, a phone number, an SSN and a postal address.
    text: str = Field(
        examples=[
            (
                "Hi, I'm John Doe. You can reach me at john.doe@email.com or call 555-0123. "
                "My SSN is 123-45-6789 and I live at 123 Main St, Springfield, IL 62701."
            ),
        ],
        description="The text to analyze for PII",
    )
| 51 | + |
| 52 | + |
class PIIOutput(BaseModel):
    """Output model containing redacted text and extracted PII."""

    # Example: the PIIInput example sentence with every PII span replaced by the
    # literal token [REDACTED].
    redacted_text: str = Field(
        description="The original text with all PII replaced by [REDACTED]",
        examples=[
            "Hi, I'm [REDACTED]. You can reach me at [REDACTED] or call [REDACTED]. "
            "My SSN is [REDACTED] and I live at [REDACTED].",
        ],
    )
    # One example entry per [REDACTED] token above. Offsets are 0-based positions in the
    # PIIInput example sentence with an exclusive end, i.e. for every entry
    # example_text[start_index:end_index] == entry["text"].
    extracted_pii: list[PIIExtraction] = Field(
        description="List of extracted PII items with their types and positions",
        examples=[
            [
                {"text": "John Doe", "type": "NAME", "start_index": 8, "end_index": 16},
                {"text": "john.doe@email.com", "type": "EMAIL", "start_index": 38, "end_index": 56},
                {"text": "555-0123", "type": "PHONE", "start_index": 65, "end_index": 73},
                {"text": "123-45-6789", "type": "SSN", "start_index": 85, "end_index": 96},
                {
                    "text": "123 Main St, Springfield, IL 62701",
                    "type": "ADDRESS",
                    "start_index": 111,
                    "end_index": 145,
                },
            ],
        ],
    )
| 72 | + |
| 73 | + |
# The function has no local implementation: its body is a placeholder and the call is
# handled by the workflowai.agent decorator.
# NOTE(review): the docstring is presumably forwarded to the model as the agent's
# instructions, so treat it as runtime text — confirm before rewording it.
@workflowai.agent(id="pii-extractor", model=Model.CLAUDE_3_5_SONNET_LATEST)
async def extract_pii(input_data: PIIInput) -> PIIOutput:
    """
    Extract and redact Personal Identifiable Information (PII) from text.

    Guidelines:
    1. Identify all instances of PII in the input text
    2. Categorize each PII instance into one of the defined types
    3. Record the exact position (start and end indices) of each PII instance
    4. Replace all PII in the text with [REDACTED]
    5. Ensure no sensitive information is left unredacted
    6. Be thorough but avoid over-redacting non-PII information
    7. When in doubt about PII type, use the OTHER category
    8. Maintain the original text structure and formatting
    9. Handle overlapping PII appropriately (e.g., name within an email)
    10. Consider context when identifying PII (e.g., distinguish between company and personal emails)
    """
    ...
| 95 | + |
| 96 | + |
async def _run_example(title: str, text: str) -> None:
    """Run the PII agent on ``text`` and print the input, the redaction and each finding."""
    print(title)
    print("-" * 50)
    result = await extract_pii.run(PIIInput(text=text))
    print("\nOriginal text:")
    print(text)
    print("\nRedacted text:")
    print(result.output.redacted_text)
    print("\nExtracted PII:")
    for pii in result.output.extracted_pii:
        # Print the enum's value explicitly: formatting a (str, Enum) member in an
        # f-string gives "NAME" on Python <= 3.10 but "PIIType.NAME" on 3.11+.
        print(f"- {pii.type.value}: {pii.text} (positions {pii.start_index}-{pii.end_index})")


async def main() -> None:
    """Run the two demo extractions and print their results to stdout.

    The first sample is a sentence with a name, email, phone number, blog URL and
    date of birth. The second is a multi-line customer record with a card number,
    address, license number and SSN.
    """
    # Example 1: Basic PII extraction
    await _run_example(
        "\nExample 1: Basic PII",
        (
            "Hello, my name is Sarah Johnson and my email is sarah.j@example.com. "
            "You can reach me at (555) 123-4567 or visit my blog at blog.sarahj.net. "
            "I was born on 03/15/1985."
        ),
    )

    # Example 2: Complex PII with financial and address information
    await _run_example(
        "\n\nExample 2: Complex PII",
        (
            "Customer: David Wilson\n"
            "Card: 4532-9678-1234-5678\n"
            "Address: 789 Oak Avenue, Apt 4B\n"
            " Boston, MA 02108\n"
            "License: MA12-345-678\n"
            "SSN: 078-05-1120"
        ),
    )
| 135 | + |
# Script entry point: only when executed directly (not on import), run the async demo
# to completion via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())
0 commit comments