Skip to content

Commit f4f8495

Browse files
author
Pierre
authored
Merge pull request #58 from WorkflowAI/pierre-pii-extraction
feat(examples): add PII extraction example
2 parents 007b0ec + 54db2a5 commit f4f8495

File tree

1 file changed

+137
-0
lines changed

1 file changed

+137
-0
lines changed

examples/15_pii_extraction.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""
2+
This example demonstrates how to create an agent that extracts and redacts Personally Identifiable
3+
Information (PII) from text. It showcases:
4+
5+
1. Handling sensitive information with clear categorization
6+
2. Structured output with both redacted text and extracted PII
7+
3. Enum usage for PII categories
8+
4. Comprehensive PII detection and redaction
9+
"""
10+
11+
import asyncio
12+
from enum import Enum
13+
14+
from pydantic import BaseModel, Field
15+
16+
import workflowai
17+
from workflowai import Model
18+
19+
20+
class PIIType(str, Enum):
    """Categories of Personal Identifiable Information."""

    # The (str, Enum) mix-in makes each member compare equal to its plain
    # string value (PIIType.NAME == "NAME").
    # NOTE(review): the class docstring is kept verbatim because schema
    # generators may surface it as the enum description -- confirm before
    # rewording it.

    # Full names, first names, last names
    NAME = "NAME"
    # Email addresses
    EMAIL = "EMAIL"
    # Phone numbers, fax numbers
    PHONE = "PHONE"
    # Physical addresses, postal codes
    ADDRESS = "ADDRESS"
    # Social Security Numbers, National IDs
    SSN = "SSN"
    # Date of birth, age
    DOB = "DOB"
    # Credit card numbers, bank accounts
    FINANCIAL = "FINANCIAL"
    # Driver's license, professional licenses
    LICENSE = "LICENSE"
    # Personal URLs, social media profiles
    URL = "URL"
    # Other types of PII not covered above
    OTHER = "OTHER"
32+
33+
34+
class PIIExtraction(BaseModel):
    """Represents an extracted piece of PII with its type."""

    # One PII occurrence: the matched text, its category, and where it was
    # found in the input.
    # NOTE(review): the index convention is not stated in the field
    # descriptions. The "John Doe" example elsewhere in this file (start 8,
    # end 16) implies 0-based offsets with an exclusive end, i.e.
    # original[start_index:end_index] == text -- confirm before relying on it.
    text: str = Field(description="The extracted PII text")
    type: PIIType = Field(description="The category of PII")
    start_index: int = Field(description="Starting position in the original text")
    end_index: int = Field(description="Ending position in the original text")
40+
41+
42+
class PIIInput(BaseModel):
    """Input model for PII extraction."""

    # The single input field: the raw text to scan. The example is one string
    # built from two adjacent literals (implicit concatenation), so character
    # offsets into it run continuously across both source lines.
    text: str = Field(
        description="The text to analyze for PII",
        examples=[
            "Hi, I'm John Doe. You can reach me at john.doe@email.com or call 555-0123. "
            "My SSN is 123-45-6789 and I live at 123 Main St, Springfield, IL 62701.",
        ],
    )
51+
52+
53+
class PIIOutput(BaseModel):
    """Output model containing redacted text and extracted PII."""

    # The input text with each PII span replaced by the literal "[REDACTED]".
    redacted_text: str = Field(
        description="The original text with all PII replaced by [REDACTED]",
        examples=[
            "Hi, I'm [REDACTED]. You can reach me at [REDACTED] or call [REDACTED]. "
            "My SSN is [REDACTED] and I live at [REDACTED].",
        ],
    )
    # One entry per redacted span. The example offsets are 0-based with an
    # exclusive end and index into the PIIInput example text, so that
    # text[start_index:end_index] equals the item's "text". The list covers
    # all five spans redacted in the redacted_text example above.
    extracted_pii: list[PIIExtraction] = Field(
        description="List of extracted PII items with their types and positions",
        examples=[
            [
                {"text": "John Doe", "type": "NAME", "start_index": 8, "end_index": 16},
                {"text": "john.doe@email.com", "type": "EMAIL", "start_index": 38, "end_index": 56},
                {"text": "555-0123", "type": "PHONE", "start_index": 65, "end_index": 73},
                {"text": "123-45-6789", "type": "SSN", "start_index": 85, "end_index": 96},
                {
                    "text": "123 Main St, Springfield, IL 62701",
                    "type": "ADDRESS",
                    "start_index": 111,
                    "end_index": 145,
                },
            ],
        ],
    )
72+
73+
74+
# NOTE(review): the function body is intentionally just `...`; the
# @workflowai.agent decorator is expected to supply the implementation, and
# the docstring presumably doubles as the instructions sent to the model --
# confirm against the WorkflowAI SDK docs. The docstring is therefore kept
# verbatim.
@workflowai.agent(
    id="pii-extractor",
    model=Model.CLAUDE_3_5_SONNET_LATEST,
)
async def extract_pii(input_data: PIIInput) -> PIIOutput:
    """
    Extract and redact Personal Identifiable Information (PII) from text.

    Guidelines:
    1. Identify all instances of PII in the input text
    2. Categorize each PII instance into one of the defined types
    3. Record the exact position (start and end indices) of each PII instance
    4. Replace all PII in the text with [REDACTED]
    5. Ensure no sensitive information is left unredacted
    6. Be thorough but avoid over-redacting non-PII information
    7. When in doubt about PII type, use the OTHER category
    8. Maintain the original text structure and formatting
    9. Handle overlapping PII appropriately (e.g., name within an email)
    10. Consider context when identifying PII (e.g., distinguish between company and personal emails)
    """
    ...
95+
96+
97+
async def _run_example(heading: str, text: str) -> None:
    """Run the PII agent on *text* and print the original, redacted and extracted data.

    Args:
        heading: Title line printed before the example (leading newlines included).
        text: The raw text to send to the agent.
    """
    print(heading)
    print("-" * 50)
    result = await extract_pii.run(PIIInput(text=text))
    print("\nOriginal text:")
    print(text)
    print("\nRedacted text:")
    print(result.output.redacted_text)
    print("\nExtracted PII:")
    for pii in result.output.extracted_pii:
        # Print `.value` explicitly: formatting a (str, Enum) member in an
        # f-string yields "NAME" on Python <= 3.10 but "PIIType.NAME" on 3.11+.
        print(f"- {pii.type.value}: {pii.text} (positions {pii.start_index}-{pii.end_index})")


async def main() -> None:
    """Run the two demo inputs through the PII agent, one after the other."""
    # Example 1: Basic PII extraction
    await _run_example(
        "\nExample 1: Basic PII",
        "Hello, my name is Sarah Johnson and my email is sarah.j@example.com. "
        "You can reach me at (555) 123-4567 or visit my blog at blog.sarahj.net. "
        "I was born on 03/15/1985.",
    )

    # Example 2: Complex PII with financial and address information
    await _run_example(
        "\n\nExample 2: Complex PII",
        "Customer: David Wilson\n"
        "Card: 4532-9678-1234-5678\n"
        "Address: 789 Oak Avenue, Apt 4B\n"
        "         Boston, MA 02108\n"
        "License: MA12-345-678\n"
        "SSN: 078-05-1120",
    )
134+
135+
136+
# Script entry point: run the async demo only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    asyncio.run(main())

0 commit comments

Comments
 (0)