Skip to content

Commit 19e5698

Browse files
author
Pierre
committed
documentation for image, pdf
1 parent 9d106a4 commit 19e5698

File tree

8 files changed

+220
-41
lines changed

8 files changed

+220
-41
lines changed

.env.sample

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
WORKFLOWAI_API_URL=
1+
# Only change this URL if you are self-hosting WorkflowAI
2+
WORKFLOWAI_API_URL=https://run.workflowai.com
3+
4+
# Your WorkflowAI API key
5+
# Get your API key here: https://workflowai.com/organization/settings/api-keys
26
WORKFLOWAI_API_KEY=
37

48
# Used when running e2e tests

README.md

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# WorkflowAI Python
22

3-
A library to use WorkflowAI with Python
3+
A library to use [WorkflowAI](https://workflowai.com) with Python
44

55
## Context
66

@@ -172,6 +172,65 @@ def say_hello(input: Input) -> AsyncIterator[Run[Output]]:
172172
...
173173
```
174174

175+
### Images
176+
177+
Add images as input to an agent by using the `Image` class. When the image is passed inline, `data` should be a base64-encoded string and `content_type` its MIME type.
178+
179+
```python
180+
from workflowai.fields import Image
181+
182+
class ImageInput(BaseModel):
183+
image: Image = Field(description="The image to analyze")
184+
185+
# use base64 to include the image inline
186+
image = Image(content_type='image/jpeg', data='<base 64 encoded data>')
187+
188+
# You can also use the `url` property to pass an image URL.
189+
image = Image(url="https://example.com/image.jpg")
190+
```
191+
192+
An example of using an image as input is available in [city_identifier.py](./examples/images/city_identifier.py).
193+
194+
### Files (PDF, .txt, ...)
195+
196+
Use the `File` class to pass files as input to an agent. Different LLMs support different file types.
197+
198+
```python
199+
from workflowai.fields import File
200+
...
201+
202+
class PDFQuestionInput(BaseModel):
203+
pdf: File = Field(description="The PDF document to analyze")
204+
question: str = Field(description="The question to answer about the PDF content")
205+
206+
class PDFAnswerOutput(BaseModel):
207+
answer: str = Field(description="The answer to the question based on the PDF content")
208+
quotes: List[str] = Field(description="Relevant quotes from the PDF that support the answer")
209+
210+
@workflowai.agent(id="pdf-answer", model=Model.CLAUDE_3_5_SONNET_LATEST)
211+
async def answer_pdf_question(input: PDFQuestionInput) -> PDFAnswerOutput:
212+
"""
213+
Analyze the provided PDF document and answer the given question.
214+
Provide a clear and concise answer based on the content found in the PDF.
215+
"""
216+
...
217+
218+
pdf = File(content_type='application/pdf', data='<base 64 encoded data>')
219+
question = "What are the key findings in this report?"
220+
221+
output = await answer_pdf_question(PDFQuestionInput(pdf=pdf, question=question))
222+
# Print the answer and supporting quotes
223+
print("Answer:", output.answer)
224+
print("\nSupporting quotes:")
225+
for quote in output.quotes:
226+
print(f"- {quote}")
227+
```
228+
An example of using a PDF as input is available in [pdf_answer.py](./examples/pdf_answer.py).
229+
230+
### Audio
231+
232+
[todo]
233+
175234
### Tools
176235

177236
Tools allow enhancing an agent's capabilities by allowing it to call external functions.

examples/city_to_capital_task.py

Lines changed: 0 additions & 39 deletions
This file was deleted.
553 KB
Loading

examples/images/assets/paris.jpg

73.5 KB
Loading

examples/images/city_identifier.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from pydantic import BaseModel, Field
2+
from typing import Optional
3+
import asyncio
4+
import workflowai
5+
from workflowai.core.domain.model import Model
6+
from workflowai.fields import Image
7+
import os
8+
from dotenv import load_dotenv
9+
from workflowai import Run, WorkflowAIError
10+
11+
# Load environment variables from .env file
12+
load_dotenv()
13+
14+
class ImageInput(BaseModel):
    # Input payload for the city-identifier agent.
    # The image is built elsewhere in this script either inline
    # (`content_type` + base64 `data`) or from a `url`.
    image: Image = Field(
        description="The image to analyze",
    )
16+
17+
class ImageOutput(BaseModel):
    # Structured answer of the city-identifier agent.
    # `city` and `country` default to "" so an unidentified image still
    # yields a valid object.
    city: str = Field(
        default="",
        description="Name of the city shown in the image",
    )
    country: str = Field(
        default="",
        description="Name of the country where the city is located",
    )
    # Left as None when no confidence value is provided.
    confidence: Optional[float] = Field(default=None, description="Confidence level in the identification (0-1)")
24+
25+
@workflowai.agent(
    id="city-identifier",
    model=Model.GEMINI_1_5_FLASH_LATEST,
)
async def identify_city_from_image(input: ImageInput) -> Run[ImageOutput]:
    """
    Analyze the provided image and identify the city and country shown in it.
    If the image shows a recognizable landmark or cityscape, identify the city and country.
    If uncertain, indicate lower confidence or leave fields empty.

    Focus on:
    - Famous landmarks
    - Distinctive architecture
    - Recognizable skylines
    - Cultural elements that identify the location

    Return empty strings if the city/country cannot be determined with reasonable confidence.
    """
    # NOTE(review): there is no local implementation; presumably the
    # decorator supplies the behaviour and forwards the docstring above as
    # the agent's instructions, so its wording is left untouched - confirm.
    ...
41+
42+
async def _run_and_report(image: Image) -> bool:
    """Run the city-identifier agent on `image` and print the outcome.

    Prints the agent output, cost and latency on success. A
    `WorkflowAIError` is caught and reported on stdout instead of
    propagating.

    Returns:
        True when the run succeeded, False when an error was reported.
    """
    try:
        agent_run = await identify_city_from_image(
            ImageInput(image=image),
            use_cache="auto",
        )
    except WorkflowAIError as e:
        print(f"Failed to run task. Code: {e.error.code}. Message: {e.error.message}")
        return False

    print("\n--------\nAgent output:\n", agent_run.output, "\n--------\n")
    print(f"Cost: ${agent_run.cost_usd:.10f}")
    print(f"Latency: {agent_run.duration_seconds:.2f}s")
    return True


async def run_city_identifier():
    """Identify a city twice: from a local JPEG sent inline, then from a URL.

    The local image is read from `assets/` next to this script and sent as
    base64 data. If that first run fails, the function stops there, as the
    original did. The URL run now goes through the same error handling
    instead of raising an unhandled `WorkflowAIError`.
    """
    import base64  # kept function-local, as in the original script

    current_dir = os.path.dirname(os.path.abspath(__file__))
    image_path = os.path.join(current_dir, "assets", "new-york-city.jpg")

    with open(image_path, "rb") as image_file:
        content = base64.b64encode(image_file.read()).decode("utf-8")

    image = Image(content_type='image/jpeg', data=content)
    if not await _run_and_report(image):
        return

    # using URL for Image
    # TODO: replace with a Github URL
    image_url = "https://t4.ftcdn.net/jpg/02/96/15/35/360_F_296153501_B34baBHDkFXbl5RmzxpiOumF4LHGCvAE.jpg"
    await _run_and_report(Image(url=image_url))
76+
77+
# Script entry point: drive the async example to completion.
if __name__ == "__main__":
    asyncio.run(run_city_identifier())

examples/pdf_answer.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from pydantic import BaseModel, Field
2+
import asyncio
3+
import workflowai
4+
from workflowai.core.domain.model import Model
5+
from workflowai.fields import File
6+
import os
7+
from dotenv import load_dotenv
8+
from workflowai import Run, WorkflowAIError
9+
from typing import List
10+
11+
# Load environment variables from .env file
12+
load_dotenv()
13+
14+
class PDFQuestionInput(BaseModel):
    # Input payload for the pdf-answer agent: the document plus the
    # question to ask about it.
    pdf: File = Field(
        description="The PDF document to analyze",
    )
    question: str = Field(
        description="The question to answer about the PDF content",
    )
17+
18+
class PDFAnswerOutput(BaseModel):
    # Structured answer of the pdf-answer agent; both fields are required.
    answer: str = Field(
        description="The answer to the question based on the PDF content",
    )
    quotes: List[str] = Field(
        description="Relevant quotes from the PDF that support the answer",
    )
21+
22+
@workflowai.agent(
    id="pdf-answer",
    model=Model.CLAUDE_3_5_SONNET_LATEST,
)
async def answer_pdf_question(input: PDFQuestionInput) -> Run[PDFAnswerOutput]:
    """
    Analyze the provided PDF document and answer the given question.
    Provide a clear and concise answer based on the content found in the PDF.

    Focus on:
    - Accurate information extraction from the PDF
    - Direct and relevant answers to the question
    - Context-aware responses that consider the full document
    - Citing specific sections or pages when relevant

    If the question cannot be answered based on the PDF content,
    provide a clear explanation of why the information is not available.
    """
    # NOTE(review): there is no local implementation; presumably the
    # decorator supplies the behaviour and forwards the docstring above as
    # the agent's instructions, so its wording is left untouched - confirm.
    ...
38+
39+
async def run_pdf_answer():
    """Ask the pdf-answer agent a question about the bundled SEC form.

    Reads `pdfs/sec-form-4.pdf` next to this script, sends it inline as
    base64 data, and prints the agent output, cost and latency. A
    `WorkflowAIError` is reported on stdout and ends the function.
    """
    import base64

    here = os.path.dirname(os.path.abspath(__file__))
    pdf_path = os.path.join(here, "pdfs", "sec-form-4.pdf")

    with open(pdf_path, "rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")

    request = PDFQuestionInput(
        pdf=File(content_type='application/pdf', data=encoded),
        question="How many stocks were sold? What is the total amount in USD?",
    )

    try:
        agent_run = await answer_pdf_question(request, use_cache="auto")
    except WorkflowAIError as err:
        print(f"Failed to run task. Code: {err.error.code}. Message: {err.error.message}")
        return
    else:
        print("\n--------\nAgent output:\n", agent_run.output, "\n--------\n")
        print(f"Cost: ${agent_run.cost_usd:.10f}")
        print(f"Latency: {agent_run.duration_seconds:.2f}s")

    # Disabled variant kept for reference: pass the PDF by URL instead.
    # # using URL for PDF
    # pdf_url = "https://example.com/sample.pdf"
    # pdf = File(url=pdf_url)
    # question = "What are the key findings in the conclusion?"
    # agent_run = await answer_pdf_question(
    #     PDFQuestionInput(pdf=pdf, question=question),
    #     use_cache="auto"
    # )

    # print("\n--------\nAgent output:\n", agent_run.output, "\n--------\n")
    # print(f"Cost: ${agent_run.cost_usd:.10f}")
    # print(f"Latency: {agent_run.duration_seconds:.2f}s")
75+
76+
# Script entry point: drive the async example to completion.
if __name__ == "__main__":
    asyncio.run(run_pdf_answer())

examples/pdfs/sec-form-4.pdf

101 KB
Binary file not shown.

0 commit comments

Comments
 (0)