From fbed309178c1fd7faada65b78f0919d07ab68558 Mon Sep 17 00:00:00 2001
From: Anya <75702826+anyacherniss@users.noreply.github.com>
Date: Thu, 14 Aug 2025 15:34:13 -0400
Subject: [PATCH 1/2] add image description example agent to documentation

---
 .../docs/examples/image-input-agents.mdx      | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 docs/content/docs/examples/image-input-agents.mdx

diff --git a/docs/content/docs/examples/image-input-agents.mdx b/docs/content/docs/examples/image-input-agents.mdx
new file mode 100644
index 00000000..fe5ed979
--- /dev/null
+++ b/docs/content/docs/examples/image-input-agents.mdx
@@ -0,0 +1,103 @@
+# Creating Agents with Image Input
+
+## Overview
+
+This guide explains how passing images as input differs from passing text, gives context for why those differences exist, and shows a basic example of an agent with image input.
+
+## The Difference Between Text and Image Inputs
+
+When building agents that take an image as input, images are passed directly in the message content using the `image_url` type, not as input variables. This is because images require special handling in the message format that differs from regular text or JSON inputs.
+
+## Why Images Are Handled Differently
+
+{/* TODO: explain this better */}
+
+Traditional input variables in AnotherAI are designed for text, numbers, and structured data that can be templated into prompts using Jinja2 syntax. For example, you might have `{{ user_name }}` or `{{ email_content }}` in your prompt template.
+
+Images, however, cannot be templated this way because:
+{/* TODO(review): the following list was generated by Claude; verify that it is accurate */}
+- They are binary data or URLs, not text that can be inserted into a string
+- AI models expect images to be provided in a specific format within the message structure
+- The models need to know explicitly that they're receiving image data, not text
+
+## Example: Image Description Agent
+
+Let's explore a complete example that shows how to correctly pass images to an agent:
+
+```python
+def image_description(image_url: str) -> str:
+    res = openai.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "system",
+                "content": """You are an image description specialist who provides detailed and accurate descriptions of images. Your task is to analyze the provided images and generate a comprehensive description that captures the key elements, context, and details visible in the image(s).
+
+                When multiple images are provided, create a unified description that covers all images. If additional information is provided, use it to inform your description and provide more context-specific details.
+
+                Your description should be:
+                - Clear and concise
+                - Factual and objective
+                - Detailed enough to help someone visualize the image
+                - Well-structured and easy to understand""",
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                ],
+            },
+        ],
+    )
+    if not res.choices[0].message.content:
+        raise ValueError("No image description found")
+    return res.choices[0].message.content
+```
+
+### Breaking Down Each Part
+
+#### Function Signature
+```python
+def sassy_image_description(image_url: str) -> str:
+```
+The function accepts a string parameter `image_url`. This should be either:
+- A public URL to an image (e.g., `https://example.com/image.jpg`)
+- A base64-encoded data URL (e.g., `data:image/jpeg;base64,/9j/4AAQ...`)
+
+#### System Message
+```python
+{
+    "role": "system",
+    "content": "You are a sassy image description generator. You are given an image URL and you need to generate a sassy description of the image.",
+}
+```
+The system message is a regular string, just like in text-only agents. This sets the personality and task for the AI.
+
+#### User Message with Image
+```python
+{
+    "role": "user",
+    "content": [
+        {"type": "image_url", "image_url": {"url": image_url}},
+    ],
+}
+```
+This is where images differ from text inputs:
+- `content` is an **array** instead of a string
+- Each array element has a `type` field
+- For images, use `type: "image_url"`
+- The URL is nested: `image_url: {"url": image_url}`
+
+#### Mixing Text and Images
+
+You can combine text and images in the same message:
+
+```python
+{
+    "role": "user",
+    "content": [
+        {"type": "text", "text": "How many cats are in the image?"},
+        {"type": "image_url", "image_url": {"url": image_url}}
+    ],
+}
+```
\ No newline at end of file

From 98597d33b06b7a5eaec6926ef1c3177c5027012e Mon Sep 17 00:00:00 2001
From: Anya <75702826+anyacherniss@users.noreply.github.com>
Date: Thu, 14 Aug 2025 15:54:01 -0400
Subject: [PATCH 2/2] add input variables and images example

---
 .../docs/examples/image-input-agents.mdx      | 93 ++++++++++++-------
 1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/docs/content/docs/examples/image-input-agents.mdx b/docs/content/docs/examples/image-input-agents.mdx
index fe5ed979..d1f1a64c 100644
--- a/docs/content/docs/examples/image-input-agents.mdx
+++ b/docs/content/docs/examples/image-input-agents.mdx
@@ -1,4 +1,4 @@
-# Creating Agents with Image Input
+# Creating Agents with Image Input
 
 ## Overview
 
@@ -31,9 +31,7 @@ def image_description(image_url: str) -> str:
         messages=[
             {
                 "role": "system",
-                "content": """You are an image description specialist who provides detailed and accurate descriptions of images. Your task is to analyze the provided images and generate a comprehensive description that captures the key elements, context, and details visible in the image(s).
-
-                When multiple images are provided, create a unified description that covers all images. If additional information is provided, use it to inform your description and provide more context-specific details.
+                "content": """You are an image description specialist who provides detailed and accurate descriptions of images. Your task is to analyze the provided image and generate a comprehensive description that captures the key elements, context, and details visible in the image.
 
                 Your description should be:
                 - Clear and concise
@@ -54,50 +52,73 @@ def image_description(image_url: str) -> str:
     return res.choices[0].message.content
 ```
 
-### Breaking Down Each Part
+As you can see, images differ from text inputs. The message `content` is an array of typed parts instead of a plain string, where:
+- For images, use `type: "image_url"`
+- The URL is nested: `image_url: {"url": image_url}`
 
-#### Function Signature
-```python
-def sassy_image_description(image_url: str) -> str:
-```
-The function accepts a string parameter `image_url`. This should be either:
-- A public URL to an image (e.g., `https://example.com/image.jpg`)
-- A base64-encoded data URL (e.g., `data:image/jpeg;base64,/9j/4AAQ...`)
+### Combining Image Input with Text
 
-#### System Message
-```python
-{
-    "role": "system",
-    "content": "You are a sassy image description generator. You are given an image URL and you need to generate a sassy description of the image.",
-}
-```
-The system message is a regular string, just like in text-only agents. This sets the personality and task for the AI.
+#### Mixing Static Text and Images
+
+{/* TODO: confirm if this is correct */}
+You can combine text and image content in the same message. For example:
 
-#### User Message with Image
 ```python
 {
     "role": "user",
     "content": [
-        {"type": "image_url", "image_url": {"url": image_url}},
+        {"type": "text", "text": "How many cats are in the image?"},
+        {"type": "image_url", "image_url": {"url": image_url}}
     ],
 }
 ```
-This is where images differ from text inputs:
-- `content` is an **array** instead of a string
-- Each array element has a `type` field
-- For images, use `type: "image_url"`
-- The URL is nested: `image_url: {"url": image_url}`
 
-#### Mixing Text and Images
+#### Mixing Input Variables and Images
 
-You can combine text and images in the same message:
+{/* TODO: confirm if this is correct */}
+When your input includes both an image and input variables, you can use Jinja2 templating in the text content while keeping the image in the structured `image_url` format. For example:
 
 ```python
-{
-    "role": "user",
-    "content": [
-        {"type": "text", "text": "How many cats are in the image?"},
-        {"type": "image_url", "image_url": {"url": image_url}}
-    ],
-}
+class ImageQuestionAnswer(BaseModel):
+    answer: str
+
+def answer_image_question(
+    image_url: str,
+    question: str,
+) -> ImageQuestionAnswer:
+    res = openai.beta.chat.completions.parse(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "system",
+                "content": """You are an image analyst who provides detailed and accurate answers to questions about images. Your task is to analyze the provided image and question about the image and generate a comprehensive answer.
+
+                Your answer should be:
+                - Clear and concise
+                - Factual and objective
+                - Well-structured and easy to understand""",
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text", 
+                        "text": "{{question}}"
+                    },
+                    {"type": "image_url", "image_url": {"url": image_url}}
+                ]
+            }
+        ],
+        response_format=ImageQuestionAnswer,
+        extra_body={
+            "input": {
+                "variables": {
+                    "question": question,
+                }
+            }
+        },
+    )
+    if not res.choices[0].message.parsed:
+        raise ValueError("No image question answer found")
+    return res.choices[0].message.parsed
 ```
\ No newline at end of file