From fbed309178c1fd7faada65b78f0919d07ab68558 Mon Sep 17 00:00:00 2001 From: Anya <75702826+anyacherniss@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:34:13 -0400 Subject: [PATCH 1/2] add image description example agent to documentation --- .../docs/examples/image-input-agents.mdx | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 docs/content/docs/examples/image-input-agents.mdx diff --git a/docs/content/docs/examples/image-input-agents.mdx b/docs/content/docs/examples/image-input-agents.mdx new file mode 100644 index 00000000..fe5ed979 --- /dev/null +++ b/docs/content/docs/examples/image-input-agents.mdx @@ -0,0 +1,103 @@ +# Creating Agents with Image Input + +## Overview + +The goal of this guide is to explain how using images as input differs from using text, provide context for why these differences exist, and show you a basic example of an agent with image input. + +## The Difference Between Text and Image Inputs + +When building agents that take an image as input, images are passed directly in the message content using the `image_url` type, not as input variables. This is because images require special handling in the message format that differs from regular text or JSON inputs. + +## Why Images Are Handled Differently + +[TODO: explain this better] + +Traditional input variables in AnotherAI are designed for text, numbers, and structured data that can be templated into prompts using Jinja2 syntax. For example, you might have `{{ user_name }}` or `{{ email_content }}` in your prompt template. 
+ +Images, however, cannot be templated this way because: +[the following is generated by Claude; not sure if it's accurate] +- They are binary data or URLs, not text that can be inserted into a string +- AI models expect images to be provided in a specific format within the message structure +- The models need to know explicitly that they're receiving image data, not text + +## Example: Image Description Agent + +Let's explore a complete example that shows how to correctly pass images to an agent: + +```python +def image_description(image_url: str) -> str: + res = openai.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "system", + "content": """You are an image description specialist who provides detailed and accurate descriptions of images. Your task is to analyze the provided images and generate a comprehensive description that captures the key elements, context, and details visible in the image(s). + + When multiple images are provided, create a unified description that covers all images. If additional information is provided, use it to inform your description and provide more context-specific details. + + Your description should be: + - Clear and concise + - Factual and objective + - Detailed enough to help someone visualize the image + - Well-structured and easy to understand""", + }, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }, + ], + ) + if not res.choices[0].message.content: + raise ValueError("No image description found") + return res.choices[0].message.content +``` + +### Breaking Down Each Part + +#### Function Signature +```python +def sassy_image_description(image_url: str) -> str: +``` +The function accepts a string parameter `image_url`. 
This should be either: +- A public URL to an image (e.g., `https://example.com/image.jpg`) +- A base64-encoded data URL (e.g., `data:image/jpeg;base64,/9j/4AAQ...`) + +#### System Message +```python +{ + "role": "system", + "content": "You are a sassy image description generator. You are given an image URL and you need to generate a sassy description of the image.", +} +``` +The system message is a regular string, just like in text-only agents. This sets the personality and task for the AI. + +#### User Message with Image +```python +{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + ], +} +``` +This is where images differ from text inputs: +- `content` is an **array** instead of a string +- Each array element has a `type` field +- For images, use `type: "image_url"` +- The URL is nested: `image_url: {"url": image_url}` + +#### Mixing Text and Images + +You can combine text and images in the same message: + +```python +{ + "role": "user", + "content": [ + {"type": "text", "text": "How many cats are in the image?"}, + {"type": "image_url", "image_url": {"url": image_url}} + ], +} +``` \ No newline at end of file From 98597d33b06b7a5eaec6926ef1c3177c5027012e Mon Sep 17 00:00:00 2001 From: Anya <75702826+anyacherniss@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:54:01 -0400 Subject: [PATCH 2/2] add input variables and images example --- .../docs/examples/image-input-agents.mdx | 93 ++++++++++++------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/docs/content/docs/examples/image-input-agents.mdx b/docs/content/docs/examples/image-input-agents.mdx index fe5ed979..d1f1a64c 100644 --- a/docs/content/docs/examples/image-input-agents.mdx +++ b/docs/content/docs/examples/image-input-agents.mdx @@ -1,4 +1,4 @@ -# Creating Agents with Image Input +# Creating Agents with Image Input ## Overview @@ -31,9 +31,7 @@ def image_description(image_url: str) -> str: messages=[ { "role": "system", - "content": """You 
are an image description specialist who provides detailed and accurate descriptions of images. Your task is to analyze the provided images and generate a comprehensive description that captures the key elements, context, and details visible in the image(s). - - When multiple images are provided, create a unified description that covers all images. If additional information is provided, use it to inform your description and provide more context-specific details. + "content": """You are an image description specialist who provides detailed and accurate descriptions of images. Your task is to analyze the provided image and generate a comprehensive description that captures the key elements, context, and details visible in the image. Your description should be: - Clear and concise @@ -54,50 +52,73 @@ def image_description(image_url: str) -> str: return res.choices[0].message.content ``` -### Breaking Down Each Part +As you can see, images differ from text inputs: +- For images, use `type: "image_url"` +- The URL is nested: `image_url: {"url": image_url}` -#### Function Signature -```python -def sassy_image_description(image_url: str) -> str: -``` -The function accepts a string parameter `image_url`. This should be either: -- A public URL to an image (e.g., `https://example.com/image.jpg`) -- A base64-encoded data URL (e.g., `data:image/jpeg;base64,/9j/4AAQ...`) +### Combining Image Input with Text -#### System Message -```python -{ - "role": "system", - "content": "You are a sassy image description generator. You are given an image URL and you need to generate a sassy description of the image.", -} -``` -The system message is a regular string, just like in text-only agents. This sets the personality and task for the AI. +#### Mixing Static Text and Images + +[TODO: confirm if this is correct] +You can combine text and image content in the same message. 
For example: -#### User Message with Image ```python { "role": "user", "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "How many cats are in the image?"}, + {"type": "image_url", "image_url": {"url": image_url}} ], } ``` -This is where images differ from text inputs: -- `content` is an **array** instead of a string -- Each array element has a `type` field -- For images, use `type: "image_url"` -- The URL is nested: `image_url: {"url": image_url}` -#### Mixing Text and Images +#### Mixing Input Variables and Images -You can combine text and images in the same message: +[TODO: confirm if this is correct] +When your input includes both images and an input variable, you can use Jinja2 templating in text content while keeping images in the structured format. For example: ```python -{ - "role": "user", - "content": [ - {"type": "text", "text": "How many cats are in the image?"}, - {"type": "image_url", "image_url": {"url": image_url}} - ], -} +class ImageQuestionAnswer(BaseModel): + answer: str + +def answer_image_question( + image_url: str, + question: str, +) -> ImageQuestionAnswer: + res = openai.beta.chat.completions.parse( + model="gpt-4o-mini", + messages=[ + { + "role": "system", + "content": """You are an image analyst who provides detailed and accurate answers to questions about images. Your task is to analyze the provided image and question about the image and generate a comprehensive answer. 
+ + Your answer should be: + - Clear and concise + - Factual and objective + - Well-structured and easy to understand""", + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "{{question}}" + }, + {"type": "image_url", "image_url": {"url": image_url}} + ] + } + ], + response_format=ImageQuestionAnswer, + extra_body={ + "input": { + "variables": { + "question": question, + } + } + }, + ) + if not res.choices[0].message.parsed: + raise ValueError("No image question answer found") + return res.choices[0].message.parsed ``` \ No newline at end of file