From 5c5ca692b80f334af94ed98bb8f08e0e9302858a Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Wed, 11 Mar 2026 23:16:32 -0400 Subject: [PATCH 1/2] Update anthropic tests only for python new cache metrics --- .../llm/anthropic/test_anthropic_llmobs.py | 76 +++++++------------ 1 file changed, 26 insertions(+), 50 deletions(-) diff --git a/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py b/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py index c74a58e9bb9..41655e07d5c 100644 --- a/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py +++ b/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py @@ -1,5 +1,5 @@ from tests.integration_frameworks.llm.utils import assert_llmobs_span_event -from utils import features, scenarios +from utils import context, features, scenarios from utils.docker_fixtures import FrameworkTestClientApi, TestAgentAPI from .utils import BaseAnthropicTest @@ -8,6 +8,24 @@ from unittest import mock import json + +def _expected_llmobs_metrics(): + """Return expected LLMObs token metrics for anthropic spans. + + Ephemeral cache TTL metrics are only emitted by the Python tracer. + """ + metrics = { + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + } + if context.library == "python": + metrics["ephemeral_1h_input_tokens"] = mock.ANY + metrics["ephemeral_5m_input_tokens"] = mock.ANY + return metrics + TOOLS = [ { "name": "get_weather", @@ -75,13 +93,7 @@ def test_create(self, test_agent: TestAgentAPI, test_client: FrameworkTestClient "max_tokens": 100, "temperature": 0.5, }, - metrics={ - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - }, + metrics=_expected_llmobs_metrics(), ) def test_create_stream_method(self, test_agent: TestAgentAPI, test_client: FrameworkTestClientApi): @@ -118,13 +130,7 @@ def test_create_stream_method(self, test_agent: TestAgentAPI, test_client: Frame "max_tokens": 100, "temperature": 0.5, }, - metrics={ - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - }, + metrics=_expected_llmobs_metrics(), ) @pytest.mark.parametrize("stream", [True, False]) @@ -175,13 +181,7 @@ def test_create_content_block(self, test_agent: TestAgentAPI, test_client: Frame "max_tokens": 100, "temperature": 0.5, }, - metrics={ - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - }, + metrics=_expected_llmobs_metrics(), ) @pytest.mark.parametrize("stream", [True, False]) @@ -312,13 +312,7 @@ def test_create_multiple_system_prompts( "max_tokens": 100, "temperature": 0.5, }, - metrics={ - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - }, + metrics=_expected_llmobs_metrics(), ) @pytest.mark.parametrize("stream", [True, False]) @@ -378,13 +372,7 @@ def test_create_with_tools(self, test_agent: TestAgentAPI, test_client: Framewor "max_tokens": 100, "temperature": 0.5, }, - metrics={ - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - }, + metrics=_expected_llmobs_metrics(), ) @pytest.mark.parametrize("stream", [True, False]) @@ -495,13 +483,7 @@ def test_create_tool_result(self, test_agent: TestAgentAPI, test_client: Framewo "max_tokens": 100, "temperature": 0.5, }, - metrics={ - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - }, + metrics=_expected_llmobs_metrics(), ) @pytest.mark.parametrize("stream", [True, False]) @@ -568,13 +550,7 @@ def test_create_redact_image_input( "max_tokens": 100, "temperature": 0.5, }, - metrics={ - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - }, + metrics=_expected_llmobs_metrics(), ) @pytest.mark.parametrize("stream", [True, False]) From a33f54bd912bbc9ea649e5cd18a339a379c6471b Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Thu, 12 Mar 2026 15:46:56 -0400 Subject: [PATCH 2/2] Revert missing_feature for python --- manifests/python.yml | 7 -- .../llm/anthropic/test_anthropic_llmobs.py | 91 +++++++++++++------ 2 files changed, 65 insertions(+), 33 deletions(-) diff --git a/manifests/python.yml b/manifests/python.yml index 1463b4b7d29..299e7898b7b 100644 --- a/manifests/python.yml +++ b/manifests/python.yml @@ -1080,15 +1080,8 @@ manifest: tests/ffe/test_exposures.py: v4.2.0-dev tests/integration_frameworks/llm/anthropic/test_anthropic_apm.py::TestAnthropicApmMessages: v3.16.0 tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages: v3.16.0 - tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create: missing_feature # ephemeral cache TTL metrics not yet released - tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_content_block: missing_feature # ephemeral cache TTL metrics not yet released tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_error: bug (MLOB-1234) - ? tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_multiple_system_prompts - : missing_feature # ephemeral cache TTL metrics not yet released - tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_redact_image_input: missing_feature # ephemeral cache TTL metrics not yet released - tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_stream_method: missing_feature # ephemeral cache TTL metrics not yet released tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_tool_result: bug (MLOB-1234) - tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py::TestAnthropicLlmObsMessages::test_create_with_tools: missing_feature # ephemeral cache TTL metrics not yet released tests/integration_frameworks/llm/google_genai/test_google_genai_apm.py::TestGoogleGenAiEmbedContent: v3.11.0 tests/integration_frameworks/llm/google_genai/test_google_genai_apm.py::TestGoogleGenAiGenerateContent: v3.11.0 tests/integration_frameworks/llm/google_genai/test_google_genai_llmobs.py::TestGoogleGenAiEmbedContent: v3.13.0 diff --git a/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py b/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py index 41655e07d5c..1bf59e370d2 100644 --- a/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py +++ b/tests/integration_frameworks/llm/anthropic/test_anthropic_llmobs.py @@ -1,5 +1,5 @@ from tests.integration_frameworks.llm.utils import assert_llmobs_span_event -from utils import context, features, scenarios +from utils import features, scenarios from utils.docker_fixtures import FrameworkTestClientApi, TestAgentAPI from .utils import BaseAnthropicTest @@ -8,24 +8,6 @@ from unittest import mock import json - -def _expected_llmobs_metrics(): - """Return expected LLMObs token metrics for anthropic spans. - - Ephemeral cache TTL metrics are only emitted by the Python tracer. - """ - metrics = { - "input_tokens": mock.ANY, - "output_tokens": mock.ANY, - "total_tokens": mock.ANY, - "cache_read_input_tokens": mock.ANY, - "cache_write_input_tokens": mock.ANY, - } - if context.library == "python": - metrics["ephemeral_1h_input_tokens"] = mock.ANY - metrics["ephemeral_5m_input_tokens"] = mock.ANY - return metrics - TOOLS = [ { "name": "get_weather", @@ -93,7 +75,15 @@ def test_create(self, test_agent: TestAgentAPI, test_client: FrameworkTestClient "max_tokens": 100, "temperature": 0.5, }, - metrics=_expected_llmobs_metrics(), + metrics={ + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + "ephemeral_1h_input_tokens": mock.ANY, + "ephemeral_5m_input_tokens": mock.ANY, + }, ) def test_create_stream_method(self, test_agent: TestAgentAPI, test_client: FrameworkTestClientApi): @@ -130,7 +120,15 @@ def test_create_stream_method(self, test_agent: TestAgentAPI, test_client: Frame "max_tokens": 100, "temperature": 0.5, }, - metrics=_expected_llmobs_metrics(), + metrics={ + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + "ephemeral_1h_input_tokens": mock.ANY, + "ephemeral_5m_input_tokens": mock.ANY, + }, ) @pytest.mark.parametrize("stream", [True, False]) @@ -181,7 +179,15 @@ def test_create_content_block(self, test_agent: TestAgentAPI, test_client: Frame "max_tokens": 100, "temperature": 0.5, }, - metrics=_expected_llmobs_metrics(), + metrics={ + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + "ephemeral_1h_input_tokens": mock.ANY, + "ephemeral_5m_input_tokens": mock.ANY, + }, ) @pytest.mark.parametrize("stream", [True, False]) @@ -312,7 +318,15 @@ def test_create_multiple_system_prompts( "max_tokens": 100, "temperature": 0.5, }, - metrics=_expected_llmobs_metrics(), + metrics={ + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + "ephemeral_1h_input_tokens": mock.ANY, + "ephemeral_5m_input_tokens": mock.ANY, + }, ) @pytest.mark.parametrize("stream", [True, False]) @@ -372,7 +386,15 @@ def test_create_with_tools(self, test_agent: TestAgentAPI, test_client: Framewor "max_tokens": 100, "temperature": 0.5, }, - metrics=_expected_llmobs_metrics(), + metrics={ + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + "ephemeral_1h_input_tokens": mock.ANY, + "ephemeral_5m_input_tokens": mock.ANY, + }, ) @pytest.mark.parametrize("stream", [True, False]) @@ -483,7 +505,15 @@ def test_create_tool_result(self, test_agent: TestAgentAPI, test_client: Framewo "max_tokens": 100, "temperature": 0.5, }, - metrics=_expected_llmobs_metrics(), + metrics={ + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + "ephemeral_1h_input_tokens": mock.ANY, + "ephemeral_5m_input_tokens": mock.ANY, + }, ) @pytest.mark.parametrize("stream", [True, False]) @@ -550,7 +580,15 @@ def test_create_redact_image_input( "max_tokens": 100, "temperature": 0.5, }, - metrics=_expected_llmobs_metrics(), + metrics={ + "input_tokens": mock.ANY, + "output_tokens": mock.ANY, + "total_tokens": mock.ANY, + "cache_read_input_tokens": mock.ANY, + "cache_write_input_tokens": mock.ANY, + "ephemeral_1h_input_tokens": mock.ANY, + "ephemeral_5m_input_tokens": mock.ANY, + }, ) @pytest.mark.parametrize("stream", [True, False]) @@ -606,4 +644,5 @@ def test_create_prompt_caching( write_span_event, read_span_event = span_events assert write_span_event["metrics"]["cache_write_input_tokens"] == 6163 + assert write_span_event["metrics"]["ephemeral_5m_input_tokens"] == 6163 assert read_span_event["metrics"]["cache_read_input_tokens"] == 6163