Merged
2 changes: 1 addition & 1 deletion .github/workflows/README.md
Original file line number Diff line number Diff line change
@@ -37,7 +37,7 @@ cargo doc --workspace --no-deps

## Benchmark Automation

-The benchmarks workflow runs all 13 benchmark modules (237 benchmarks total), generates a Markdown results page, and commits it to `book/src/reference/benchmarks.md`. This triggers the docs workflow to redeploy GitHub Pages with fresh numbers.
+The benchmarks workflow runs all 13 benchmark modules (267 benchmarks total), generates a Markdown results page and an interactive dashboard, and commits them to `book/src/reference/benchmarks.md` and `book/src/reference/benchmark-dashboard.html`. This triggers the docs workflow to redeploy GitHub Pages with fresh numbers.

## License

4 changes: 2 additions & 2 deletions CHANGELOG.md
@@ -53,7 +53,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
deserialization (~15-25% fewer allocations).
- **SSE frame building uses thread-local reusable buffer** — Amortized 0
allocations per event vs previous 1 allocation per event.
-- **237 benchmarks, zero panics, zero errors** — Cleanest benchmark run in
+- **267 benchmarks, zero panics, zero errors** — Cleanest benchmark run in
project history. All 13 benchmark suites (transport, protocol, lifecycle,
concurrency, cross-language, realistic, error paths, backpressure, data
volume, memory, enterprise, production, advanced) pass with zero failures.
@@ -94,7 +94,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
for 5 additional benchmark groups based on CI analysis: `transport/payload_scaling`
(8s→10s), `concurrent/sends` (18s→30s), `realistic/payload_complexity` (10s→15s),
`realistic/connection` (10s→15s), `enterprise/client_interceptors` (8s→10s).
-All 237 benchmarks now complete within their budget on CI runners.
+All 267 benchmarks now complete within their budget on CI runners.
- **Push config benchmark per-task limit** — `production/push_config/set_roundtrip`
and `delete_roundtrip` now upsert a pre-created config instead of creating new
configs each iteration, preventing `push config limit exceeded` panics during
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -12,7 +12,7 @@ authors:
repository-code: "https://github.com/tomtom215/a2a-rust"
url: "https://github.com/tomtom215/a2a-rust"
license: Apache-2.0
-version: "0.3.0"
+version: "0.5.0"
date-released: "2026-03-19"
keywords:
- a2a
4 changes: 2 additions & 2 deletions README.md
@@ -101,7 +101,7 @@ This project aims to be the first **v1.0.0-compliant** Rust SDK for A2A. We inte

```toml
[dependencies]
-a2a-protocol-sdk = "0.4"
+a2a-protocol-sdk = "0.5"
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
```

@@ -297,7 +297,7 @@ cargo fmt --all -- --check
# Build documentation
RUSTDOCFLAGS="-D warnings" cargo doc --workspace --no-deps

-# Run benchmarks (265+ benchmarks across 13 suites — transport, protocol,
+# Run benchmarks (267 benchmarks across 13 suites — transport, protocol,
# lifecycle, concurrency, cross-language, realistic, error paths, backpressure,
# data volume, memory, enterprise, production, and advanced scenarios)
cargo bench -p a2a-benchmarks
5 changes: 5 additions & 0 deletions benches/README.md
@@ -63,14 +63,19 @@ benches/
│ ├── backpressure.rs # streaming under load
│ ├── data_volume.rs # store ops at scale
│ ├── memory_overhead.rs # heap allocation profiling
│ ├── enterprise_scenarios.rs # multi-tenant, CORS, eviction, rate limiting
│ ├── production_scenarios.rs # real-world E2E workflows
│ └── advanced_scenarios.rs # SDK capability gap coverage
├── dashboard/
│ └── template.html # Interactive dashboard HTML template
├── cross_language/
│ ├── canonical_agent_card.json # Reference AgentCard for all SDKs
│ └── canonical_send_params.json # Reference payload (256 bytes)
├── scripts/
│ ├── run_benchmarks.sh # Run all + collect results
│ ├── generate_book_page.sh # Auto-generate book/src/reference/benchmarks.md
│ ├── generate_dashboard.sh # Generate interactive dashboard from criterion data
│ ├── extract_benchmark_json.py # Extract criterion results into structured JSON
│ ├── compare_results.sh # Cross-language comparison table
│ ├── cross_language_python.sh # Python SDK runner
│ ├── cross_language_go.sh # Go SDK runner
723 changes: 493 additions & 230 deletions benches/dashboard/template.html

Large diffs are not rendered by default.

16 changes: 13 additions & 3 deletions benches/scripts/extract_benchmark_json.py
@@ -161,7 +161,7 @@ def build_dashboard_data(benchmarks: Dict[str, Dict[str, float]]) -> Dict[str, A

# -- highlights --------------------------------------------------------
highlights = {
-"serde_floor_ns": _ns(benchmarks, "protocol_type_serde/agent_card/serialize"),
+"serde_floor_ns": _ns(benchmarks, "protocol_type_serde/agent_card_serialize"),
"roundtrip_reused_ms": _ms(benchmarks, "realistic_connection/reused_client"),
"roundtrip_new_ms": _ms(benchmarks, "realistic_connection/new_client_per_request"),
"concurrent_64_sends_ms": _ms(benchmarks, "concurrent_sends/jsonrpc/64"),
@@ -335,7 +335,11 @@ }
}

# -- memory ------------------------------------------------------------
-alloc_counts = {
+# Note: These benchmarks use iter_custom() which returns wall-clock time.
+# Criterion reports the timing (ns), not allocation counts. Allocation
+# counts are verified internally via assertions. The values here are
+# timing in nanoseconds under the counting allocator overhead.
+alloc_timing = {
"task_ser": _ns(benchmarks, "memory_serialize/task_alloc_count"),
"task_de": _ns(benchmarks, "memory_deserialize/task_alloc_count"),
"agent_card_ser": _ns(benchmarks, "memory_serialize/agent_card_alloc_count"),
@@ -358,7 +362,7 @@ def build_dashboard_data(benchmarks: Dict[str, Dict[str, float]]) -> Dict[str, A
})

memory = {
-"alloc_counts": alloc_counts,
+"alloc_timing": alloc_timing,
"bytes_per_payload": bytes_per_payload,
"history_allocs": history_allocs,
}
@@ -499,6 +503,11 @@ def build_dashboard_data(benchmarks: Dict[str, Dict[str, float]]) -> Dict[str, A
"pagination_walk": pagination_walk,
}

+# -- concurrent_mixed --------------------------------------------------
+concurrent_mixed = {
+"send_then_get_ms": _ms(benchmarks, "concurrent_mixed/send_then_get"),
+}

# -- errors ------------------------------------------------------------
errors = {
"happy_path_ms": _ms(benchmarks, "errors_happy_vs_error/happy_path"),
@@ -552,6 +561,7 @@ def build_dashboard_data(benchmarks: Dict[str, Dict[str, float]]) -> Dict[str, A
"enterprise": enterprise,
"production": production,
"advanced": advanced,
+"concurrent_mixed": concurrent_mixed,
"errors": errors,
"lifecycle": lifecycle,
"all_benchmarks": all_benchmarks,
6 changes: 3 additions & 3 deletions benches/scripts/generate_book_page.sh
@@ -245,7 +245,7 @@ Stream throughput under varying event volumes and consumer speeds.
Reveals buffering and flow-control overhead that synthetic single-event tests miss.

The default broadcast channel capacity was increased from 64 to 256 events in
-v0.4.2, pushing the per-event cost inflection point from ~52 events to ~252
+v0.5.0, pushing the per-event cost inflection point from ~52 events to ~252
events. Deployments with >256 events/task should use
`EventQueueManager::with_capacity()` to set a higher value.
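The capacity/back-pressure relationship can be sketched with a plain bounded channel from the standard library — `sync_channel` stands in for the SDK's broadcast queue here (`EventQueueManager` itself is not used), so this illustrates the inflection mechanism, not the SDK's exact API:

```rust
use std::sync::mpsc::sync_channel;

/// Counts how many sends succeed before a bounded channel of `capacity`
/// applies back-pressure — the analogue of the per-event cost inflection.
fn sends_before_backpressure(capacity: usize) -> usize {
    // No receiver drains the channel, so try_send succeeds exactly
    // `capacity` times (the fast path), then fails (back-pressure).
    let (tx, _rx) = sync_channel::<usize>(capacity);
    let mut sent = 0;
    while tx.try_send(sent).is_ok() {
        sent += 1;
    }
    sent
}

fn main() {
    // Raising the default from 64 to 256 widens the fast path accordingly.
    assert_eq!(sends_before_backpressure(64), 64);
    assert_eq!(sends_before_backpressure(256), 256);
    println!("inflection tracks channel capacity");
}
```

Deployments expecting more buffered events per task than the default should size the queue above their burst peak, since crossing the boundary costs roughly an order of magnitude per event.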

@@ -375,15 +375,15 @@ tighter confidence intervals).
The `data_volume/get/100K` benchmark previously reported ~42% faster lookups
than the 1K/10K cases due to a **CPU cache warming artifact** from the large
`populate_store()` setup filling L1/L2 caches. A 4MB cache-busting step was
-added in v0.4.2 to flush caches between populate and measure, producing more
+added in v0.5.0 to flush caches between populate and measure, producing more
representative O(1) lookup times across all scales. The 1K/10K number (~450ns)
remains the representative baseline.

### Stream volume per-event cost inflection

Per-event cost inflects dramatically when events exceed the broadcast channel
capacity. The default capacity was increased from 64 to **256** events in
-v0.4.2, pushing the inflection from ~52 events to ~252 events:
+v0.5.0, pushing the inflection from ~52 events to ~252 events:

- Below capacity: ~4µs/event (fast path)
- At capacity boundary: ~53µs/event (12× jump — broadcast back-pressure)
2 changes: 1 addition & 1 deletion benches/scripts/run_benchmarks.sh
@@ -118,7 +118,7 @@ cat > "$SUMMARY_FILE" <<EOF
{
"language": "rust",
"sdk": "a2a-protocol-sdk",
-"version": "0.4.1",
+"version": "0.5.0",
"timestamp": "$TIMESTAMP",
"rust_version": "$(rustc --version)",
"platform": "$(uname -s)-$(uname -m)",
6 changes: 3 additions & 3 deletions book/src/building-agents/dispatchers.md
@@ -158,7 +158,7 @@ No web framework required — the dispatchers work directly with hyper's service
Provides bidirectional A2A communication over WebSocket. Enable with the `websocket` feature flag:

```toml
-a2a-protocol-server = { version = "0.4", features = ["websocket"] }
+a2a-protocol-server = { version = "0.5", features = ["websocket"] }
```

```rust
@@ -192,7 +192,7 @@ let addr = dispatcher.serve_with_addr("127.0.0.1:0").await?;
Routes gRPC requests to the handler via `tonic`. Enable with the `grpc` feature flag:

```toml
-a2a-protocol-server = { version = "0.4", features = ["grpc"] }
+a2a-protocol-server = { version = "0.5", features = ["grpc"] }
```

```rust
@@ -252,7 +252,7 @@ For projects already using Axum, the `axum` feature provides `A2aRouter` — an
idiomatic adapter that wraps `RequestHandler` as an `axum::Router`:

```toml
-a2a-protocol-server = { version = "0.4", features = ["axum"] }
+a2a-protocol-server = { version = "0.5", features = ["axum"] }
```

```rust
2 changes: 1 addition & 1 deletion book/src/building-agents/handler.md
@@ -77,7 +77,7 @@ let handler = RequestHandlerBuilder::new(MyExecutor)
| `with_push_sender(impl PushSender)` | None | Webhook delivery implementation |
| `with_interceptor(impl ServerInterceptor)` | Empty chain | Add a server interceptor |
| `with_executor_timeout(Duration)` | None | Timeout for executor completion |
-| `with_event_queue_capacity(usize)` | 64 | Bounded channel size per stream |
+| `with_event_queue_capacity(usize)` | 256 | Bounded channel size per stream |
| `with_max_event_size(usize)` | 16 MiB | Maximum serialized event size |
| `with_max_concurrent_streams(usize)` | Unbounded | Limit concurrent SSE streams |
| `with_event_queue_write_timeout(Duration)` | 5 seconds | Prevents executor blocking on slow clients |
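Taken together, these options compose on the builder. A minimal sketch — the method names come from the table above, while `MyExecutor` and the specific values are illustrative placeholders, not recommendations:

```rust
use std::time::Duration;

let handler = RequestHandlerBuilder::new(MyExecutor)
    .with_event_queue_capacity(512)                        // default: 256
    .with_max_event_size(16 * 1024 * 1024)                 // default: 16 MiB
    .with_max_concurrent_streams(1_024)                    // default: unbounded
    .with_event_queue_write_timeout(Duration::from_secs(5)) // default: 5 s
    .with_executor_timeout(Duration::from_secs(30))         // default: none
    .build();
```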
6 changes: 3 additions & 3 deletions book/src/building-agents/stores.md
@@ -8,7 +8,7 @@ The `TaskStore` trait defines how tasks are persisted:

```rust
pub trait TaskStore: Send + Sync + 'static {
-fn save<'a>(&'a self, task: Task)
+fn save<'a>(&'a self, task: &'a Task)
-> Pin<Box<dyn Future<Output = A2aResult<()>> + Send + 'a>>;

fn get<'a>(&'a self, id: &'a TaskId)
@@ -17,7 +17,7 @@ pub trait TaskStore: Send + Sync + 'static {
fn list<'a>(&'a self, params: &'a ListTasksParams)
-> Pin<Box<dyn Future<Output = A2aResult<TaskListResponse>> + Send + 'a>>;

-fn insert_if_absent<'a>(&'a self, task: Task)
+fn insert_if_absent<'a>(&'a self, task: &'a Task)
-> Pin<Box<dyn Future<Output = A2aResult<bool>> + Send + 'a>>;

fn delete<'a>(&'a self, id: &'a TaskId)
@@ -67,7 +67,7 @@ Enable the `sqlite` feature for a production-ready persistent store:

```toml
[dependencies]
-a2a-protocol-server = { version = "0.4", features = ["sqlite"] }
+a2a-protocol-server = { version = "0.5", features = ["sqlite"] }
```

```rust
2 changes: 1 addition & 1 deletion book/src/client/builder.md
@@ -180,7 +180,7 @@ async fn bad_pattern(url: &str) {

## gRPC Client

-> Requires the `grpc` feature: `a2a-protocol-client = { version = "0.4", features = ["grpc"] }`
+> Requires the `grpc` feature: `a2a-protocol-client = { version = "0.5", features = ["grpc"] }`

For gRPC transport, use `GrpcTransport::connect()` with `with_custom_transport()`:
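A minimal sketch of the shape this takes — `GrpcTransport::connect()` and `with_custom_transport()` are named on this page, but the builder entry point, endpoint URL, and error handling here are assumptions, not the SDK's verbatim API:

```rust
// Hypothetical sketch: connect a gRPC transport, then hand it to the
// client builder via with_custom_transport().
let transport = GrpcTransport::connect("http://localhost:50051").await?;
let client = A2aClient::builder()
    .with_custom_transport(transport)
    .build()?;
```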

8 changes: 4 additions & 4 deletions book/src/concepts/transport-layers.md
@@ -112,10 +112,10 @@ The **WebSocket** transport (`websocket` feature flag) provides a persistent bid

```toml
# Server
-a2a-protocol-server = { version = "0.4", features = ["websocket"] }
+a2a-protocol-server = { version = "0.5", features = ["websocket"] }

# Client
-a2a-protocol-client = { version = "0.4", features = ["websocket"] }
+a2a-protocol-client = { version = "0.5", features = ["websocket"] }
```

### Server
@@ -162,10 +162,10 @@ The **gRPC** transport (`grpc` feature flag) provides high-performance RPC via p

```toml
# Server
-a2a-protocol-server = { version = "0.4", features = ["grpc"] }
+a2a-protocol-server = { version = "0.5", features = ["grpc"] }

# Client
-a2a-protocol-client = { version = "0.4", features = ["grpc"] }
+a2a-protocol-client = { version = "0.5", features = ["grpc"] }
```

### Server
7 changes: 4 additions & 3 deletions book/src/deployment/cicd.md
@@ -34,10 +34,11 @@ mutant fails the build.

The **Benchmarks** workflow (`.github/workflows/benchmarks.yml`) runs on-demand (`workflow_dispatch`) and on pushes to `main` that affect benchmark or SDK code. It:

-1. Builds and runs all 13 benchmark suites (237 benchmarks total) individually via Criterion.rs
+1. Builds and runs all 13 benchmark suites (267 benchmarks total) individually via Criterion.rs
2. Auto-generates the [benchmark results page](../reference/benchmarks.md) via `benches/scripts/generate_book_page.sh`
-3. Commits the updated results page to `main` via `github-actions[bot]`
-4. Archives the full criterion HTML reports (violin plots, comparison overlays) as workflow artifacts with 30-day retention
+3. Auto-generates the [interactive benchmark dashboard](../reference/dashboard.md) via `benches/scripts/generate_dashboard.sh`
+4. Commits the updated results page and dashboard to `main` via `github-actions[bot]`
+5. Archives the full criterion HTML reports (violin plots, comparison overlays) as workflow artifacts with 30-day retention

The 13 benchmark suites cover: transport throughput (payload scaling to 1MB), protocol overhead (including `protocol/payload_scaling` isolation benchmarks for serde regression detection), task lifecycle, concurrent agents, cross-language comparison, realistic workloads, error paths, streaming and backpressure, data volume scaling (with cache-busting), memory overhead, enterprise scenarios, production scenarios, and advanced scenarios.

4 changes: 2 additions & 2 deletions book/src/deployment/production.md
@@ -144,8 +144,8 @@ Enable the `tracing` feature for structured logs:

```toml
[dependencies]
-a2a-protocol-server = { version = "0.4", features = ["tracing"] }
-tracing-subscriber = { version = "0.4", features = ["env-filter"] }
+a2a-protocol-server = { version = "0.5", features = ["tracing"] }
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
```

```rust
2 changes: 1 addition & 1 deletion book/src/deployment/testing.md
@@ -374,7 +374,7 @@ returns `true` for terminal states.

## Performance Benchmarks

-The `benches/` directory contains **237 Criterion.rs benchmarks** across 13 suites
+The `benches/` directory contains **267 Criterion.rs benchmarks** across 13 suites
measuring SDK overhead independently of agent logic:

| Suite | Coverage |
2 changes: 1 addition & 1 deletion book/src/getting-started/first-agent.md
@@ -15,7 +15,7 @@ Add dependencies to `Cargo.toml`:

```toml
[dependencies]
-a2a-protocol-sdk = "0.4"
+a2a-protocol-sdk = "0.5"
tokio = { version = "1", features = ["full"] }
uuid = { version = "1", features = ["v4"] }
```
14 changes: 7 additions & 7 deletions book/src/getting-started/installation.md
@@ -12,7 +12,7 @@ The easiest way to use a2a-rust is through the umbrella SDK crate, which re-expo

```toml
[dependencies]
-a2a-protocol-sdk = "0.4"
+a2a-protocol-sdk = "0.5"
tokio = { version = "1", features = ["full"] }
```

@@ -24,13 +24,13 @@ If you prefer fine-grained control, depend on individual crates:

```toml
# Types only (no I/O, no async runtime)
-a2a-protocol-types = "0.4"
+a2a-protocol-types = "0.5"

# Client only
-a2a-protocol-client = "0.4"
+a2a-protocol-client = "0.5"

# Server only
-a2a-protocol-server = "0.4"
+a2a-protocol-server = "0.5"
```

This is useful when:
@@ -89,11 +89,11 @@ Enable features in your `Cargo.toml`:

```toml
[dependencies]
-a2a-protocol-sdk = { version = "0.4", features = ["tracing", "signing"] }
+a2a-protocol-sdk = { version = "0.5", features = ["tracing", "signing"] }

# Or with individual crates:
-a2a-protocol-server = { version = "0.4", features = ["tracing", "sqlite"] }
-a2a-protocol-client = { version = "0.4", features = ["tls-rustls"] }
+a2a-protocol-server = { version = "0.5", features = ["tracing", "sqlite"] }
+a2a-protocol-client = { version = "0.5", features = ["tls-rustls"] }
```

## Verifying the Installation