From 2a5a4ac6adaaf80b47a87eaab40a247e76fa2a39 Mon Sep 17 00:00:00 2001 From: yltw27 Date: Tue, 3 Mar 2026 15:35:00 +0800 Subject: [PATCH 1/6] Add optional OpenTelemetry tracing support for evaluations, batch processing, and async metrics - Install @opentelemetry/api as optional peer dependency for zero overhead when disabled - Implement telemetry module with dynamic import and no-op fallback pattern - Add span instrumentation across batch evaluation, single evaluations, and async metrics - Support custom evaluator instrumentation via public withSpan API - Comprehensive span hierarchy: batch.evaluate -> process_row -> run_evaluators -> evaluator.evaluate - Retry events recorded on process_row spans with attempt number and delay tracking - Token usage tracking across evaluations (input/output/total tokens) - All OpenTelemetry integration is optional - zero performance impact when package not installed - Add Telemetry documentation with setup instructions and production examples Refactored for code health: - Extracted span lifecycle management from evaluate() method into helper methods - Simplified retry logic in batch processing with context object pattern - All files maintain 10/10 code health scores with no regressions Co-Authored-By: Claude Opus 4.6 --- README.md | 27 ++ docs/TELEMETRY.md | 201 ++++++++ package.json | 11 + pnpm-lock.yaml | 99 ++++ src/batch/batch-evaluator-telemetry.spec.ts | 233 +++++++++ src/batch/batch-evaluator.ts | 500 ++++++++++++-------- src/evaluators/evaluator-telemetry.spec.ts | 134 ++++++ src/evaluators/evaluator.ts | 198 +++++--- src/index.ts | 2 + src/metrics/bert-score-telemetry.spec.ts | 92 ++++ src/metrics/bert-score.ts | 127 +++-- src/metrics/perplexity-telemetry.spec.ts | 107 +++++ src/metrics/perplexity.ts | 189 +++++--- src/telemetry-noop.spec.ts | 123 +++++ src/telemetry.spec.ts | 104 ++++ src/telemetry.ts | 180 +++++++ vite.config.ts | 6 + 17 files changed, 1960 insertions(+), 373 deletions(-) create mode 100644 docs/TELEMETRY.md create mode 100644 src/batch/batch-evaluator-telemetry.spec.ts create mode 100644 src/evaluators/evaluator-telemetry.spec.ts create mode 100644 src/metrics/bert-score-telemetry.spec.ts create mode 100644 src/metrics/perplexity-telemetry.spec.ts create mode 100644 src/telemetry-noop.spec.ts create mode 100644 src/telemetry.spec.ts create mode 100644 src/telemetry.ts diff --git a/README.md b/README.md index e0197df..d45ebde 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ A TypeScript SDK for evaluating content quality using traditional metrics and AI - **Traditional Metrics**: BLEU, TER, BERTScore, Coherence, Perplexity - **AI-Powered Evaluation**: LLM-based evaluator with prompt templating (via Vercel AI SDK) - **Batch Processing**: Concurrent execution, progress tracking, retry logic, CSV/JSON export +- **OpenTelemetry Tracing**: Optional distributed tracing with zero overhead when disabled ## Installation @@ -87,6 +88,31 @@ await batchEvaluator.export({ }); ``` +### OpenTelemetry Tracing (Optional) + +eval-kit has built-in OpenTelemetry support. Install the optional peer dependency to enable distributed tracing with your existing observability stack (Jaeger, Grafana Tempo, Datadog, etc.): + +```bash +npm install @opentelemetry/api @opentelemetry/sdk-trace-node +``` + +Configure your OTel SDK as usual — eval-kit will automatically emit spans: + +```typescript +import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import { SimpleSpanProcessor, ConsoleSpanExporter } from '@opentelemetry/sdk-trace-base'; + +// Set up OTel before using eval-kit +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(new ConsoleSpanExporter())); +provider.register(); + +// Now all eval-kit operations produce traces +const result = await batchEvaluator.evaluate({ filePath: './data.csv' }); +``` + +When `@opentelemetry/api` is not installed, all tracing is a no-op with zero overhead. See the [Telemetry Guide](./docs/TELEMETRY.md) for span details and custom evaluator instrumentation. + ## Documentation | Guide | Description | @@ -95,6 +121,7 @@ await batchEvaluator.export({ | [Evaluator](./docs/EVALUATOR.md) | AI-powered evaluation and scoring | | [Batch Evaluation](./docs/BATCH_EVALUATION_GUIDE.md) | Concurrent processing, progress tracking | | [Export](./docs/EXPORT_GUIDE.md) | CSV and JSON export options | +| [Telemetry](./docs/TELEMETRY.md) | OpenTelemetry tracing and observability | ## Supported LLM Providers diff --git a/docs/TELEMETRY.md b/docs/TELEMETRY.md new file mode 100644 index 0000000..f6e0420 --- /dev/null +++ b/docs/TELEMETRY.md @@ -0,0 +1,201 @@ +# OpenTelemetry Telemetry Guide + +eval-kit provides built-in [OpenTelemetry](https://opentelemetry.io/) tracing. When enabled, it emits spans for evaluations, batch processing, retries, and async metrics — giving you visibility into per-row latency, token usage, retry behavior, and where time is spent. + +## Setup + +### 1. Install dependencies + +`@opentelemetry/api` is an optional peer dependency. Install it along with an SDK and exporter: + +```bash +npm install @opentelemetry/api @opentelemetry/sdk-trace-node @opentelemetry/sdk-trace-base +``` + +### 2. Configure the OTel SDK + +Set up a tracer provider **before** calling any eval-kit functions: + +```typescript +import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import { SimpleSpanProcessor, ConsoleSpanExporter } from '@opentelemetry/sdk-trace-base'; + +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(new ConsoleSpanExporter())); +provider.register(); +``` + +For production, replace `ConsoleSpanExporter` with your backend exporter (Jaeger, OTLP, Zipkin, etc.). + +### 3. Use eval-kit as normal + +No code changes needed. eval-kit detects the registered provider and emits spans automatically. + +## Zero overhead when disabled + +When `@opentelemetry/api` is not installed, eval-kit uses no-op stubs internally. All tracing calls become empty function calls that are optimized away by the JS engine. There is no performance impact. + +## Span Hierarchy + +### Batch evaluation + +``` +eval-kit.batch.evaluate +├── eval-kit.batch.parse_input (file-based input only) +├── eval-kit.batch.process_row (per row) +│ ├── [retry event] (on retry attempts) +│ └── eval-kit.batch.run_evaluators +│ └── eval-kit.evaluator.evaluate (per evaluator) +└── eval-kit.batch.export (when export() is called) +``` + +### Single evaluation + +``` +eval-kit.evaluator.evaluate +``` + +### Async metrics (standalone) + +``` +eval-kit.metric.bert_score +eval-kit.metric.perplexity +``` + +## Span Attributes + +### `eval-kit.evaluator.evaluate` + +| Attribute | Type | Description | +|-----------|------|-------------| +| `eval_kit.evaluator.name` | string | Evaluator name | +| `eval_kit.model.id` | string | Model identifier | +| `eval_kit.score_config.type` | string | `"numeric"` or `"categorical"` | +| `eval_kit.input.candidate_text_length` | number | Length of input text | +| `eval_kit.result.score` | number/string | Evaluation score | +| `eval_kit.result.execution_time_ms` | number | Wall clock time | +| `eval_kit.result.token_usage.input` | number | Input tokens consumed | +| `eval_kit.result.token_usage.output` | number | Output tokens generated | +| `eval_kit.result.token_usage.total` | number | Total tokens | +| `eval_kit.result.error` | string | Error message (on failure) | + +### `eval-kit.batch.evaluate` + +| Attribute | Type | Description | +|-----------|------|-------------| +| `eval_kit.batch.id` | string | Unique batch ID | +| `eval_kit.batch.concurrency` | number | Max concurrent rows | +| `eval_kit.batch.execution_mode` | string | `"parallel"` or `"sequential"` | +| `eval_kit.batch.total_rows` | number | Total rows in input | +| `eval_kit.batch.successful_rows` | number | Rows completed successfully | +| `eval_kit.batch.failed_rows` | number | Rows that failed | + +### `eval-kit.batch.process_row` + +| Attribute | Type | Description | +|-----------|------|-------------| +| `eval_kit.row.id` | string | Row identifier | +| `eval_kit.row.index` | number | Row index in input | +| `eval_kit.row.duration_ms` | number | Total time including retries | +| `eval_kit.row.retry_count` | number | Number of retry attempts | +| `eval_kit.result.error` | string | Error message (on failure) | + +**Retry events** are recorded on this span with name `retry`: + +| Event Attribute | Type | Description | +|----------------|------|-------------| +| `eval_kit.retry.attempt` | number | Retry attempt number (1-based) | +| `eval_kit.retry.delay_ms` | number | Delay before retry | +| `eval_kit.retry.error` | string | Error that triggered the retry | + +### `eval-kit.batch.run_evaluators` + +| Attribute | Type | Description | +|-----------|------|-------------| +| `eval_kit.evaluator_count` | number | Number of evaluators | +| `eval_kit.execution_mode` | string | `"parallel"` or `"sequential"` | + +### `eval-kit.batch.parse_input` + +Only created for file-based input (not in-memory data). + +| Attribute | Type | Description | +|-----------|------|-------------| +| `eval_kit.parse.input_format` | string | `"file"` | +| `eval_kit.parse.row_count` | number | Number of parsed rows | + +### `eval-kit.batch.export` + +| Attribute | Type | Description | +|-----------|------|-------------| +| `eval_kit.export.format` | string | `"csv"` or `"json"` | +| `eval_kit.export.row_count` | number | Number of exported rows | + +### `eval-kit.metric.bert_score` / `eval-kit.metric.perplexity` + +| Attribute | Type | Description | +|-----------|------|-------------| +| `eval_kit.metric.name` | string | Metric name | +| `eval_kit.metric.model` | string | Model used | +| `eval_kit.result.score` | number | Computed score | + +A `model_loaded` event is recorded when the model is loaded for the first time (cache miss). + +## Custom Evaluator Instrumentation + +If you implement `IEvaluator` and want your spans to appear in the trace hierarchy, use the exported `withSpan` helper: + +```typescript +import { withSpan, type IEvaluator, type EvaluatorResult } from '@loveholidays/eval-kit'; + +const myEvaluator: IEvaluator = { + name: 'custom-eval', + async evaluate(input) { + return withSpan( + 'my-app.custom-eval', + { attributes: { 'my_app.evaluator.name': 'custom-eval' } }, + async (span) => { + // Your evaluation logic here + const score = await computeScore(input.candidateText); + + span.setAttribute('my_app.result.score', score); + return { + evaluatorName: 'custom-eval', + score, + feedback: 'Custom evaluation', + processingStats: { executionTime: 0 }, + }; + }, + ); + }, +}; +``` + +Your `my-app.custom-eval` span will automatically appear as a child of `eval-kit.batch.run_evaluators` when used in a batch evaluation, because `withSpan` uses the active async context. + +## Example: Full Setup with Jaeger + +```typescript +import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; +import { Resource } from '@opentelemetry/resources'; +import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-base'; +import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; +import { BatchEvaluator, Evaluator } from '@loveholidays/eval-kit'; + +// Configure OTel +const provider = new NodeTracerProvider({ + resource: new Resource({ [ATTR_SERVICE_NAME]: 'my-eval-pipeline' }), +}); +provider.addSpanProcessor( + new BatchSpanProcessor(new OTLPTraceExporter({ url: 'http://localhost:4318/v1/traces' })) +); +provider.register(); + +// Run evaluation — spans are exported to Jaeger automatically +const batch = new BatchEvaluator({ evaluators: [evaluator], concurrency: 10 }); +const result = await batch.evaluate({ filePath: './data.csv' }); + +// Flush spans before exit +await provider.shutdown(); +``` diff --git a/package.json b/package.json index a5fe3ee..ac309ed 100644 --- a/package.json +++ b/package.json @@ -63,9 +63,20 @@ "lodash": ">=4.17.23" } }, + "peerDependencies": { + "@opentelemetry/api": "^1.0.0" + }, + "peerDependenciesMeta": { + "@opentelemetry/api": { + "optional": true + } + }, "devDependencies": { "@biomejs/biome": "^2.3.7", "@changesets/cli": "^2.29.8", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/sdk-trace-base": "^1.30.0", + "@opentelemetry/sdk-trace-node": "^1.30.0", "@types/jest": "^30.0.0", "@types/node": "^24.10.1", "jest": "^30.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fdedad0..92ad58a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -37,6 +37,15 @@ importers: '@changesets/cli': specifier: ^2.29.8 version: 2.29.8(@types/node@24.10.1) + '@opentelemetry/api': + specifier: ^1.9.0 + version: 1.9.0 + '@opentelemetry/sdk-trace-base': + specifier: ^1.30.0 + version: 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-node': + specifier: ^1.30.0 + version: 1.30.1(@opentelemetry/api@1.9.0) '@types/jest': specifier: ^30.0.0 version: 30.0.0 @@ -688,6 +697,52 @@ packages: resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==} engines: {node: '>=8.0.0'} + '@opentelemetry/context-async-hooks@1.30.1': + resolution: {integrity: sha512-s5vvxXPVdjqS3kTLKMeBMvop9hbWkwzBpu+mUO2M7sZtlkyDJGwFe33wRKnbaYDo8ExRVBIIdwIGrqpxHuKttA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/core@1.30.1': + resolution: {integrity: sha512-OOCM2C/QIURhJMuKaekP3TRBxBKxG/TWWA0TL2J6nXUtDnuCtccy49LUJF8xPFXMX+0LMcxFpCo8M9cGY1W6rQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/propagator-b3@1.30.1': + resolution: {integrity: sha512-oATwWWDIJzybAZ4pO76ATN5N6FFbOA1otibAVlS8v90B4S1wClnhRUk7K+2CHAwN1JKYuj4jh/lpCEG5BAqFuQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/propagator-jaeger@1.30.1': + resolution: {integrity: sha512-Pj/BfnYEKIOImirH76M4hDaBSx6HyZ2CXUqk+Kj02m6BB80c/yo4BdWkn/1gDFfU+YPY+bPR2U0DKBfdxCKwmg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/resources@1.30.1': + resolution: {integrity: sha512-5UxZqiAgLYGFjS4s9qm5mBVo433u+dSPUFWVWXmLAD4wB65oMCoXaJP1KJa9DIYYMeHu3z4BZcStG3LC593cWA==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-base@1.30.1': + resolution: {integrity: sha512-jVPgBbH1gCy2Lb7X0AVQ8XAfgg0pJ4nvl8/IiQA6nxOsPvS+0zMJaFSs2ltXe0J6C8dqjcnpyqINDJmU30+uOg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-node@1.30.1': + resolution: {integrity: sha512-cBjYOINt1JxXdpw1e5MlHmFRc5fgj4GW/86vsKFxJCJ8AL4PdVtYH41gWwl4qd4uQjqEL1oJVrXkSy5cnduAnQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/semantic-conventions@1.28.0': + resolution: {integrity: sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==} + engines: {node: '>=14'} + '@pkgjs/parseargs@0.11.0': resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} @@ -3403,6 +3458,50 @@ snapshots: '@opentelemetry/api@1.9.0': {} + '@opentelemetry/context-async-hooks@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + + '@opentelemetry/core@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/semantic-conventions': 1.28.0 + + '@opentelemetry/propagator-b3@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + + '@opentelemetry/propagator-jaeger@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + + '@opentelemetry/resources@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.28.0 + + '@opentelemetry/sdk-trace-base@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.28.0 + + '@opentelemetry/sdk-trace-node@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/context-async-hooks': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-b3': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-jaeger': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.30.1(@opentelemetry/api@1.9.0) + semver: 7.7.3 + + '@opentelemetry/semantic-conventions@1.28.0': {} + '@pkgjs/parseargs@0.11.0': optional: true diff --git a/src/batch/batch-evaluator-telemetry.spec.ts b/src/batch/batch-evaluator-telemetry.spec.ts new file mode 100644 index 0000000..e558b97 --- /dev/null +++ b/src/batch/batch-evaluator-telemetry.spec.ts @@ -0,0 +1,233 @@ +import { beforeEach, describe, expect, it } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { SpanStatusCode, _resetTracer } from "../telemetry.js"; +import type { + EvaluationInput, + EvaluatorResult, + IEvaluator, +} from "../types/evaluator.js"; +import { BatchEvaluator } from "./batch-evaluator.js"; + +// Set up OTel SDK for span capture +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +// Helper to create a mock evaluator +const createMockEvaluator = ( + name: string, + score: number = 85, + shouldFail: boolean = false, +): IEvaluator => ({ + name, + evaluate: async (_input: EvaluationInput): Promise => { + if (shouldFail) { + throw new Error("Evaluator failed: ECONNRESET"); + } + return { + evaluatorName: name, + model: "mock-model", + score, + feedback: "Good", + processingStats: { + executionTime: 10, + tokenUsage: { inputTokens: 50, outputTokens: 10, totalTokens: 60 }, + }, + }; + }, +}); + +describe("BatchEvaluator telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + }); + + it("should create span hierarchy: batch → parse_input, process_row → run_evaluators", async () => { + const batch = new BatchEvaluator({ + evaluators: [createMockEvaluator("fluency")], + concurrency: 1, + }); + + await batch.evaluate({ + data: [{ candidateText: "Hello world" }], + }); + + const spans = exporter.getFinishedSpans(); + const spanNames = spans.map((s) => s.name).sort(); + + // No parse_input span for in-memory data (no I/O to trace) + expect(spanNames).toEqual([ + "eval-kit.batch.evaluate", + "eval-kit.batch.process_row", + "eval-kit.batch.run_evaluators", + ]); + + // Verify parent-child relationships + const batchSpan = spans.find( + (s) => s.name === "eval-kit.batch.evaluate", + )!; + const rowSpan = spans.find( + (s) => s.name === "eval-kit.batch.process_row", + )!; + const evalSpan = spans.find( + (s) => s.name === "eval-kit.batch.run_evaluators", + )!; + + expect(rowSpan.parentSpanId).toBe(batchSpan.spanContext().spanId); + expect(evalSpan.parentSpanId).toBe(rowSpan.spanContext().spanId); + }); + + it("should set correct attributes on batch span", async () => { + const batch = new BatchEvaluator({ + evaluators: [createMockEvaluator("fluency")], + concurrency: 3, + }); + + await batch.evaluate({ + data: [ + { candidateText: "Row 1" }, + { candidateText: "Row 2" }, + ], + }); + + const batchSpan = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.batch.evaluate")!; + + expect(batchSpan.attributes["eval_kit.batch.concurrency"]).toBe(3); + expect(batchSpan.attributes["eval_kit.batch.total_rows"]).toBe(2); + expect(batchSpan.attributes["eval_kit.batch.successful_rows"]).toBe(2); + expect(batchSpan.attributes["eval_kit.batch.failed_rows"]).toBe(0); + expect(batchSpan.status.code).toBe(SpanStatusCode.OK); + }); + + it("should set correct attributes on process_row span", async () => { + const batch = new BatchEvaluator({ + evaluators: [createMockEvaluator("accuracy")], + concurrency: 1, + }); + + await batch.evaluate({ + data: [{ candidateText: "Test row", id: "custom-id" }], + }); + + const rowSpan = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.batch.process_row")!; + + expect(rowSpan.attributes["eval_kit.row.id"]).toBe("custom-id"); + expect(rowSpan.attributes["eval_kit.row.index"]).toBe(0); + expect(rowSpan.attributes["eval_kit.row.retry_count"]).toBe(0); + expect(rowSpan.attributes["eval_kit.row.duration_ms"]).toBeGreaterThanOrEqual(0); + expect(rowSpan.status.code).toBe(SpanStatusCode.OK); + }); + + it("should record retry events on process_row span", async () => { + let callCount = 0; + const flaky: IEvaluator = { + name: "flaky", + evaluate: async () => { + callCount++; + if (callCount <= 2) { + throw new Error("ECONNRESET"); + } + return { + evaluatorName: "flaky", + score: 80, + feedback: "OK", + processingStats: { executionTime: 5 }, + }; + }, + }; + + const batch = new BatchEvaluator({ + evaluators: [flaky], + concurrency: 1, + retryConfig: { + maxRetries: 3, + retryDelay: 1, // 1ms for fast tests + exponentialBackoff: false, + }, + }); + + await batch.evaluate({ + data: [{ candidateText: "Test" }], + }); + + const rowSpan = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.batch.process_row")!; + + // Should have 2 retry events + const retryEvents = rowSpan.events.filter( + (e) => e.name === "retry", + ); + expect(retryEvents).toHaveLength(2); + expect(retryEvents[0].attributes!["eval_kit.retry.attempt"]).toBe(1); + expect(retryEvents[0].attributes!["eval_kit.retry.error"]).toBe( + "ECONNRESET", + ); + expect(retryEvents[1].attributes!["eval_kit.retry.attempt"]).toBe(2); + + // Final span should be OK (recovered) + expect(rowSpan.attributes["eval_kit.row.retry_count"]).toBe(2); + expect(rowSpan.status.code).toBe(SpanStatusCode.OK); + }); + + it("should set error status on process_row span when all retries exhausted", async () => { + const failing: IEvaluator = { + name: "failing", + evaluate: async () => { + throw new Error("ECONNRESET"); + }, + }; + + const batch = new BatchEvaluator({ + evaluators: [failing], + concurrency: 1, + retryConfig: { + maxRetries: 1, + retryDelay: 1, + }, + }); + + await batch.evaluate({ + data: [{ candidateText: "Test" }], + }); + + const rowSpan = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.batch.process_row")!; + + expect(rowSpan.status.code).toBe(SpanStatusCode.ERROR); + expect(rowSpan.attributes["eval_kit.result.error"]).toBe("ECONNRESET"); + expect(rowSpan.attributes["eval_kit.row.retry_count"]).toBe(1); + }); + + it("should skip parse_input span for in-memory data", async () => { + const batch = new BatchEvaluator({ + evaluators: [createMockEvaluator("test")], + concurrency: 1, + }); + + await batch.evaluate({ + data: [ + { candidateText: "Row 1" }, + { candidateText: "Row 2" }, + { candidateText: "Row 3" }, + ], + }); + + const parseSpan = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.batch.parse_input"); + + expect(parseSpan).toBeUndefined(); + }); +}); diff --git a/src/batch/batch-evaluator.ts b/src/batch/batch-evaluator.ts index 87a0117..a22c9ac 100644 --- a/src/batch/batch-evaluator.ts +++ b/src/batch/batch-evaluator.ts @@ -1,4 +1,10 @@ import { randomUUID } from "node:crypto"; +import { + type EvalKitSpan, + SpanStatusCode, + getCachedTracer, + withSpan, +} from "../telemetry.js"; import type { EvaluatorResult, IEvaluator } from "../types/evaluator.js"; import { ConcurrencyManager } from "./concurrency-manager.js"; import { CsvExporter } from "./exporters/csv-exporter.js"; @@ -43,80 +49,132 @@ export class BatchEvaluator { * Run batch evaluation on input data */ async evaluate(inputConfig: BatchInputConfig): Promise { - // Parse input - const allRows = await this.parseInput(inputConfig); - - // Handle startIndex for resuming from a specific position - const startIndex = inputConfig.startIndex ?? 0; - const rows = startIndex > 0 ? allRows.slice(startIndex) : allRows; - - // Mark rows before startIndex as already processed (for accurate progress tracking) - for (let i = 0; i < startIndex; i++) { - this.processedRowIndices.add(i); - } - - // Initialize progress tracker (use total rows for accurate percentage) - this.progressTracker = new ProgressTracker({ - totalRows: allRows.length, - emitInterval: this.config.progressInterval, - onProgress: this.config.onProgress, - }); - this.progressTracker.start(); + return withSpan( + "eval-kit.batch.evaluate", + { + attributes: { + "eval_kit.batch.id": this.batchId, + "eval_kit.batch.concurrency": this.config.concurrency ?? 5, + "eval_kit.batch.execution_mode": + this.config.evaluatorExecutionMode ?? "parallel", + }, + }, + async (span) => { + // Parse input + const allRows = await this.parseInput(inputConfig); + + span.setAttribute( + "eval_kit.batch.total_rows", + allRows.length, + ); + + // Handle startIndex for resuming from a specific position + const startIndex = inputConfig.startIndex ?? 0; + const rows = + startIndex > 0 ? allRows.slice(startIndex) : allRows; + + // Mark rows before startIndex as already processed (for accurate progress tracking) + for (let i = 0; i < startIndex; i++) { + this.processedRowIndices.add(i); + } - // Fast-forward progress tracker for skipped rows - if (startIndex > 0) { - this.progressTracker.skipRows(startIndex); - } + // Initialize progress tracker (use total rows for accurate percentage) + this.progressTracker = new ProgressTracker({ + totalRows: allRows.length, + emitInterval: this.config.progressInterval, + onProgress: this.config.onProgress, + }); + this.progressTracker.start(); + + // Fast-forward progress tracker for skipped rows + if (startIndex > 0) { + this.progressTracker.skipRows(startIndex); + } - // Process rows with controlled concurrency - // We batch into chunks to avoid creating all promises at once - const maxConcurrency = this.config.concurrency ?? 5; - const batchSize = maxConcurrency * 2; // Process in batches of 2x concurrency + // Process rows with controlled concurrency + // We batch into chunks to avoid creating all promises at once + const maxConcurrency = this.config.concurrency ?? 5; + const batchSize = maxConcurrency * 2; // Process in batches of 2x concurrency - for (let i = 0; i < rows.length; i += batchSize) { - const batch = rows.slice(i, i + batchSize); - const batchPromises = batch.map((row, batchIndex) => - // Adjust index to account for startIndex offset - this.processRow(row, startIndex + i + batchIndex), - ); - await Promise.all(batchPromises); - } - - // Mark completion - this.progressTracker.complete(); + for (let i = 0; i < rows.length; i += batchSize) { + const batch = rows.slice(i, i + batchSize); + const batchPromises = batch.map((row, batchIndex) => + // Adjust index to account for startIndex offset + this.processRow(row, startIndex + i + batchIndex), + ); + await Promise.all(batchPromises); + } - // Return results - return this.buildResult(); + // Mark completion + this.progressTracker.complete(); + + // Return results + const result = this.buildResult(); + span.setAttribute( + "eval_kit.batch.successful_rows", + result.successfulRows, + ); + span.setAttribute( + "eval_kit.batch.failed_rows", + result.failedRows, + ); + return result; + }, + ); } /** * Export results to a destination */ async export(exportConfig: BatchExportConfig): Promise { - const exporter = this.getExporter(exportConfig.format); - await exporter.export(this.results, exportConfig); + return withSpan( + "eval-kit.batch.export", + { + attributes: { + "eval_kit.export.format": exportConfig.format, + "eval_kit.export.row_count": this.results.length, + }, + }, + async () => { + const exporter = this.getExporter(exportConfig.format); + await exporter.export(this.results, exportConfig); + }, + ); } /** * Parse input - supports both file-based and in-memory data */ private async parseInput(config: BatchInputConfig): Promise { - // Check if it's in-memory data config + // In-memory data — no I/O, skip span if ("data" in config) { return config.data; } - // File-based config - const fileConfig = config as BatchInputFileConfig; - let format = fileConfig.format; + // File-based config — actual I/O worth tracing + return withSpan( + "eval-kit.batch.parse_input", + { + attributes: { + "eval_kit.parse.input_format": "file", + }, + }, + async (span) => { + const fileConfig = config as BatchInputFileConfig; + let format = fileConfig.format; - // Handle "auto" format detection - if (!format || format === "auto") { - format = this.detectFormat(fileConfig.filePath); - } + // Handle "auto" format detection + if (!format || format === "auto") { + format = this.detectFormat(fileConfig.filePath); + } - const parser = this.getParser(format); - return parser.parse(fileConfig); + const parser = this.getParser(format); + const rows = await parser.parse(fileConfig); + + span.setAttribute("eval_kit.parse.row_count", rows.length); + return rows; + }, + ); } /** @@ -169,158 +227,220 @@ export class BatchEvaluator { await this.concurrencyManager.run(async () => { const rowId = row.id ?? `row-${index}`; - const startTime = Date.now(); - let retryCount = 0; - const maxRetries = this.config.retryConfig?.maxRetries ?? 3; - - // Merge default input fields with row data - // Row data takes precedence over defaults - const inputData: BatchInputRow = { - ...this.config.defaultInput, - ...row, - }; - - // Loop allows: 1 initial attempt + maxRetries retry attempts - // With maxRetries=3: attempts at retryCount 0,1,2,3 = 4 total attempts - while (true) { - try { - // Run evaluators - const evaluatorResults = await this.runEvaluators(inputData); - - // Calculate duration - const durationMs = Date.now() - startTime; - - // Calculate total tokens used - const tokensUsed = evaluatorResults.reduce( - (sum, r) => sum + (r.processingStats.tokenUsage?.totalTokens ?? 0), - 0, - ); + const tracer = getCachedTracer(); - // Create result (use merged inputData so defaults are included) - const result: BatchEvaluationResult = { - rowId, - rowIndex: index, - input: inputData, - results: evaluatorResults, - timestamp: new Date().toISOString(), - durationMs, - retryCount, - }; - - // Call onResult callback for streaming results - if (this.config.onResult) { - const callbackResult = this.config.onResult(result); - if (callbackResult instanceof Promise) { - await callbackResult; - } - } + await tracer.startActiveSpan( + "eval-kit.batch.process_row", + { attributes: { "eval_kit.row.id": rowId, "eval_kit.row.index": index } }, + (span: EvalKitSpan) => this.runRowWithSpan(row, index, span), + ); + }); + } - // Store result in memory - this.results.push(result); - this.processedRowIndices.add(index); + private async runRowWithSpan( + row: BatchInputRow, + index: number, + span: EvalKitSpan, + ): Promise { + const startTime = Date.now(); + let spanError: string | undefined; + const inputData = { ...this.config.defaultInput, ...row }; + const rowId = row.id ?? `row-${index}`; + + try { + await this.executeRowWithRetry({ inputData, row, index, rowId, span, startTime }); + } catch (error) { + spanError = error instanceof Error ? error.message : String(error); + span.recordException(error instanceof Error ? error : spanError); + } finally { + span.setAttribute("eval_kit.row.duration_ms", Date.now() - startTime); + if (spanError) { + span.setAttribute("eval_kit.result.error", spanError); + span.setStatus({ code: SpanStatusCode.ERROR, message: spanError }); + } else { + span.setStatus({ code: SpanStatusCode.OK }); + } + span.end(); + } + } - // Update progress - this.progressTracker?.recordSuccess(durationMs, tokensUsed); + private async executeRowWithRetry(ctx: { + inputData: BatchInputRow; + row: BatchInputRow; + index: number; + rowId: string; + span: EvalKitSpan; + startTime: number; + }): Promise { + let retryCount = 0; + const maxRetries = this.config.retryConfig?.maxRetries ?? 3; + let lastError: unknown; + + while (true) { + try { + await this.executeRowEvaluation({ ...ctx, retryCount }); + ctx.span.setAttribute("eval_kit.row.retry_count", retryCount); + return; + } catch (error) { + lastError = error; + const errorCtx = { error, span: ctx.span, startTime: ctx.startTime, retryCount, maxRetries, rowId: ctx.rowId, index: ctx.index, row: ctx.row }; + const shouldRetry = await this.handleRowError(errorCtx); + if (!shouldRetry) { + ctx.span.setAttribute("eval_kit.row.retry_count", retryCount); + throw lastError; + } + retryCount++; + } + } + } - return; // Success, exit retry loop - } catch (error) { - const errorMessage = - error instanceof Error ? error.message : String(error); + private async handleRowError(ctx: { + error: unknown; + span: EvalKitSpan; + startTime: number; + retryCount: number; + maxRetries: number; + rowId: string; + index: number; + row: BatchInputRow; + }): Promise { + const errorMessage = ctx.error instanceof Error ? ctx.error.message : String(ctx.error); + const shouldRetry = this.shouldRetry(errorMessage, ctx.retryCount, ctx.maxRetries); + + if (shouldRetry) { + const nextAttempt = ctx.retryCount + 1; + this.progressTracker?.recordRetry(errorMessage, nextAttempt); + const delay = this.calculateRetryDelay(nextAttempt); + + ctx.span.addEvent("retry", { + "eval_kit.retry.attempt": nextAttempt, + "eval_kit.retry.delay_ms": delay, + "eval_kit.retry.error": errorMessage, + }); + + await this.sleep(delay); + return true; + } - // Check if we should retry - const shouldRetry = this.shouldRetry( - errorMessage, - retryCount, - maxRetries, - ); + const durationMs = Date.now() - ctx.startTime; + this.results.push({ + rowId: ctx.rowId, + rowIndex: ctx.index, + input: ctx.row, + results: [], + timestamp: new Date().toISOString(), + durationMs, + retryCount: ctx.retryCount, + error: errorMessage, + }); - if (shouldRetry) { - retryCount++; - this.progressTracker?.recordRetry(errorMessage, retryCount); - - // Calculate delay with exponential backoff - const baseDelay = this.config.retryConfig?.retryDelay ?? 1000; - const delay = this.config.retryConfig?.exponentialBackoff - ? baseDelay * 2 ** (retryCount - 1) - : baseDelay; - - await this.sleep(delay); - } else { - // Max retries reached or non-retryable error - const durationMs = Date.now() - startTime; - - const result: BatchEvaluationResult = { - rowId, - rowIndex: index, - input: row, - results: [], - timestamp: new Date().toISOString(), - durationMs, - retryCount, - error: errorMessage, - }; - - this.results.push(result); - this.processedRowIndices.add(index); - this.progressTracker?.recordFailure(durationMs); - - // Stop on error if configured - if (this.config.stopOnError) { - throw new Error( - `Stopping batch evaluation due to error: ${errorMessage}`, - ); - } - - return; // Exit retry loop - } - } + this.processedRowIndices.add(ctx.index); + this.progressTracker?.recordFailure(durationMs); + + if (this.config.stopOnError) { + throw new Error(`Stopping batch evaluation due to error: ${errorMessage}`); + } + + return false; + } + + private calculateRetryDelay(attempt: number): number { + const baseDelay = this.config.retryConfig?.retryDelay ?? 1000; + return this.config.retryConfig?.exponentialBackoff ? baseDelay * 2 ** (attempt - 1) : baseDelay; + } + + private async executeRowEvaluation(ctx: { + inputData: BatchInputRow; + index: number; + rowId: string; + startTime: number; + retryCount: number; + }): Promise { + const evaluatorResults = await this.runEvaluators(ctx.inputData); + const durationMs = Date.now() - ctx.startTime; + const tokensUsed = evaluatorResults.reduce( + (sum, r) => sum + (r.processingStats.tokenUsage?.totalTokens ?? 0), + 0, + ); + + const result: BatchEvaluationResult = { + rowId: ctx.rowId, + rowIndex: ctx.index, + input: ctx.inputData, + results: evaluatorResults, + timestamp: new Date().toISOString(), + durationMs, + retryCount: ctx.retryCount, + }; + + if (this.config.onResult) { + const callbackResult = this.config.onResult(result); + if (callbackResult instanceof Promise) { + await callbackResult; } - }); + } + + this.results.push(result); + this.processedRowIndices.add(ctx.index); + this.progressTracker?.recordSuccess(durationMs, tokensUsed); } /** * Run all evaluators on a single row */ private async runEvaluators(row: BatchInputRow): Promise { - const input = { - candidateText: row.candidateText, - prompt: row.prompt, - referenceText: row.referenceText, - sourceText: row.sourceText, - contentType: row.contentType, - language: row.language, - }; - - // Apply timeout if configured - const timeout = this.config.timeout; - const evaluateWithTimeout = async (evaluator: IEvaluator) => { - if (timeout) { - return Promise.race([ - evaluator.evaluate(input), - this.timeoutPromise( - timeout, - `Evaluator ${evaluator.name} timed out after ${timeout}ms`, + return withSpan( + "eval-kit.batch.run_evaluators", + { + attributes: { + "eval_kit.evaluator_count": this.evaluators.length, + "eval_kit.execution_mode": + this.config.evaluatorExecutionMode ?? "parallel", + }, + }, + async () => { + const input = { + candidateText: row.candidateText, + prompt: row.prompt, + referenceText: row.referenceText, + sourceText: row.sourceText, + contentType: row.contentType, + language: row.language, + }; + + // Apply timeout if configured + const timeout = this.config.timeout; + const evaluateWithTimeout = async (evaluator: IEvaluator) => { + if (timeout) { + return Promise.race([ + evaluator.evaluate(input), + this.timeoutPromise( + timeout, + `Evaluator ${evaluator.name} timed out after ${timeout}ms`, + ), + ]); + } + return evaluator.evaluate(input); + }; + + // Run evaluators in parallel or sequential mode + if (this.config.evaluatorExecutionMode === "sequential") { + const results: EvaluatorResult[] = []; + for (const evaluator of this.evaluators) { + const result = await evaluateWithTimeout(evaluator); + results.push(result as EvaluatorResult); + } + return results; + } + // Parallel mode (default) + const results = await Promise.all( + this.evaluators.map((evaluator) => + evaluateWithTimeout(evaluator), ), - ]); - } - return evaluator.evaluate(input); - }; - - // Run evaluators in parallel or sequential mode - if (this.config.evaluatorExecutionMode === "sequential") { - const results: EvaluatorResult[] = []; - for (const evaluator of this.evaluators) { - const result = await evaluateWithTimeout(evaluator); - results.push(result as EvaluatorResult); - } - return results; - } else { - // Parallel mode (default) - const results = await Promise.all( - this.evaluators.map((evaluator) => evaluateWithTimeout(evaluator)), - ); - return results as EvaluatorResult[]; - } + ); + return results as EvaluatorResult[]; + }, + ); } /** diff --git a/src/evaluators/evaluator-telemetry.spec.ts b/src/evaluators/evaluator-telemetry.spec.ts new file mode 100644 index 0000000..e045c22 --- /dev/null +++ b/src/evaluators/evaluator-telemetry.spec.ts @@ -0,0 +1,134 @@ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; +import type { LanguageModel } from "ai"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { SpanStatusCode, _resetTracer } from "../telemetry.js"; + +// Set up OTel SDK for span capture +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +// Mock the ai module +const mockGenerateObject = jest.fn(); +jest.unstable_mockModule("ai", () => ({ + generateObject: mockGenerateObject, +})); + +const { Evaluator } = await import("./evaluator.js"); + +const createMockModel = (): LanguageModel => + ({ + specificationVersion: "v1", + provider: "mock", + modelId: "mock-model-v1", + defaultObjectGenerationMode: "json", + }) as unknown as LanguageModel; + +describe("Evaluator telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + mockGenerateObject.mockClear(); + }); + + it("should create a span with correct name and initial attributes on success", async () => { + mockGenerateObject.mockResolvedValue({ + object: { score: 85, feedback: "Good quality" }, + usage: { inputTokens: 100, outputTokens: 20, totalTokens: 120 }, + }); + + const evaluator = new Evaluator({ + name: "fluency", + model: createMockModel(), + evaluationPrompt: "Rate: {{candidateText}}", + }); + + const result = await evaluator.evaluate({ + candidateText: "Hello world", + }); + + expect(result.score).toBe(85); + expect(result.error).toBeUndefined(); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + + const span = spans[0]; + expect(span.name).toBe("eval-kit.evaluator.evaluate"); + expect(span.attributes["eval_kit.evaluator.name"]).toBe("fluency"); + expect(span.attributes["eval_kit.model.id"]).toBe("mock-model-v1"); + expect(span.attributes["eval_kit.score_config.type"]).toBe("numeric"); + expect(span.attributes["eval_kit.input.candidate_text_length"]).toBe(11); + + // Completion attributes + expect(span.attributes["eval_kit.result.score"]).toBe(85); + expect(span.attributes["eval_kit.result.token_usage.input"]).toBe(100); + expect(span.attributes["eval_kit.result.token_usage.output"]).toBe(20); + expect(span.attributes["eval_kit.result.token_usage.total"]).toBe(120); + expect(span.attributes["eval_kit.result.execution_time_ms"]).toBeGreaterThanOrEqual(0); + + expect(span.status.code).toBe(SpanStatusCode.OK); + }); + + it("should record error attributes when evaluation fails", async () => { + mockGenerateObject.mockRejectedValue(new Error("API rate limited")); + + const evaluator = new Evaluator({ + name: "accuracy", + model: createMockModel(), + evaluationPrompt: "Rate: {{candidateText}}", + }); + + const result = await evaluator.evaluate({ + candidateText: "Test text", + }); + + // Evaluator catches errors and returns a result + expect(result.error).toBeDefined(); + expect(result.score).toBe(0); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + + const span = spans[0]; + expect(span.status.code).toBe(SpanStatusCode.ERROR); + expect(span.attributes["eval_kit.result.error"]).toBe("API rate limited"); + expect(span.attributes["eval_kit.result.execution_time_ms"]).toBeGreaterThanOrEqual(0); + + // Exception event recorded + expect(span.events.length).toBeGreaterThanOrEqual(1); + const exceptionEvent = span.events.find((e) => e.name === "exception"); + expect(exceptionEvent).toBeDefined(); + }); + + it("should not break existing behavior when OTel is present", async () => { + mockGenerateObject.mockResolvedValue({ + object: { score: "excellent", feedback: "Top quality" }, + usage: undefined, + }); + + const evaluator = new Evaluator({ + name: "quality", + model: createMockModel(), + evaluationPrompt: "Rate: {{candidateText}}", + scoreConfig: { + type: "categorical", + categories: ["poor", "fair", "good", "excellent"], + }, + }); + + const result = await evaluator.evaluate({ + candidateText: "Test", + }); + + expect(result.evaluatorName).toBe("quality"); + expect(result.score).toBe("excellent"); + expect(result.feedback).toBe("Top quality"); + expect(result.processingStats.executionTime).toBeGreaterThanOrEqual(0); + }); +}); diff --git a/src/evaluators/evaluator.ts b/src/evaluators/evaluator.ts index be4fb73..2737c75 100644 --- a/src/evaluators/evaluator.ts +++ b/src/evaluators/evaluator.ts @@ -1,5 +1,6 @@ import { generateObject } from "ai"; import { z } from "zod"; +import { type EvalKitSpan, getTracer, SpanStatusCode } from "../telemetry.js"; import type { EvaluationInput, EvaluatorConfig, @@ -44,66 +45,151 @@ export class Evaluator { } async evaluate(input: EvaluationInput): Promise { + const tracer = await getTracer(); + const modelId = (this.model as { modelId?: string }).modelId; + + return tracer.startActiveSpan( + "eval-kit.evaluator.evaluate", + { + attributes: { + "eval_kit.evaluator.name": this.name, + "eval_kit.model.id": modelId ?? "unknown", + "eval_kit.score_config.type": this.scoreConfig.type, + "eval_kit.input.candidate_text_length": input.candidateText.length, + }, + }, + (span: EvalKitSpan) => this.runEvaluation(input, modelId, span), + ); + } + + private async runEvaluation( + input: EvaluationInput, + modelId: string | undefined, + span: EvalKitSpan, + ): Promise { const startTime = Date.now(); + let spanError: string | undefined; + let evaluatorResult: EvaluatorResult; try { - const variables = this.prepareVariables(input); - const scoreInstructions = this.formatScoreInstructions(); - const prompt = `${this.templateRenderer.render(this.evaluationPrompt, variables)}\n\n${scoreInstructions}`; - - const schema = this.createSchema(); - - const result = (await generateObject({ - model: this.model, - schema, - prompt, - temperature: this.modelSettings?.temperature, - maxOutputTokens: this.modelSettings?.maxOutputTokens, - topP: this.modelSettings?.topP, - topK: this.modelSettings?.topK, - presencePenalty: this.modelSettings?.presencePenalty, - frequencyPenalty: this.modelSettings?.frequencyPenalty, - seed: this.modelSettings?.seed, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any)) as unknown as { - object: { score: number | string; feedback: string }; - usage: unknown; - }; - - const executionTime = Date.now() - startTime; - const tokenUsage = this.extractTokenUsage(result.usage); - - return { - evaluatorName: this.name, - model: (this.model as { modelId?: string }).modelId, - score: result.object.score, - feedback: result.object.feedback, - processingStats: { - executionTime, - tokenUsage, - }, - }; + evaluatorResult = await this.executeEvaluation(input, modelId, span); } catch (error) { - const executionTime = Date.now() - startTime; - const errorMessage = - error instanceof Error ? error.message : String(error); - const errorDetails = - error instanceof Error && "cause" in error - ? String(error.cause) - : undefined; - - return { - evaluatorName: this.name, - model: (this.model as { modelId?: string }).modelId, - score: 0, - feedback: `Evaluation failed: ${errorMessage}`, - processingStats: { - executionTime, - }, - error: errorDetails - ? `${errorMessage} (${errorDetails})` - : errorMessage, - }; + evaluatorResult = this.buildErrorResult(error, modelId, startTime); + spanError = evaluatorResult.error; + span.recordException( + error instanceof Error ? error : (spanError ?? String(error)), + ); + } finally { + span.setAttribute( + "eval_kit.result.execution_time_ms", + Date.now() - startTime, + ); + if (spanError) { + span.setAttribute("eval_kit.result.error", spanError); + span.setStatus({ code: SpanStatusCode.ERROR, message: spanError }); + } else { + span.setStatus({ code: SpanStatusCode.OK }); + } + span.end(); + } + + return evaluatorResult; + } + + private buildErrorResult( + error: unknown, + modelId: string | undefined, + startTime: number, + ): EvaluatorResult { + const errorMessage = error instanceof Error ? error.message : String(error); + const errorDetails = + error instanceof Error && "cause" in error + ? String(error.cause) + : undefined; + const fullError = errorDetails + ? `${errorMessage} (${errorDetails})` + : errorMessage; + + return { + evaluatorName: this.name, + model: modelId, + score: 0, + feedback: `Evaluation failed: ${errorMessage}`, + processingStats: { executionTime: Date.now() - startTime }, + error: fullError, + }; + } + + private async executeEvaluation( + input: EvaluationInput, + modelId: string | undefined, + span: EvalKitSpan, + ): Promise { + const startTime = Date.now(); + const prompt = this.buildPrompt(input); + const schema = this.createSchema(); + + const result = (await generateObject({ + model: this.model, + schema, + prompt, + temperature: this.modelSettings?.temperature, + maxOutputTokens: this.modelSettings?.maxOutputTokens, + topP: this.modelSettings?.topP, + topK: this.modelSettings?.topK, + presencePenalty: this.modelSettings?.presencePenalty, + frequencyPenalty: this.modelSettings?.frequencyPenalty, + seed: this.modelSettings?.seed, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } as any)) as unknown as { + object: { score: number | string; feedback: string }; + usage: unknown; + }; + + const executionTime = Date.now() - startTime; + const tokenUsage = this.extractTokenUsage(result.usage); + + this.setTokenAttributes(span, tokenUsage); + span.setAttribute("eval_kit.result.score", result.object.score); + + return { + evaluatorName: this.name, + model: modelId, + score: result.object.score, + feedback: result.object.feedback, + processingStats: { executionTime, tokenUsage }, + }; + } + + private buildPrompt(input: EvaluationInput): string { + const variables = this.prepareVariables(input); + const scoreInstructions = this.formatScoreInstructions(); + return `${this.templateRenderer.render(this.evaluationPrompt, variables)}\n\n${scoreInstructions}`; + } + + private setTokenAttributes( + span: EvalKitSpan, + tokenUsage: + | { inputTokens?: number; outputTokens?: number; totalTokens?: number } + | undefined, + ): void { + if (tokenUsage?.inputTokens !== undefined) { + span.setAttribute( + "eval_kit.result.token_usage.input", + tokenUsage.inputTokens, + ); + } + if (tokenUsage?.outputTokens !== undefined) { + span.setAttribute( + "eval_kit.result.token_usage.output", + tokenUsage.outputTokens, + ); + } + if (tokenUsage?.totalTokens !== undefined) { + span.setAttribute( + "eval_kit.result.token_usage.total", + tokenUsage.totalTokens, + ); } } diff --git a/src/index.ts b/src/index.ts index db6d222..261b34f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -83,5 +83,7 @@ export { cosineSimilarity } from "./utils/similarity.js"; // Evaluator Utilities export { TemplateRenderer } from "./utils/template-engine.js"; export { calculateIDF, calculateTFIDF } from "./utils/tfidf.js"; +// Telemetry +export { type EvalKitSpan, type WithSpanOptions, withSpan } from "./telemetry.js"; // Utilities export { tokenizeSentences, tokenizeWords } from "./utils/tokenization.js"; diff --git a/src/metrics/bert-score-telemetry.spec.ts b/src/metrics/bert-score-telemetry.spec.ts new file mode 100644 index 0000000..7285b44 --- /dev/null +++ b/src/metrics/bert-score-telemetry.spec.ts @@ -0,0 +1,92 @@ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { _resetTracer, SpanStatusCode } from "../telemetry.js"; + +// Set up OTel SDK +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +// Mock @xenova/transformers to avoid ESM/model download issues +const mockPipeline = jest.fn(); +const mockCosSim = jest.fn(); +jest.unstable_mockModule("@xenova/transformers", () => ({ + pipeline: mockPipeline, + cos_sim: mockCosSim, +})); + +const { calculateBertScore, clearBertCache } = await import("./bert-score.js"); + +describe("bert-score telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + clearBertCache(); + mockPipeline.mockClear(); + mockCosSim.mockClear(); + }); + + it("should create a span with correct name and attributes", async () => { + const mockExtractor = jest.fn().mockResolvedValue({ + data: new Float32Array([0.1, 0.2, 0.3]), + }); + mockPipeline.mockResolvedValue(mockExtractor); + mockCosSim.mockReturnValue(0.85); + + const result = await calculateBertScore("hello world", "hello there", { + model: "test-model", + scoreType: "f1", + }); + + expect(result.score).toBeGreaterThanOrEqual(0); + + const spans = exporter.getFinishedSpans(); + const span = spans.find((s) => s.name === "eval-kit.metric.bert_score"); + expect(span).toBeDefined(); + expect(span?.attributes["eval_kit.metric.name"]).toBe("bert_score"); + expect(span?.attributes["eval_kit.metric.model"]).toBe("test-model"); + expect(span?.attributes["eval_kit.metric.score_type"]).toBe("f1"); + expect(span?.attributes["eval_kit.result.score"]).toBeDefined(); + expect(span?.status.code).toBe(SpanStatusCode.OK); + }); + + it("should add model_loaded event on cache miss", async () => { + const mockExtractor = jest.fn().mockResolvedValue({ + data: new Float32Array([0.1, 0.2, 0.3]), + }); + mockPipeline.mockResolvedValue(mockExtractor); + mockCosSim.mockReturnValue(0.9); + + await calculateBertScore("a", "b", { model: "fresh-model" }); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.bert_score"); + + const loadEvent = span?.events.find((e) => e.name === "model_loaded"); + expect(loadEvent).toBeDefined(); + expect(loadEvent?.attributes?.["eval_kit.metric.model"]).toBe( + "fresh-model", + ); + }); + + it("should record error when pipeline fails", async () => { + mockPipeline.mockRejectedValue(new Error("Model download failed")); + + await expect(calculateBertScore("a", "b")).rejects.toThrow( + "Model download failed", + ); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.bert_score"); + + expect(span?.status.code).toBe(SpanStatusCode.ERROR); + expect(span?.events.some((e) => e.name === "exception")).toBe(true); + }); +}); diff --git a/src/metrics/bert-score.ts b/src/metrics/bert-score.ts index 24c250d..9e769e0 100644 --- a/src/metrics/bert-score.ts +++ b/src/metrics/bert-score.ts @@ -1,5 +1,6 @@ import type { FeatureExtractionPipeline, Tensor } from "@xenova/transformers"; import { cos_sim, pipeline } from "@xenova/transformers"; +import { withSpan } from "../telemetry.js"; import { tokenizeWords } from "../utils/tokenization.js"; export interface BertScoreOptions { @@ -77,60 +78,84 @@ export const calculateBertScore = async ( ): Promise => { const { model = "Xenova/all-MiniLM-L6-v2", scoreType = "f1" } = options; - const extractor = await getPipeline(model); - - const candidateTokens = tokenizeWords(candidate); - const referenceTokens = tokenizeWords(reference); + return withSpan( + "eval-kit.metric.bert_score", + { + attributes: { + "eval_kit.metric.name": "bert_score", + "eval_kit.metric.model": model, + "eval_kit.metric.score_type": scoreType, + }, + }, + async (span) => { + const wasCached = + cachedPipeline !== null && cachedModelName === model; + const extractor = await getPipeline(model); + if (!wasCached) { + span.addEvent("model_loaded", { + "eval_kit.metric.model": model, + }); + } - const candidateEmbeddings = await getTokenEmbeddings( - candidateTokens, - extractor, - ); - const referenceEmbeddings = await getTokenEmbeddings( - referenceTokens, - extractor, - ); + const candidateTokens = tokenizeWords(candidate); + const referenceTokens = tokenizeWords(reference); + + const candidateEmbeddings = await getTokenEmbeddings( + candidateTokens, + extractor, + ); + const referenceEmbeddings = await getTokenEmbeddings( + referenceTokens, + extractor, + ); + + const precisionSimilarities = computeMaxSimilarities( + candidateEmbeddings, + referenceEmbeddings, + ); + const recallSimilarities = computeMaxSimilarities( + referenceEmbeddings, + candidateEmbeddings, + ); + + const precision = + precisionSimilarities.length > 0 + ? precisionSimilarities.reduce( + (sum, sim) => sum + sim, + 0, + ) / precisionSimilarities.length + : 0; + + const recall = + recallSimilarities.length > 0 + ? recallSimilarities.reduce((sum, sim) => sum + sim, 0) / + recallSimilarities.length + : 0; + + const f1 = + precision + recall > 0 + ? (2 * precision * recall) / (precision + recall) + : 0; + + let finalScore = f1; + if (scoreType === "precision") { + finalScore = precision; + } else if (scoreType === "recall") { + finalScore = recall; + } - const precisionSimilarities = computeMaxSimilarities( - candidateEmbeddings, - referenceEmbeddings, + const result = { + score: Math.round(finalScore * 10000) / 100, + precision: Math.round(precision * 10000) / 100, + recall: Math.round(recall * 10000) / 100, + f1: Math.round(f1 * 10000) / 100, + modelUsed: model, + }; + + span.setAttribute("eval_kit.result.score", result.score); + return result; + }, ); - const recallSimilarities = computeMaxSimilarities( - referenceEmbeddings, - candidateEmbeddings, - ); - - const precision = - precisionSimilarities.length > 0 - ? precisionSimilarities.reduce((sum, sim) => sum + sim, 0) / - precisionSimilarities.length - : 0; - - const recall = - recallSimilarities.length > 0 - ? recallSimilarities.reduce((sum, sim) => sum + sim, 0) / - recallSimilarities.length - : 0; - - const f1 = - precision + recall > 0 - ? (2 * precision * recall) / (precision + recall) - : 0; - - let finalScore = f1; - if (scoreType === "precision") { - finalScore = precision; - } else if (scoreType === "recall") { - finalScore = recall; - } - - return { - score: Math.round(finalScore * 10000) / 100, - precision: Math.round(precision * 10000) / 100, - recall: Math.round(recall * 10000) / 100, - f1: Math.round(f1 * 10000) / 100, - modelUsed: model, - }; }; export const clearBertCache = (): void => { diff --git a/src/metrics/perplexity-telemetry.spec.ts b/src/metrics/perplexity-telemetry.spec.ts new file mode 100644 index 0000000..9aa58ce --- /dev/null +++ b/src/metrics/perplexity-telemetry.spec.ts @@ -0,0 +1,107 @@ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { _resetTracer, SpanStatusCode } from "../telemetry.js"; + +// Set up OTel SDK +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +// Mock @xenova/transformers to avoid ESM/model download issues +const mockFromPretrainedTokenizer = jest.fn(); +const mockFromPretrainedModel = jest.fn(); +jest.unstable_mockModule("@xenova/transformers", () => ({ + AutoTokenizer: { from_pretrained: mockFromPretrainedTokenizer }, + AutoModelForCausalLM: { from_pretrained: mockFromPretrainedModel }, + Tensor: class MockTensor { + constructor( + public type: string, + public data: BigInt64Array, + public dims: number[], + ) {} + }, +})); + +const { calculatePerplexity, clearPerplexityCache } = await import( + "./perplexity.js" +); + +describe("perplexity telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + clearPerplexityCache(); + mockFromPretrainedTokenizer.mockClear(); + mockFromPretrainedModel.mockClear(); + }); + + it("should create a span with correct name and attributes", async () => { + // Mock tokenizer that returns a short sequence (triggers early return) + const mockTokenizer = jest.fn().mockResolvedValue({ + input_ids: { + data: new BigInt64Array([BigInt(1)]), + }, + }); + mockFromPretrainedTokenizer.mockResolvedValue(mockTokenizer); + mockFromPretrainedModel.mockResolvedValue({}); + + const result = await calculatePerplexity("hi", { + model: "test-model", + }); + + // Short text triggers early return with perplexity 1.0 + expect(result.perplexity).toBe(1.0); + expect(result.score).toBe(100); + + const spans = exporter.getFinishedSpans(); + const span = spans.find((s) => s.name === "eval-kit.metric.perplexity"); + expect(span).toBeDefined(); + expect(span?.attributes["eval_kit.metric.name"]).toBe("perplexity"); + expect(span?.attributes["eval_kit.metric.model"]).toBe("test-model"); + expect(span?.attributes["eval_kit.result.score"]).toBe(100); + expect(span?.status.code).toBe(SpanStatusCode.OK); + }); + + it("should add model_loaded event on cache miss", async () => { + const mockTokenizer = jest.fn().mockResolvedValue({ + input_ids: { + data: new BigInt64Array([BigInt(1)]), + }, + }); + mockFromPretrainedTokenizer.mockResolvedValue(mockTokenizer); + mockFromPretrainedModel.mockResolvedValue({}); + + await calculatePerplexity("hi", { model: "fresh-model" }); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.perplexity"); + + const loadEvent = span?.events.find((e) => e.name === "model_loaded"); + expect(loadEvent).toBeDefined(); + expect(loadEvent?.attributes?.["eval_kit.metric.model"]).toBe( + "fresh-model", + ); + }); + + it("should record error when model loading fails", async () => { + mockFromPretrainedTokenizer.mockRejectedValue(new Error("Network error")); + mockFromPretrainedModel.mockRejectedValue(new Error("Network error")); + + await expect(calculatePerplexity("test text")).rejects.toThrow( + "Network error", + ); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.perplexity"); + + expect(span?.status.code).toBe(SpanStatusCode.ERROR); + expect(span?.events.some((e) => e.name === "exception")).toBe(true); + }); +}); diff --git a/src/metrics/perplexity.ts b/src/metrics/perplexity.ts index 661f9c3..1445d68 100644 --- a/src/metrics/perplexity.ts +++ b/src/metrics/perplexity.ts @@ -5,6 +5,7 @@ import { type PreTrainedTokenizer, Tensor, } from "@xenova/transformers"; +import { withSpan } from "../telemetry.js"; export interface PerplexityOptions { model?: string; @@ -90,83 +91,119 @@ export const calculatePerplexity = async ( ): Promise => { const { model: modelName = "Xenova/gpt2", stride = 512 } = options; - const { tokenizer, model } = await getModelAndTokenizer(modelName); - - const encoded = await tokenizer(text, { - return_tensors: "pt", - truncation: false, - add_special_tokens: true, - }); - - const inputIds = Array.from(encoded.input_ids.data as BigInt64Array).map( - (x) => Number(x), + return withSpan( + "eval-kit.metric.perplexity", + { + attributes: { + "eval_kit.metric.name": "perplexity", + "eval_kit.metric.model": modelName, + }, + }, + async (span) => { + const wasCached = + cachedModel !== null && cachedModelName === modelName; + const { tokenizer, model } = + await getModelAndTokenizer(modelName); + if (!wasCached) { + span.addEvent("model_loaded", { + "eval_kit.metric.model": modelName, + }); + } + + const encoded = await tokenizer(text, { + return_tensors: "pt", + truncation: false, + add_special_tokens: true, + }); + + const inputIds = Array.from( + encoded.input_ids.data as BigInt64Array, + ).map((x) => Number(x)); + + if (inputIds.length <= 1) { + span.setAttribute("eval_kit.metric.token_count", inputIds.length); + span.setAttribute("eval_kit.result.score", 100); + return { + perplexity: 1.0, + score: 100, + tokenCount: inputIds.length, + averageLogProb: 0, + modelUsed: modelName, + feedback: + "Text too short to calculate perplexity meaningfully", + }; + } + + let totalLogProb = 0; + let totalTokens = 0; + + for (let i = 0; i < inputIds.length; i += stride) { + const end = Math.min(i + stride + 1, inputIds.length); + const batch = inputIds.slice(i, end); + + if (batch.length <= 1) continue; + + const batchTensor = { + input_ids: new Tensor( + "int64", + new BigInt64Array(batch.map((x) => BigInt(x))), + [1, batch.length], + ), + }; + + const outputs = await model(batchTensor); + const logits = outputs.logits; + + if (!logits || !logits.data) continue; + + // Get vocab size from logits shape: [batch_size, seq_len, vocab_size] + const vocabSize = logits.dims?.[2] || 50257; + const logitsArray = Array.from( + logits.data as Float32Array, + ); + + for (let j = 1; j < batch.length; j++) { + const startIdx = (j - 1) * vocabSize; + const endIdx = startIdx + vocabSize; + const prevLogits = logitsArray.slice(startIdx, endIdx); + + const probs = softmax(prevLogits); + const targetTokenId = batch[j]; + const prob = probs[targetTokenId] || 1e-10; + + totalLogProb += Math.log(prob); + totalTokens++; + } + + if (end >= inputIds.length) break; + } + + const averageLogProb = + totalTokens > 0 ? totalLogProb / totalTokens : 0; + const perplexity = Math.exp(-averageLogProb); + const score = normalizePerplexityToScore(perplexity); + + span.setAttribute("eval_kit.metric.token_count", totalTokens); + span.setAttribute( + "eval_kit.result.perplexity", + Math.round(perplexity * 100) / 100, + ); + span.setAttribute( + "eval_kit.result.score", + Math.round(score * 100) / 100, + ); + + return { + perplexity: Math.round(perplexity * 100) / 100, + score: Math.round(score * 100) / 100, + tokenCount: totalTokens, + averageLogProb: + Math.round(averageLogProb * 10000) / 10000, + modelUsed: modelName, + feedback: generateFeedback(perplexity, score), + }; + }, ); - - if (inputIds.length <= 1) { - return { - perplexity: 1.0, - score: 100, - tokenCount: inputIds.length, - averageLogProb: 0, - modelUsed: modelName, - feedback: "Text too short to calculate perplexity meaningfully", - }; - } - - let totalLogProb = 0; - let totalTokens = 0; - - for (let i = 0; i < inputIds.length; i += stride) { - const end = Math.min(i + stride + 1, inputIds.length); - const batch = inputIds.slice(i, end); - - if (batch.length <= 1) continue; - - const batchTensor = { - input_ids: new Tensor( - "int64", - new BigInt64Array(batch.map((x) => BigInt(x))), - [1, batch.length], - ), - }; - - const outputs = await model(batchTensor); - const logits = outputs.logits; - - if (!logits || !logits.data) continue; - - // Get vocab size from logits shape: [batch_size, seq_len, vocab_size] - const vocabSize = logits.dims?.[2] || 50257; - const logitsArray = Array.from(logits.data as Float32Array); - - for (let j = 1; j < batch.length; j++) { - const startIdx = (j - 1) * vocabSize; - const endIdx = startIdx + vocabSize; - const prevLogits = logitsArray.slice(startIdx, endIdx); - - const probs = softmax(prevLogits); - const targetTokenId = batch[j]; - const prob = probs[targetTokenId] || 1e-10; - - totalLogProb += Math.log(prob); - totalTokens++; - } - - if (end >= inputIds.length) break; - } - - const averageLogProb = totalTokens > 0 ? totalLogProb / totalTokens : 0; - const perplexity = Math.exp(-averageLogProb); - const score = normalizePerplexityToScore(perplexity); - - return { - perplexity: Math.round(perplexity * 100) / 100, - score: Math.round(score * 100) / 100, - tokenCount: totalTokens, - averageLogProb: Math.round(averageLogProb * 10000) / 10000, - modelUsed: modelName, - feedback: generateFeedback(perplexity, score), - }; }; export const clearPerplexityCache = (): void => { diff --git a/src/telemetry-noop.spec.ts b/src/telemetry-noop.spec.ts new file mode 100644 index 0000000..46e59e9 --- /dev/null +++ b/src/telemetry-noop.spec.ts @@ -0,0 +1,123 @@ +/** + * Tests for the noop fallback path — the default experience when + * @opentelemetry/api is not installed. + */ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; + +// Mock the dynamic import to simulate @opentelemetry/api not being installed. +// This MUST be before importing telemetry. +jest.unstable_mockModule("@opentelemetry/api", () => { + throw new Error("Cannot find module '@opentelemetry/api'"); +}); + +const { _resetTracer, withSpan, getTracer, getCachedTracer } = await import( + "./telemetry.js" +); + +describe("telemetry noop fallback", () => { + beforeEach(() => { + _resetTracer(); + }); + + describe("withSpan", () => { + it("should execute callback and return result when OTel is absent", async () => { + const result = await withSpan( + "test.operation", + { attributes: { "test.key": "value" } }, + async () => 42, + ); + + expect(result).toBe(42); + }); + + it("should still propagate errors from callback", async () => { + await expect( + withSpan("test.failing", {}, async () => { + throw new Error("callback error"); + }), + ).rejects.toThrow("callback error"); + }); + + it("should provide a noop span that accepts all operations without error", async () => { + await withSpan("test.noop", {}, async (span) => { + // All of these should be silent no-ops + span.setAttribute("key", "value"); + span.setAttribute("num", 123); + span.setAttribute("bool", true); + span.setStatus({ code: 1 }); + span.setStatus({ code: 2, message: "error" }); + span.addEvent("event", { "detail": "test" }); + span.recordException(new Error("test")); + span.recordException("string error"); + span.end(); + }); + }); + + it("should handle nested withSpan calls with noop tracer", async () => { + const result = await withSpan( + "parent", + {}, + async () => { + return withSpan( + "child", + {}, + async () => "nested-result", + ); + }, + ); + + expect(result).toBe("nested-result"); + }); + }); + + describe("getTracer", () => { + it("should return a tracer that produces noop spans", async () => { + const tracer = await getTracer(); + expect(tracer).toBeDefined(); + + // The noop tracer's startActiveSpan should execute the callback + const result = tracer.startActiveSpan( + "test", + {}, + (span) => { + span.setAttribute("key", "value"); + span.end(); + return "result"; + }, + ); + expect(result).toBe("result"); + }); + }); + + describe("getCachedTracer", () => { + it("should return noop tracer before any async resolution", () => { + _resetTracer(); + const tracer = getCachedTracer(); + expect(tracer).toBeDefined(); + + const result = tracer.startActiveSpan( + "test", + {}, + (span) => { + span.setAttribute("key", "value"); + span.end(); + return "sync-result"; + }, + ); + expect(result).toBe("sync-result"); + }); + + it("should still return noop tracer after failed resolution", async () => { + // Trigger resolution (which will fail due to our mock) + await getTracer(); + + const tracer = getCachedTracer(); + const result = tracer.startActiveSpan( + "test", + {}, + () => "post-resolution", + ); + expect(result).toBe("post-resolution"); + }); + }); +}); diff --git a/src/telemetry.spec.ts b/src/telemetry.spec.ts new file mode 100644 index 0000000..d0dca31 --- /dev/null +++ b/src/telemetry.spec.ts @@ -0,0 +1,104 @@ +import { beforeEach, describe, expect, it } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { SpanStatusCode, _resetTracer, withSpan } from "./telemetry.js"; + +// Set up a real OTel SDK so spans are captured +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +describe("telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + }); + + describe("withSpan", () => { + it("should create a span with correct name and attributes", async () => { + const result = await withSpan( + "test.operation", + { + attributes: { + "test.key": "value", + "test.number": 42, + }, + }, + async () => "hello", + ); + + expect(result).toBe("hello"); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].name).toBe("test.operation"); + expect(spans[0].attributes["test.key"]).toBe("value"); + expect(spans[0].attributes["test.number"]).toBe(42); + expect(spans[0].status.code).toBe(SpanStatusCode.OK); + }); + + it("should record exception and set error status on failure", async () => { + const testError = new Error("test failure"); + + await expect( + withSpan("test.failing", {}, async () => { + throw testError; + }), + ).rejects.toThrow("test failure"); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].status.code).toBe(SpanStatusCode.ERROR); + expect(spans[0].status.message).toBe("test failure"); + expect(spans[0].events).toHaveLength(1); + expect(spans[0].events[0].name).toBe("exception"); + }); + + it("should allow adding attributes during execution", async () => { + await withSpan("test.dynamic", {}, async (span) => { + span.setAttribute("dynamic.key", "added-later"); + }); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].attributes["dynamic.key"]).toBe("added-later"); + }); + + it("should allow adding events during execution", async () => { + await withSpan("test.events", {}, async (span) => { + span.addEvent("custom_event", { + "event.detail": "something happened", + }); + }); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].events).toHaveLength(1); + expect(spans[0].events[0].name).toBe("custom_event"); + }); + + it("should nest spans correctly via async context", async () => { + await withSpan("parent", {}, async () => { + await withSpan("child", {}, async () => { + // child span + }); + }); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(2); + + const child = spans.find((s) => s.name === "child"); + const parent = spans.find((s) => s.name === "parent"); + expect(child).toBeDefined(); + expect(parent).toBeDefined(); + // Child's parent span ID should match parent's span ID + expect(child!.parentSpanId).toBe( + parent!.spanContext().spanId, + ); + }); + }); +}); diff --git a/src/telemetry.ts b/src/telemetry.ts new file mode 100644 index 0000000..382c1c8 --- /dev/null +++ b/src/telemetry.ts @@ -0,0 +1,180 @@ +/** + * OpenTelemetry instrumentation for eval-kit. + * + * Uses dynamic import so @opentelemetry/api is an optional peer dependency. + * When the package is not installed or no SDK is configured, all tracing + * operations are no-ops with zero overhead. + */ + +declare const __EVAL_KIT_VERSION__: string; + +// ---------- Type stubs (mirror the OTel API surface we use) ---------- + +interface SpanAttributes { + [key: string]: string | number | boolean | undefined; +} + +/** + * Minimal span interface matching the subset of @opentelemetry/api Span we use. + */ +export interface EvalKitSpan { + setAttribute(key: string, value: string | number | boolean): void; + setStatus(status: { code: number; message?: string }): void; + recordException(error: Error | string): void; + addEvent(name: string, attributes?: SpanAttributes): void; + end(): void; +} + +interface Tracer { + startActiveSpan( + name: string, + options: { attributes?: SpanAttributes }, + fn: (span: EvalKitSpan) => T, + ): T; +} + +// ---------- No-op implementations ---------- + +const noopSpan: EvalKitSpan = { + setAttribute() {}, + setStatus() {}, + recordException() {}, + addEvent() {}, + end() {}, +}; + +const noopTracer: Tracer = { + startActiveSpan( + _name: string, + _options: { attributes?: SpanAttributes }, + fn: (span: EvalKitSpan) => T, + ): T { + return fn(noopSpan); + }, +}; + +// ---------- Lazy tracer resolution ---------- + +let resolvedTracer: Tracer | null = null; +let tracerPromise: Promise | null = null; + +function resolveTracer(): Promise { + if (!tracerPromise) { + tracerPromise = (async () => { + try { + const api = await import("@opentelemetry/api"); + const version = typeof __EVAL_KIT_VERSION__ !== "undefined" + ? __EVAL_KIT_VERSION__ + : "unknown"; + const tracer = api.trace.getTracer("eval-kit", version); + // Wrap the OTel tracer with an explicit adapter at the boundary + const wrapSpan = (span: import("@opentelemetry/api").Span): EvalKitSpan => ({ + setAttribute: (key, value) => span.setAttribute(key, value), + setStatus: (status) => span.setStatus(status), + recordException: (error) => span.recordException(error), + addEvent: (name, attributes) => span.addEvent(name, attributes), + end: () => span.end(), + }); + + resolvedTracer = { + startActiveSpan( + name: string, + options: { attributes?: SpanAttributes }, + fn: (span: EvalKitSpan) => T, + ): T { + return tracer.startActiveSpan( + name, + options, + (span) => fn(wrapSpan(span)), + ); + }, + }; + return resolvedTracer; + } catch { + // @opentelemetry/api not installed — use no-op + resolvedTracer = noopTracer; + return noopTracer; + } + })(); + } + return tracerPromise; +} + +// ---------- Status codes (match OTel SpanStatusCode) ---------- + +export const SpanStatusCode = { + UNSET: 0, + OK: 1, + ERROR: 2, +} as const; + +// ---------- Public API ---------- + +/** + * Get the eval-kit tracer (async). Triggers resolution on first call. + */ +export async function getTracer(): Promise { + return resolveTracer(); +} + +/** + * Get the cached tracer synchronously. Returns no-op if the tracer + * hasn't been resolved yet (i.e., no prior withSpan/getTracer call). + * Use this in hot paths where the tracer is guaranteed to be resolved + * by a parent span. + */ +export function getCachedTracer(): Tracer { + return resolvedTracer ?? noopTracer; +} + +export interface WithSpanOptions { + attributes?: SpanAttributes; +} + +/** + * Wrap an async operation with an OpenTelemetry span. + * + * - Creates a child span under the current active span (if any) + * - Sets initial attributes from options + * - On success: sets status OK, ends span + * - On error: records exception, sets status ERROR, ends span, re-throws + * - The callback receives the span so it can add attributes/events during execution + */ +export async function withSpan( + name: string, + options: WithSpanOptions, + fn: (span: EvalKitSpan) => Promise, +): Promise { + const tracer = await resolveTracer(); + return tracer.startActiveSpan( + name, + { attributes: options.attributes }, + async (span) => { + try { + const result = await fn(span); + span.setStatus({ code: SpanStatusCode.OK }); + return result; + } catch (error) { + span.setStatus({ + code: SpanStatusCode.ERROR, + message: + error instanceof Error ? error.message : String(error), + }); + span.recordException( + error instanceof Error ? error : String(error), + ); + throw error; + } finally { + span.end(); + } + }, + ); +} + +/** + * Reset the cached tracer. Only used in tests. + */ +export function _resetTracer(): void { + resolvedTracer = null; + tracerPromise = null; +} diff --git a/vite.config.ts b/vite.config.ts index 6b5fccf..a89e906 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -1,7 +1,10 @@ +import { readFileSync } from "node:fs"; import { resolve } from "node:path"; import { defineConfig } from "vite"; import dts from "vite-plugin-dts"; +const pkg = JSON.parse(readFileSync(resolve(__dirname, "package.json"), "utf-8")); + // Externalize function - exclude all dependencies from the bundle const external = (id: string) => { // Externalize node built-ins @@ -16,6 +19,9 @@ const external = (id: string) => { }; export default defineConfig({ + define: { + __EVAL_KIT_VERSION__: JSON.stringify(pkg.version), + }, plugins: [ dts({ include: ["src"], From 741d7beda473222ba91c71646bab31279f0122d5 Mon Sep 17 00:00:00 2001 From: yltw27 Date: Tue, 3 Mar 2026 16:11:04 +0800 Subject: [PATCH 2/6] lint fix --- src/batch/batch-evaluator-telemetry.spec.ts | 73 ++++++++++----------- src/batch/batch-evaluator.ts | 60 +++++++++++------ src/batch/exporters/csv-exporter.ts | 2 +- src/batch/exporters/json-exporter.ts | 2 +- src/batch/progress-tracker.ts | 1 - src/evaluators/evaluator-telemetry.spec.ts | 12 ++-- src/index.ts | 8 ++- src/metrics/bert-score.ts | 11 ++-- src/metrics/perplexity.ts | 32 ++++----- src/telemetry-noop.spec.ts | 44 ++++--------- src/telemetry.spec.ts | 6 +- src/telemetry.ts | 24 ++++--- src/utils/template-engine.ts | 12 ++-- src/utils/tfidf.spec.ts | 2 +- 14 files changed, 137 insertions(+), 152 deletions(-) diff --git a/src/batch/batch-evaluator-telemetry.spec.ts b/src/batch/batch-evaluator-telemetry.spec.ts index e558b97..bbcbb28 100644 --- a/src/batch/batch-evaluator-telemetry.spec.ts +++ b/src/batch/batch-evaluator-telemetry.spec.ts @@ -4,7 +4,7 @@ import { SimpleSpanProcessor, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { SpanStatusCode, _resetTracer } from "../telemetry.js"; +import { _resetTracer, SpanStatusCode } from "../telemetry.js"; import type { EvaluationInput, EvaluatorResult, @@ -69,18 +69,14 @@ describe("BatchEvaluator telemetry", () => { ]); // Verify parent-child relationships - const batchSpan = spans.find( - (s) => s.name === "eval-kit.batch.evaluate", - )!; - const rowSpan = spans.find( - (s) => s.name === "eval-kit.batch.process_row", - )!; + const batchSpan = spans.find((s) => s.name === "eval-kit.batch.evaluate"); + const rowSpan = spans.find((s) => s.name === "eval-kit.batch.process_row"); const evalSpan = spans.find( (s) => s.name === "eval-kit.batch.run_evaluators", - )!; + ); - expect(rowSpan.parentSpanId).toBe(batchSpan.spanContext().spanId); - expect(evalSpan.parentSpanId).toBe(rowSpan.spanContext().spanId); + expect(rowSpan?.parentSpanId).toBe(batchSpan?.spanContext().spanId); + expect(evalSpan?.parentSpanId).toBe(rowSpan?.spanContext().spanId); }); it("should set correct attributes on batch span", async () => { @@ -90,21 +86,18 @@ describe("BatchEvaluator telemetry", () => { }); await batch.evaluate({ - data: [ - { candidateText: "Row 1" }, - { candidateText: "Row 2" }, - ], + data: [{ candidateText: "Row 1" }, { candidateText: "Row 2" }], }); const batchSpan = exporter .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.evaluate")!; + .find((s) => s.name === "eval-kit.batch.evaluate"); - expect(batchSpan.attributes["eval_kit.batch.concurrency"]).toBe(3); - expect(batchSpan.attributes["eval_kit.batch.total_rows"]).toBe(2); - expect(batchSpan.attributes["eval_kit.batch.successful_rows"]).toBe(2); - expect(batchSpan.attributes["eval_kit.batch.failed_rows"]).toBe(0); - expect(batchSpan.status.code).toBe(SpanStatusCode.OK); + expect(batchSpan?.attributes["eval_kit.batch.concurrency"]).toBe(3); + expect(batchSpan?.attributes["eval_kit.batch.total_rows"]).toBe(2); + expect(batchSpan?.attributes["eval_kit.batch.successful_rows"]).toBe(2); + expect(batchSpan?.attributes["eval_kit.batch.failed_rows"]).toBe(0); + expect(batchSpan?.status.code).toBe(SpanStatusCode.OK); }); it("should set correct attributes on process_row span", async () => { @@ -119,13 +112,15 @@ describe("BatchEvaluator telemetry", () => { const rowSpan = exporter .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.process_row")!; - - expect(rowSpan.attributes["eval_kit.row.id"]).toBe("custom-id"); - expect(rowSpan.attributes["eval_kit.row.index"]).toBe(0); - expect(rowSpan.attributes["eval_kit.row.retry_count"]).toBe(0); - expect(rowSpan.attributes["eval_kit.row.duration_ms"]).toBeGreaterThanOrEqual(0); - expect(rowSpan.status.code).toBe(SpanStatusCode.OK); + .find((s) => s.name === "eval-kit.batch.process_row"); + + expect(rowSpan?.attributes["eval_kit.row.id"]).toBe("custom-id"); + expect(rowSpan?.attributes["eval_kit.row.index"]).toBe(0); + expect(rowSpan?.attributes["eval_kit.row.retry_count"]).toBe(0); + expect( + rowSpan?.attributes["eval_kit.row.duration_ms"], + ).toBeGreaterThanOrEqual(0); + expect(rowSpan?.status.code).toBe(SpanStatusCode.OK); }); it("should record retry events on process_row span", async () => { @@ -162,22 +157,20 @@ describe("BatchEvaluator telemetry", () => { const rowSpan = exporter .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.process_row")!; + .find((s) => s.name === "eval-kit.batch.process_row"); // Should have 2 retry events - const retryEvents = rowSpan.events.filter( - (e) => e.name === "retry", - ); + const retryEvents = rowSpan?.events.filter((e) => e.name === "retry"); expect(retryEvents).toHaveLength(2); - expect(retryEvents[0].attributes!["eval_kit.retry.attempt"]).toBe(1); - expect(retryEvents[0].attributes!["eval_kit.retry.error"]).toBe( + expect(retryEvents?.[0].attributes?.["eval_kit.retry.attempt"]).toBe(1); + expect(retryEvents?.[0].attributes?.["eval_kit.retry.error"]).toBe( "ECONNRESET", ); - expect(retryEvents[1].attributes!["eval_kit.retry.attempt"]).toBe(2); + expect(retryEvents?.[1].attributes?.["eval_kit.retry.attempt"]).toBe(2); // Final span should be OK (recovered) - expect(rowSpan.attributes["eval_kit.row.retry_count"]).toBe(2); - expect(rowSpan.status.code).toBe(SpanStatusCode.OK); + expect(rowSpan?.attributes["eval_kit.row.retry_count"]).toBe(2); + expect(rowSpan?.status.code).toBe(SpanStatusCode.OK); }); it("should set error status on process_row span when all retries exhausted", async () => { @@ -203,11 +196,11 @@ describe("BatchEvaluator telemetry", () => { const rowSpan = exporter .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.process_row")!; + .find((s) => s.name === "eval-kit.batch.process_row"); - expect(rowSpan.status.code).toBe(SpanStatusCode.ERROR); - expect(rowSpan.attributes["eval_kit.result.error"]).toBe("ECONNRESET"); - expect(rowSpan.attributes["eval_kit.row.retry_count"]).toBe(1); + expect(rowSpan?.status.code).toBe(SpanStatusCode.ERROR); + expect(rowSpan?.attributes["eval_kit.result.error"]).toBe("ECONNRESET"); + expect(rowSpan?.attributes["eval_kit.row.retry_count"]).toBe(1); }); it("should skip parse_input span for in-memory data", async () => { diff --git a/src/batch/batch-evaluator.ts b/src/batch/batch-evaluator.ts index a22c9ac..d71b210 100644 --- a/src/batch/batch-evaluator.ts +++ b/src/batch/batch-evaluator.ts @@ -1,8 +1,8 @@ import { randomUUID } from "node:crypto"; import { type EvalKitSpan, - SpanStatusCode, getCachedTracer, + SpanStatusCode, withSpan, } from "../telemetry.js"; import type { EvaluatorResult, IEvaluator } from "../types/evaluator.js"; @@ -63,15 +63,11 @@ export class BatchEvaluator { // Parse input const allRows = await this.parseInput(inputConfig); - span.setAttribute( - "eval_kit.batch.total_rows", - allRows.length, - ); + span.setAttribute("eval_kit.batch.total_rows", allRows.length); // Handle startIndex for resuming from a specific position const startIndex = inputConfig.startIndex ?? 0; - const rows = - startIndex > 0 ? allRows.slice(startIndex) : allRows; + const rows = startIndex > 0 ? allRows.slice(startIndex) : allRows; // Mark rows before startIndex as already processed (for accurate progress tracking) for (let i = 0; i < startIndex; i++) { @@ -114,10 +110,7 @@ export class BatchEvaluator { "eval_kit.batch.successful_rows", result.successfulRows, ); - span.setAttribute( - "eval_kit.batch.failed_rows", - result.failedRows, - ); + span.setAttribute("eval_kit.batch.failed_rows", result.failedRows); return result; }, ); @@ -231,7 +224,9 @@ export class BatchEvaluator { await tracer.startActiveSpan( "eval-kit.batch.process_row", - { attributes: { "eval_kit.row.id": rowId, "eval_kit.row.index": index } }, + { + attributes: { "eval_kit.row.id": rowId, "eval_kit.row.index": index }, + }, (span: EvalKitSpan) => this.runRowWithSpan(row, index, span), ); }); @@ -248,7 +243,14 @@ export class BatchEvaluator { const rowId = row.id ?? `row-${index}`; try { - await this.executeRowWithRetry({ inputData, row, index, rowId, span, startTime }); + await this.executeRowWithRetry({ + inputData, + row, + index, + rowId, + span, + startTime, + }); } catch (error) { spanError = error instanceof Error ? error.message : String(error); span.recordException(error instanceof Error ? error : spanError); @@ -283,7 +285,16 @@ export class BatchEvaluator { return; } catch (error) { lastError = error; - const errorCtx = { error, span: ctx.span, startTime: ctx.startTime, retryCount, maxRetries, rowId: ctx.rowId, index: ctx.index, row: ctx.row }; + const errorCtx = { + error, + span: ctx.span, + startTime: ctx.startTime, + retryCount, + maxRetries, + rowId: ctx.rowId, + index: ctx.index, + row: ctx.row, + }; const shouldRetry = await this.handleRowError(errorCtx); if (!shouldRetry) { ctx.span.setAttribute("eval_kit.row.retry_count", retryCount); @@ -304,8 +315,13 @@ export class BatchEvaluator { index: number; row: BatchInputRow; }): Promise { - const errorMessage = ctx.error instanceof Error ? ctx.error.message : String(ctx.error); - const shouldRetry = this.shouldRetry(errorMessage, ctx.retryCount, ctx.maxRetries); + const errorMessage = + ctx.error instanceof Error ? ctx.error.message : String(ctx.error); + const shouldRetry = this.shouldRetry( + errorMessage, + ctx.retryCount, + ctx.maxRetries, + ); if (shouldRetry) { const nextAttempt = ctx.retryCount + 1; @@ -338,7 +354,9 @@ export class BatchEvaluator { this.progressTracker?.recordFailure(durationMs); if (this.config.stopOnError) { - throw new Error(`Stopping batch evaluation due to error: ${errorMessage}`); + throw new Error( + `Stopping batch evaluation due to error: ${errorMessage}`, + ); } return false; @@ -346,7 +364,9 @@ export class BatchEvaluator { private calculateRetryDelay(attempt: number): number { const baseDelay = this.config.retryConfig?.retryDelay ?? 1000; - return this.config.retryConfig?.exponentialBackoff ? baseDelay * 2 ** (attempt - 1) : baseDelay; + return this.config.retryConfig?.exponentialBackoff + ? baseDelay * 2 ** (attempt - 1) + : baseDelay; } private async executeRowEvaluation(ctx: { @@ -434,9 +454,7 @@ export class BatchEvaluator { } // Parallel mode (default) const results = await Promise.all( - this.evaluators.map((evaluator) => - evaluateWithTimeout(evaluator), - ), + this.evaluators.map((evaluator) => evaluateWithTimeout(evaluator)), ); return results as EvaluatorResult[]; }, diff --git a/src/batch/exporters/csv-exporter.ts b/src/batch/exporters/csv-exporter.ts index b7fe32b..4c3c43f 100644 --- a/src/batch/exporters/csv-exporter.ts +++ b/src/batch/exporters/csv-exporter.ts @@ -179,7 +179,7 @@ export class CsvExporter { } // Exclude logic - if (excludeFields && excludeFields.includes(key)) { + if (excludeFields?.includes(key)) { continue; } diff --git a/src/batch/exporters/json-exporter.ts b/src/batch/exporters/json-exporter.ts index 0952b30..b54ff17 100644 --- a/src/batch/exporters/json-exporter.ts +++ b/src/batch/exporters/json-exporter.ts @@ -72,7 +72,7 @@ export class JsonExporter { } // Exclude logic - if (excludeFields && excludeFields.includes(key)) { + if (excludeFields?.includes(key)) { continue; } diff --git a/src/batch/progress-tracker.ts b/src/batch/progress-tracker.ts index 7bb5cf1..c01e563 100644 --- a/src/batch/progress-tracker.ts +++ b/src/batch/progress-tracker.ts @@ -14,7 +14,6 @@ export class ProgressTracker { private processedRows = 0; private successfulRows = 0; private failedRows = 0; - private startTime: number = 0; private processingTimes: number[] = []; private lastEmitTime = 0; private totalTokens = 0; diff --git a/src/evaluators/evaluator-telemetry.spec.ts b/src/evaluators/evaluator-telemetry.spec.ts index e045c22..c66a9ca 100644 --- a/src/evaluators/evaluator-telemetry.spec.ts +++ b/src/evaluators/evaluator-telemetry.spec.ts @@ -1,11 +1,11 @@ import { beforeEach, describe, expect, it, jest } from "@jest/globals"; -import type { LanguageModel } from "ai"; import { InMemorySpanExporter, SimpleSpanProcessor, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { SpanStatusCode, _resetTracer } from "../telemetry.js"; +import type { LanguageModel } from "ai"; +import { _resetTracer, SpanStatusCode } from "../telemetry.js"; // Set up OTel SDK for span capture const exporter = new InMemorySpanExporter(); @@ -70,7 +70,9 @@ describe("Evaluator telemetry", () => { expect(span.attributes["eval_kit.result.token_usage.input"]).toBe(100); expect(span.attributes["eval_kit.result.token_usage.output"]).toBe(20); expect(span.attributes["eval_kit.result.token_usage.total"]).toBe(120); - expect(span.attributes["eval_kit.result.execution_time_ms"]).toBeGreaterThanOrEqual(0); + expect( + span.attributes["eval_kit.result.execution_time_ms"], + ).toBeGreaterThanOrEqual(0); expect(span.status.code).toBe(SpanStatusCode.OK); }); @@ -98,7 +100,9 @@ describe("Evaluator telemetry", () => { const span = spans[0]; expect(span.status.code).toBe(SpanStatusCode.ERROR); expect(span.attributes["eval_kit.result.error"]).toBe("API rate limited"); - expect(span.attributes["eval_kit.result.execution_time_ms"]).toBeGreaterThanOrEqual(0); + expect( + span.attributes["eval_kit.result.execution_time_ms"], + ).toBeGreaterThanOrEqual(0); // Exception event recorded expect(span.events.length).toBeGreaterThanOrEqual(1); diff --git a/src/index.ts b/src/index.ts index 261b34f..6d52ad6 100644 --- a/src/index.ts +++ b/src/index.ts @@ -62,6 +62,12 @@ export type { } from "./batch/types.js"; // Evaluator export { Evaluator } from "./evaluators/evaluator.js"; +// Telemetry +export { + type EvalKitSpan, + type WithSpanOptions, + withSpan, +} from "./telemetry.js"; export type { CategoricalScoreConfig, EvaluationInput, @@ -83,7 +89,5 @@ export { cosineSimilarity } from "./utils/similarity.js"; // Evaluator Utilities export { TemplateRenderer } from "./utils/template-engine.js"; export { calculateIDF, calculateTFIDF } from "./utils/tfidf.js"; -// Telemetry -export { type EvalKitSpan, type WithSpanOptions, withSpan } from "./telemetry.js"; // Utilities export { tokenizeSentences, tokenizeWords } from "./utils/tokenization.js"; diff --git a/src/metrics/bert-score.ts b/src/metrics/bert-score.ts index 9e769e0..8c4ac83 100644 --- a/src/metrics/bert-score.ts +++ b/src/metrics/bert-score.ts @@ -1,4 +1,4 @@ -import type { FeatureExtractionPipeline, Tensor } from "@xenova/transformers"; +import type { FeatureExtractionPipeline } from "@xenova/transformers"; import { cos_sim, pipeline } from "@xenova/transformers"; import { withSpan } from "../telemetry.js"; import { tokenizeWords } from "../utils/tokenization.js"; @@ -88,8 +88,7 @@ export const calculateBertScore = async ( }, }, async (span) => { - const wasCached = - cachedPipeline !== null && cachedModelName === model; + const wasCached = cachedPipeline !== null && cachedModelName === model; const extractor = await getPipeline(model); if (!wasCached) { span.addEvent("model_loaded", { @@ -120,10 +119,8 @@ export const calculateBertScore = async ( const precision = precisionSimilarities.length > 0 - ? precisionSimilarities.reduce( - (sum, sim) => sum + sim, - 0, - ) / precisionSimilarities.length + ? precisionSimilarities.reduce((sum, sim) => sum + sim, 0) / + precisionSimilarities.length : 0; const recall = diff --git a/src/metrics/perplexity.ts b/src/metrics/perplexity.ts index 1445d68..92c117b 100644 --- a/src/metrics/perplexity.ts +++ b/src/metrics/perplexity.ts @@ -69,7 +69,7 @@ const normalizePerplexityToScore = (perplexity: number): number => { return Math.max(0, 10 - (perplexity - 300) / 100); }; -const generateFeedback = (perplexity: number, score: number): string => { +const generateFeedback = (perplexity: number, _score: number): string => { if (perplexity < 20) { return `Excellent text quality with very natural, human-like language (perplexity: ${perplexity.toFixed(1)})`; } @@ -100,10 +100,8 @@ export const calculatePerplexity = async ( }, }, async (span) => { - const wasCached = - cachedModel !== null && cachedModelName === modelName; - const { tokenizer, model } = - await getModelAndTokenizer(modelName); + const wasCached = cachedModel !== null && cachedModelName === modelName; + const { tokenizer, model } = await getModelAndTokenizer(modelName); if (!wasCached) { span.addEvent("model_loaded", { "eval_kit.metric.model": modelName, @@ -116,9 +114,9 @@ export const calculatePerplexity = async ( add_special_tokens: true, }); - const inputIds = Array.from( - encoded.input_ids.data as BigInt64Array, - ).map((x) => Number(x)); + const inputIds = Array.from(encoded.input_ids.data as BigInt64Array).map( + (x) => Number(x), + ); if (inputIds.length <= 1) { span.setAttribute("eval_kit.metric.token_count", inputIds.length); @@ -129,8 +127,7 @@ export const calculatePerplexity = async ( tokenCount: inputIds.length, averageLogProb: 0, modelUsed: modelName, - feedback: - "Text too short to calculate perplexity meaningfully", + feedback: "Text too short to calculate perplexity meaningfully", }; } @@ -158,9 +155,7 @@ export const calculatePerplexity = async ( // Get vocab size from logits shape: [batch_size, seq_len, vocab_size] const vocabSize = logits.dims?.[2] || 50257; - const logitsArray = Array.from( - logits.data as Float32Array, - ); + const logitsArray = Array.from(logits.data as Float32Array); for (let j = 1; j < batch.length; j++) { const startIdx = (j - 1) * vocabSize; @@ -178,8 +173,7 @@ export const calculatePerplexity = async ( if (end >= inputIds.length) break; } - const averageLogProb = - totalTokens > 0 ? totalLogProb / totalTokens : 0; + const averageLogProb = totalTokens > 0 ? totalLogProb / totalTokens : 0; const perplexity = Math.exp(-averageLogProb); const score = normalizePerplexityToScore(perplexity); @@ -188,17 +182,13 @@ export const calculatePerplexity = async ( "eval_kit.result.perplexity", Math.round(perplexity * 100) / 100, ); - span.setAttribute( - "eval_kit.result.score", - Math.round(score * 100) / 100, - ); + span.setAttribute("eval_kit.result.score", Math.round(score * 100) / 100); return { perplexity: Math.round(perplexity * 100) / 100, score: Math.round(score * 100) / 100, tokenCount: totalTokens, - averageLogProb: - Math.round(averageLogProb * 10000) / 10000, + averageLogProb: Math.round(averageLogProb * 10000) / 10000, modelUsed: modelName, feedback: generateFeedback(perplexity, score), }; diff --git a/src/telemetry-noop.spec.ts b/src/telemetry-noop.spec.ts index 46e59e9..9c09b4a 100644 --- a/src/telemetry-noop.spec.ts +++ b/src/telemetry-noop.spec.ts @@ -46,7 +46,7 @@ describe("telemetry noop fallback", () => { span.setAttribute("bool", true); span.setStatus({ code: 1 }); span.setStatus({ code: 2, message: "error" }); - span.addEvent("event", { "detail": "test" }); + span.addEvent("event", { detail: "test" }); span.recordException(new Error("test")); span.recordException("string error"); span.end(); @@ -54,17 +54,9 @@ describe("telemetry noop fallback", () => { }); it("should handle nested withSpan calls with noop tracer", async () => { - const result = await withSpan( - "parent", - {}, - async () => { - return withSpan( - "child", - {}, - async () => "nested-result", - ); - }, - ); + const result = await withSpan("parent", {}, async () => { + return withSpan("child", {}, async () => "nested-result"); + }); expect(result).toBe("nested-result"); }); @@ -76,15 +68,11 @@ describe("telemetry noop fallback", () => { expect(tracer).toBeDefined(); // The noop tracer's startActiveSpan should execute the callback - const result = tracer.startActiveSpan( - "test", - {}, - (span) => { - span.setAttribute("key", "value"); - span.end(); - return "result"; - }, - ); + const result = tracer.startActiveSpan("test", {}, (span) => { + span.setAttribute("key", "value"); + span.end(); + return "result"; + }); expect(result).toBe("result"); }); }); @@ -95,15 +83,11 @@ describe("telemetry noop fallback", () => { const tracer = getCachedTracer(); expect(tracer).toBeDefined(); - const result = tracer.startActiveSpan( - "test", - {}, - (span) => { - span.setAttribute("key", "value"); - span.end(); - return "sync-result"; - }, - ); + const result = tracer.startActiveSpan("test", {}, (span) => { + span.setAttribute("key", "value"); + span.end(); + return "sync-result"; + }); expect(result).toBe("sync-result"); }); diff --git a/src/telemetry.spec.ts b/src/telemetry.spec.ts index d0dca31..dc10fa6 100644 --- a/src/telemetry.spec.ts +++ b/src/telemetry.spec.ts @@ -4,7 +4,7 @@ import { SimpleSpanProcessor, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { SpanStatusCode, _resetTracer, withSpan } from "./telemetry.js"; +import { _resetTracer, SpanStatusCode, withSpan } from "./telemetry.js"; // Set up a real OTel SDK so spans are captured const exporter = new InMemorySpanExporter(); @@ -96,9 +96,7 @@ describe("telemetry", () => { expect(child).toBeDefined(); expect(parent).toBeDefined(); // Child's parent span ID should match parent's span ID - expect(child!.parentSpanId).toBe( - parent!.spanContext().spanId, - ); + expect(child?.parentSpanId).toBe(parent?.spanContext().spanId); }); }); }); diff --git a/src/telemetry.ts b/src/telemetry.ts index 382c1c8..d44fb99 100644 --- a/src/telemetry.ts +++ b/src/telemetry.ts @@ -63,12 +63,15 @@ function resolveTracer(): Promise { tracerPromise = (async () => { try { const api = await import("@opentelemetry/api"); - const version = typeof __EVAL_KIT_VERSION__ !== "undefined" - ? __EVAL_KIT_VERSION__ - : "unknown"; + const version = + typeof __EVAL_KIT_VERSION__ !== "undefined" + ? __EVAL_KIT_VERSION__ + : "unknown"; const tracer = api.trace.getTracer("eval-kit", version); // Wrap the OTel tracer with an explicit adapter at the boundary - const wrapSpan = (span: import("@opentelemetry/api").Span): EvalKitSpan => ({ + const wrapSpan = ( + span: import("@opentelemetry/api").Span, + ): EvalKitSpan => ({ setAttribute: (key, value) => span.setAttribute(key, value), setStatus: (status) => span.setStatus(status), recordException: (error) => span.recordException(error), @@ -82,10 +85,8 @@ function resolveTracer(): Promise { options: { attributes?: SpanAttributes }, fn: (span: EvalKitSpan) => T, ): T { - return tracer.startActiveSpan( - name, - options, - (span) => fn(wrapSpan(span)), + return tracer.startActiveSpan(name, options, (span) => + fn(wrapSpan(span)), ); }, }; @@ -157,12 +158,9 @@ export async function withSpan( } catch (error) { span.setStatus({ code: SpanStatusCode.ERROR, - message: - error instanceof Error ? error.message : String(error), + message: error instanceof Error ? error.message : String(error), }); - span.recordException( - error instanceof Error ? error : String(error), - ); + span.recordException(error instanceof Error ? error : String(error)); throw error; } finally { span.end(); diff --git a/src/utils/template-engine.ts b/src/utils/template-engine.ts index 3b7a603..d7b7416 100644 --- a/src/utils/template-engine.ts +++ b/src/utils/template-engine.ts @@ -94,12 +94,12 @@ export class TemplateRenderer { const conditionals = template.matchAll(/\{\{#if\s+(\w+)\}\}/g); for (const match of conditionals) { - variables.add(match[1]!); + variables.add(match[1]); } const substitutions = template.matchAll(/\{\{(\w+)\}\}/g); for (const match of substitutions) { - variables.add(match[1]!); + variables.add(match[1]); } return Array.from(variables); @@ -113,11 +113,11 @@ export class TemplateRenderer { /\{\{#if\s+(\w+)\}\}([\s\S]*?)\{\{\/if\}\}/g, ); for (const match of conditionals) { - optional.add(match[1]!); + optional.add(match[1]); - const innerVars = match[2]!.matchAll(/\{\{(\w+)\}\}/g); + const innerVars = match[2]?.matchAll(/\{\{(\w+)\}\}/g); for (const innerMatch of innerVars) { - optional.add(innerMatch[1]!); + optional.add(innerMatch[1]); } } @@ -125,7 +125,7 @@ export class TemplateRenderer { for (const part of parts) { const substitutions = part.matchAll(/\{\{(\w+)\}\}/g); for (const match of substitutions) { - const varName = match[1]!; + const varName = match[1]; if (!optional.has(varName)) { required.add(varName); } diff --git a/src/utils/tfidf.spec.ts b/src/utils/tfidf.spec.ts index 0f4db9c..5ec32b5 100644 --- a/src/utils/tfidf.spec.ts +++ b/src/utils/tfidf.spec.ts @@ -50,7 +50,7 @@ describe("TF-IDF Utils", () => { const idf = calculateIDF(sentenceTokens); // "rare" appears in 1/3 sentences: log((3+1)/(1+1)) = log(2) = 0.693... - expect(idf.get("rare")).toBeCloseTo(0.693, 2); + expect(idf.get("rare")).toBeCloseTo(Math.LN2, 2); }); it("should handle empty sentence tokens", () => { From 49590530faaa468a54e55d923d1e7c1169658715 Mon Sep 17 00:00:00 2001 From: yltw27 Date: Tue, 3 Mar 2026 16:33:24 +0800 Subject: [PATCH 3/6] refactor: improve code health in batch-evaluator telemetry tests Extract shared helpers (runBatch, getSpan) to reduce duplication and eliminate optional chaining in assertions, bringing score to 10/10. Co-Authored-By: Claude Opus 4.6 --- src/batch/batch-evaluator-telemetry.spec.ts | 244 ++++++++------------ 1 file changed, 99 insertions(+), 145 deletions(-) diff --git a/src/batch/batch-evaluator-telemetry.spec.ts b/src/batch/batch-evaluator-telemetry.spec.ts index bbcbb28..5abf06e 100644 --- a/src/batch/batch-evaluator-telemetry.spec.ts +++ b/src/batch/batch-evaluator-telemetry.spec.ts @@ -1,4 +1,5 @@ import { beforeEach, describe, expect, it } from "@jest/globals"; +import type { ReadableSpan } from "@opentelemetry/sdk-trace-base"; import { InMemorySpanExporter, SimpleSpanProcessor, @@ -18,29 +19,46 @@ const provider = new NodeTracerProvider(); provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); provider.register(); -// Helper to create a mock evaluator -const createMockEvaluator = ( - name: string, - score: number = 85, - shouldFail: boolean = false, -): IEvaluator => ({ +const createMockEvaluator = (name: string): IEvaluator => ({ name, - evaluate: async (_input: EvaluationInput): Promise => { - if (shouldFail) { - throw new Error("Evaluator failed: ECONNRESET"); - } - return { - evaluatorName: name, - model: "mock-model", - score, - feedback: "Good", - processingStats: { - executionTime: 10, - tokenUsage: { inputTokens: 50, outputTokens: 10, totalTokens: 60 }, - }, + evaluate: async (_input: EvaluationInput): Promise => ({ + evaluatorName: name, + model: "mock-model", + score: 85, + feedback: "Good", + processingStats: { + executionTime: 10, + tokenUsage: { inputTokens: 50, outputTokens: 10, totalTokens: 60 }, + }, + }), +}); + +async function runBatch( + evaluators: IEvaluator[], + data: { candidateText: string; id?: string }[], + config?: { + concurrency?: number; + retryConfig?: { + maxRetries: number; + retryDelay: number; + exponentialBackoff?: boolean; }; }, -}); +) { + const batch = new BatchEvaluator({ + evaluators, + concurrency: config?.concurrency ?? 1, + retryConfig: config?.retryConfig, + }); + await batch.evaluate({ data }); + return exporter.getFinishedSpans(); +} + +function getSpan(spans: ReadableSpan[], name: string): ReadableSpan { + const span = spans.find((s) => s.name === name); + expect(span).toBeDefined(); + return span as ReadableSpan; +} describe("BatchEvaluator telemetry", () => { beforeEach(() => { @@ -48,79 +66,53 @@ describe("BatchEvaluator telemetry", () => { _resetTracer(); }); - it("should create span hierarchy: batch → parse_input, process_row → run_evaluators", async () => { - const batch = new BatchEvaluator({ - evaluators: [createMockEvaluator("fluency")], - concurrency: 1, - }); - - await batch.evaluate({ - data: [{ candidateText: "Hello world" }], - }); - - const spans = exporter.getFinishedSpans(); - const spanNames = spans.map((s) => s.name).sort(); + it("should create span hierarchy: batch → process_row → run_evaluators", async () => { + const spans = await runBatch( + [createMockEvaluator("fluency")], + [{ candidateText: "Hello world" }], + ); - // No parse_input span for in-memory data (no I/O to trace) - expect(spanNames).toEqual([ + expect(spans.map((s) => s.name).sort()).toEqual([ "eval-kit.batch.evaluate", "eval-kit.batch.process_row", "eval-kit.batch.run_evaluators", ]); - // Verify parent-child relationships - const batchSpan = spans.find((s) => s.name === "eval-kit.batch.evaluate"); - const rowSpan = spans.find((s) => s.name === "eval-kit.batch.process_row"); - const evalSpan = spans.find( - (s) => s.name === "eval-kit.batch.run_evaluators", - ); + const batchSpan = getSpan(spans, "eval-kit.batch.evaluate"); + const rowSpan = getSpan(spans, "eval-kit.batch.process_row"); + const evalSpan = getSpan(spans, "eval-kit.batch.run_evaluators"); - expect(rowSpan?.parentSpanId).toBe(batchSpan?.spanContext().spanId); - expect(evalSpan?.parentSpanId).toBe(rowSpan?.spanContext().spanId); + expect(rowSpan.parentSpanId).toBe(batchSpan.spanContext().spanId); + expect(evalSpan.parentSpanId).toBe(rowSpan.spanContext().spanId); }); it("should set correct attributes on batch span", async () => { - const batch = new BatchEvaluator({ - evaluators: [createMockEvaluator("fluency")], - concurrency: 3, - }); - - await batch.evaluate({ - data: [{ candidateText: "Row 1" }, { candidateText: "Row 2" }], - }); - - const batchSpan = exporter - .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.evaluate"); - - expect(batchSpan?.attributes["eval_kit.batch.concurrency"]).toBe(3); - expect(batchSpan?.attributes["eval_kit.batch.total_rows"]).toBe(2); - expect(batchSpan?.attributes["eval_kit.batch.successful_rows"]).toBe(2); - expect(batchSpan?.attributes["eval_kit.batch.failed_rows"]).toBe(0); - expect(batchSpan?.status.code).toBe(SpanStatusCode.OK); + const spans = await runBatch( + [createMockEvaluator("fluency")], + [{ candidateText: "Row 1" }, { candidateText: "Row 2" }], + { concurrency: 3 }, + ); + + const span = getSpan(spans, "eval-kit.batch.evaluate"); + expect(span.attributes["eval_kit.batch.concurrency"]).toBe(3); + expect(span.attributes["eval_kit.batch.total_rows"]).toBe(2); + expect(span.attributes["eval_kit.batch.successful_rows"]).toBe(2); + expect(span.attributes["eval_kit.batch.failed_rows"]).toBe(0); + expect(span.status.code).toBe(SpanStatusCode.OK); }); it("should set correct attributes on process_row span", async () => { - const batch = new BatchEvaluator({ - evaluators: [createMockEvaluator("accuracy")], - concurrency: 1, - }); - - await batch.evaluate({ - data: [{ candidateText: "Test row", id: "custom-id" }], - }); - - const rowSpan = exporter - .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.process_row"); - - expect(rowSpan?.attributes["eval_kit.row.id"]).toBe("custom-id"); - expect(rowSpan?.attributes["eval_kit.row.index"]).toBe(0); - expect(rowSpan?.attributes["eval_kit.row.retry_count"]).toBe(0); - expect( - rowSpan?.attributes["eval_kit.row.duration_ms"], - ).toBeGreaterThanOrEqual(0); - expect(rowSpan?.status.code).toBe(SpanStatusCode.OK); + const spans = await runBatch( + [createMockEvaluator("accuracy")], + [{ candidateText: "Test row", id: "custom-id" }], + ); + + const span = getSpan(spans, "eval-kit.batch.process_row"); + expect(span.attributes["eval_kit.row.id"]).toBe("custom-id"); + expect(span.attributes["eval_kit.row.index"]).toBe(0); + expect(span.attributes["eval_kit.row.retry_count"]).toBe(0); + expect(span.attributes["eval_kit.row.duration_ms"]).toBeGreaterThanOrEqual(0); + expect(span.status.code).toBe(SpanStatusCode.OK); }); it("should record retry events on process_row span", async () => { @@ -129,9 +121,7 @@ describe("BatchEvaluator telemetry", () => { name: "flaky", evaluate: async () => { callCount++; - if (callCount <= 2) { - throw new Error("ECONNRESET"); - } + if (callCount <= 2) throw new Error("ECONNRESET"); return { evaluatorName: "flaky", score: 80, @@ -141,36 +131,21 @@ describe("BatchEvaluator telemetry", () => { }, }; - const batch = new BatchEvaluator({ - evaluators: [flaky], - concurrency: 1, - retryConfig: { - maxRetries: 3, - retryDelay: 1, // 1ms for fast tests - exponentialBackoff: false, - }, - }); - - await batch.evaluate({ - data: [{ candidateText: "Test" }], - }); + const spans = await runBatch( + [flaky], + [{ candidateText: "Test" }], + { retryConfig: { maxRetries: 3, retryDelay: 1, exponentialBackoff: false } }, + ); - const rowSpan = exporter - .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.process_row"); + const span = getSpan(spans, "eval-kit.batch.process_row"); + const retryEvents = span.events.filter((e) => e.name === "retry"); - // Should have 2 retry events - const retryEvents = rowSpan?.events.filter((e) => e.name === "retry"); expect(retryEvents).toHaveLength(2); - expect(retryEvents?.[0].attributes?.["eval_kit.retry.attempt"]).toBe(1); - expect(retryEvents?.[0].attributes?.["eval_kit.retry.error"]).toBe( - "ECONNRESET", - ); - expect(retryEvents?.[1].attributes?.["eval_kit.retry.attempt"]).toBe(2); - - // Final span should be OK (recovered) - expect(rowSpan?.attributes["eval_kit.row.retry_count"]).toBe(2); - expect(rowSpan?.status.code).toBe(SpanStatusCode.OK); + expect(retryEvents[0].attributes?.["eval_kit.retry.attempt"]).toBe(1); + expect(retryEvents[0].attributes?.["eval_kit.retry.error"]).toBe("ECONNRESET"); + expect(retryEvents[1].attributes?.["eval_kit.retry.attempt"]).toBe(2); + expect(span.attributes["eval_kit.row.retry_count"]).toBe(2); + expect(span.status.code).toBe(SpanStatusCode.OK); }); it("should set error status on process_row span when all retries exhausted", async () => { @@ -181,46 +156,25 @@ describe("BatchEvaluator telemetry", () => { }, }; - const batch = new BatchEvaluator({ - evaluators: [failing], - concurrency: 1, - retryConfig: { - maxRetries: 1, - retryDelay: 1, - }, - }); - - await batch.evaluate({ - data: [{ candidateText: "Test" }], - }); - - const rowSpan = exporter - .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.process_row"); + const spans = await runBatch( + [failing], + [{ candidateText: "Test" }], + { retryConfig: { maxRetries: 1, retryDelay: 1 } }, + ); - expect(rowSpan?.status.code).toBe(SpanStatusCode.ERROR); - expect(rowSpan?.attributes["eval_kit.result.error"]).toBe("ECONNRESET"); - expect(rowSpan?.attributes["eval_kit.row.retry_count"]).toBe(1); + const span = getSpan(spans, "eval-kit.batch.process_row"); + expect(span.status.code).toBe(SpanStatusCode.ERROR); + expect(span.attributes["eval_kit.result.error"]).toBe("ECONNRESET"); + expect(span.attributes["eval_kit.row.retry_count"]).toBe(1); }); it("should skip parse_input span for in-memory data", async () => { - const batch = new BatchEvaluator({ - evaluators: [createMockEvaluator("test")], - concurrency: 1, - }); - - await batch.evaluate({ - data: [ - { candidateText: "Row 1" }, - { candidateText: "Row 2" }, - { candidateText: "Row 3" }, - ], - }); - - const parseSpan = exporter - .getFinishedSpans() - .find((s) => s.name === "eval-kit.batch.parse_input"); + const spans = await runBatch( + [createMockEvaluator("test")], + [{ candidateText: "Row 1" }, { candidateText: "Row 2" }, { candidateText: "Row 3" }], + ); + const parseSpan = spans.find((s) => s.name === "eval-kit.batch.parse_input"); expect(parseSpan).toBeUndefined(); }); }); From 83000967dcabd8dd9eba5272f791ccddb8fc7841 Mon Sep 17 00:00:00 2001 From: yltw27 Date: Tue, 3 Mar 2026 16:37:20 +0800 Subject: [PATCH 4/6] fix: resolve lint and formatting issues - Fix biome formatting in telemetry spec files - Remove unused private startTime field in ProgressTracker Co-Authored-By: Claude Opus 4.6 --- src/batch/batch-evaluator-telemetry.spec.ts | 34 ++++++++++++--------- src/batch/progress-tracker.ts | 1 - 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/batch/batch-evaluator-telemetry.spec.ts b/src/batch/batch-evaluator-telemetry.spec.ts index 5abf06e..1351713 100644 --- a/src/batch/batch-evaluator-telemetry.spec.ts +++ b/src/batch/batch-evaluator-telemetry.spec.ts @@ -111,7 +111,9 @@ describe("BatchEvaluator telemetry", () => { expect(span.attributes["eval_kit.row.id"]).toBe("custom-id"); expect(span.attributes["eval_kit.row.index"]).toBe(0); expect(span.attributes["eval_kit.row.retry_count"]).toBe(0); - expect(span.attributes["eval_kit.row.duration_ms"]).toBeGreaterThanOrEqual(0); + expect(span.attributes["eval_kit.row.duration_ms"]).toBeGreaterThanOrEqual( + 0, + ); expect(span.status.code).toBe(SpanStatusCode.OK); }); @@ -131,18 +133,18 @@ describe("BatchEvaluator telemetry", () => { }, }; - const spans = await runBatch( - [flaky], - [{ candidateText: "Test" }], - { retryConfig: { maxRetries: 3, retryDelay: 1, exponentialBackoff: false } }, - ); + const spans = await runBatch([flaky], [{ candidateText: "Test" }], { + retryConfig: { maxRetries: 3, retryDelay: 1, exponentialBackoff: false }, + }); const span = getSpan(spans, "eval-kit.batch.process_row"); const retryEvents = span.events.filter((e) => e.name === "retry"); expect(retryEvents).toHaveLength(2); expect(retryEvents[0].attributes?.["eval_kit.retry.attempt"]).toBe(1); - expect(retryEvents[0].attributes?.["eval_kit.retry.error"]).toBe("ECONNRESET"); + expect(retryEvents[0].attributes?.["eval_kit.retry.error"]).toBe( + "ECONNRESET", + ); expect(retryEvents[1].attributes?.["eval_kit.retry.attempt"]).toBe(2); expect(span.attributes["eval_kit.row.retry_count"]).toBe(2); expect(span.status.code).toBe(SpanStatusCode.OK); @@ -156,11 +158,9 @@ describe("BatchEvaluator telemetry", () => { }, }; - const spans = await runBatch( - [failing], - [{ candidateText: "Test" }], - { retryConfig: { maxRetries: 1, retryDelay: 1 } }, - ); + const spans = await runBatch([failing], [{ candidateText: "Test" }], { + retryConfig: { maxRetries: 1, retryDelay: 1 }, + }); const span = getSpan(spans, "eval-kit.batch.process_row"); expect(span.status.code).toBe(SpanStatusCode.ERROR); @@ -171,10 +171,16 @@ describe("BatchEvaluator telemetry", () => { it("should skip parse_input span for in-memory data", async () => { const spans = await runBatch( [createMockEvaluator("test")], - [{ candidateText: "Row 1" }, { candidateText: "Row 2" }, { candidateText: "Row 3" }], + [ + { candidateText: "Row 1" }, + { candidateText: "Row 2" }, + { candidateText: "Row 3" }, + ], ); - const parseSpan = spans.find((s) => s.name === "eval-kit.batch.parse_input"); + const parseSpan = spans.find( + (s) => s.name === "eval-kit.batch.parse_input", + ); expect(parseSpan).toBeUndefined(); }); }); diff --git a/src/batch/progress-tracker.ts b/src/batch/progress-tracker.ts index c01e563..f92e64d 100644 --- a/src/batch/progress-tracker.ts +++ b/src/batch/progress-tracker.ts @@ -28,7 +28,6 @@ export class ProgressTracker { * Start tracking progress */ start(): void { - this.startTime = Date.now(); this.emit("started"); } From 8aa6211265615a043ffd3091d9f004d4795529bc Mon Sep 17 00:00:00 2001 From: yltw27 Date: Tue, 3 Mar 2026 17:29:19 +0800 Subject: [PATCH 5/6] feat: add global telemetry toggle (disabled by default) Add enableTelemetry(enabled) and isTelemetryEnabled() to control eval-kit tracing globally. When disabled, all spans are suppressed including Vercel AI SDK's experimental_telemetry. This lets users run OTel for other services without eval-kit cluttering their traces. Co-Authored-By: Claude Opus 4.6 --- docs/TELEMETRY.md | 18 ++++++++- src/batch/batch-evaluator-telemetry.spec.ts | 3 +- src/evaluators/evaluator-telemetry.spec.ts | 3 +- src/evaluators/evaluator.ts | 8 +++- src/index.ts | 2 + src/metrics/bert-score-telemetry.spec.ts | 3 +- src/metrics/perplexity-telemetry.spec.ts | 3 +- src/telemetry.spec.ts | 43 ++++++++++++++++++++- src/telemetry.ts | 34 ++++++++++++++-- 9 files changed, 106 insertions(+), 11 deletions(-) diff --git a/docs/TELEMETRY.md b/docs/TELEMETRY.md index f6e0420..380259e 100644 --- a/docs/TELEMETRY.md +++ b/docs/TELEMETRY.md @@ -31,7 +31,23 @@ For production, replace `ConsoleSpanExporter` with your backend exporter (Jaeger No code changes needed. eval-kit detects the registered provider and emits spans automatically. -## Zero overhead when disabled +## Disabling Telemetry + +eval-kit's telemetry is **disabled by default**. To opt in: + +```typescript +import { enableTelemetry } from '@loveholidays/eval-kit'; + +// Enable eval-kit tracing +enableTelemetry(true); + +// Disable again if needed (OTel remains active for the rest of your app) +enableTelemetry(false); +``` + +When disabled, all tracing functions return no-ops with zero overhead — the same behaviour as when `@opentelemetry/api` is not installed. + +## Zero overhead when not installed When `@opentelemetry/api` is not installed, eval-kit uses no-op stubs internally. All tracing calls become empty function calls that are optimized away by the JS engine. There is no performance impact. diff --git a/src/batch/batch-evaluator-telemetry.spec.ts b/src/batch/batch-evaluator-telemetry.spec.ts index 1351713..f3eb1ae 100644 --- a/src/batch/batch-evaluator-telemetry.spec.ts +++ b/src/batch/batch-evaluator-telemetry.spec.ts @@ -5,7 +5,7 @@ import { SimpleSpanProcessor, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { _resetTracer, SpanStatusCode } from "../telemetry.js"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; import type { EvaluationInput, EvaluatorResult, @@ -64,6 +64,7 @@ describe("BatchEvaluator telemetry", () => { beforeEach(() => { exporter.reset(); _resetTracer(); + enableTelemetry(true); }); it("should create span hierarchy: batch → process_row → run_evaluators", async () => { diff --git a/src/evaluators/evaluator-telemetry.spec.ts b/src/evaluators/evaluator-telemetry.spec.ts index c66a9ca..11396f1 100644 --- a/src/evaluators/evaluator-telemetry.spec.ts +++ b/src/evaluators/evaluator-telemetry.spec.ts @@ -5,7 +5,7 @@ import { } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; import type { LanguageModel } from "ai"; -import { _resetTracer, SpanStatusCode } from "../telemetry.js"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; // Set up OTel SDK for span capture const exporter = new InMemorySpanExporter(); @@ -33,6 +33,7 @@ describe("Evaluator telemetry", () => { beforeEach(() => { exporter.reset(); _resetTracer(); + enableTelemetry(true); mockGenerateObject.mockClear(); }); diff --git a/src/evaluators/evaluator.ts b/src/evaluators/evaluator.ts index 2737c75..571c337 100644 --- a/src/evaluators/evaluator.ts +++ b/src/evaluators/evaluator.ts @@ -1,6 +1,11 @@ import { generateObject } from "ai"; import { z } from "zod"; -import { type EvalKitSpan, getTracer, SpanStatusCode } from "../telemetry.js"; +import { + type EvalKitSpan, + getTracer, + isTelemetryEnabled, + SpanStatusCode, +} from "../telemetry.js"; import type { EvaluationInput, EvaluatorConfig, @@ -140,6 +145,7 @@ export class Evaluator { presencePenalty: this.modelSettings?.presencePenalty, frequencyPenalty: this.modelSettings?.frequencyPenalty, seed: this.modelSettings?.seed, + experimental_telemetry: { isEnabled: isTelemetryEnabled() }, // eslint-disable-next-line @typescript-eslint/no-explicit-any } as any)) as unknown as { object: { score: number | string; feedback: string }; diff --git a/src/index.ts b/src/index.ts index 6d52ad6..d91f6a7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -65,6 +65,8 @@ export { Evaluator } from "./evaluators/evaluator.js"; // Telemetry export { type EvalKitSpan, + enableTelemetry, + isTelemetryEnabled, type WithSpanOptions, withSpan, } from "./telemetry.js"; diff --git a/src/metrics/bert-score-telemetry.spec.ts b/src/metrics/bert-score-telemetry.spec.ts index 7285b44..c6f7b6a 100644 --- a/src/metrics/bert-score-telemetry.spec.ts +++ b/src/metrics/bert-score-telemetry.spec.ts @@ -4,7 +4,7 @@ import { SimpleSpanProcessor, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { _resetTracer, SpanStatusCode } from "../telemetry.js"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; // Set up OTel SDK const exporter = new InMemorySpanExporter(); @@ -26,6 +26,7 @@ describe("bert-score telemetry", () => { beforeEach(() => { exporter.reset(); _resetTracer(); + enableTelemetry(true); clearBertCache(); mockPipeline.mockClear(); mockCosSim.mockClear(); diff --git a/src/metrics/perplexity-telemetry.spec.ts b/src/metrics/perplexity-telemetry.spec.ts index 9aa58ce..fea1273 100644 --- a/src/metrics/perplexity-telemetry.spec.ts +++ b/src/metrics/perplexity-telemetry.spec.ts @@ -4,7 +4,7 @@ import { SimpleSpanProcessor, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { _resetTracer, SpanStatusCode } from "../telemetry.js"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; // Set up OTel SDK const exporter = new InMemorySpanExporter(); @@ -35,6 +35,7 @@ describe("perplexity telemetry", () => { beforeEach(() => { exporter.reset(); _resetTracer(); + enableTelemetry(true); clearPerplexityCache(); mockFromPretrainedTokenizer.mockClear(); mockFromPretrainedModel.mockClear(); diff --git a/src/telemetry.spec.ts b/src/telemetry.spec.ts index dc10fa6..56de100 100644 --- a/src/telemetry.spec.ts +++ b/src/telemetry.spec.ts @@ -1,10 +1,15 @@ -import { beforeEach, describe, expect, it } from "@jest/globals"; +import { afterEach, beforeEach, describe, expect, it } from "@jest/globals"; import { InMemorySpanExporter, SimpleSpanProcessor, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { _resetTracer, SpanStatusCode, withSpan } from "./telemetry.js"; +import { + _resetTracer, + enableTelemetry, + SpanStatusCode, + withSpan, +} from "./telemetry.js"; // Set up a real OTel SDK so spans are captured const exporter = new InMemorySpanExporter(); @@ -16,6 +21,7 @@ describe("telemetry", () => { beforeEach(() => { exporter.reset(); _resetTracer(); + enableTelemetry(true); }); describe("withSpan", () => { @@ -99,4 +105,37 @@ describe("telemetry", () => { expect(child?.parentSpanId).toBe(parent?.spanContext().spanId); }); }); + + describe("enableTelemetry", () => { + afterEach(() => { + // Always re-enable so other tests aren't affected + enableTelemetry(true); + }); + + it("should execute callback but emit no span when disabled", async () => { + enableTelemetry(false); + _resetTracer(); + + const result = await withSpan("test.disabled", {}, async () => "hello"); + + expect(result).toBe("hello"); + expect(exporter.getFinishedSpans()).toHaveLength(0); + }); + + it("should restore span creation after re-enabling", async () => { + enableTelemetry(false); + _resetTracer(); + + await withSpan("while.disabled", {}, async () => "ignored"); + + enableTelemetry(true); + _resetTracer(); + + await withSpan("after.reenable", {}, async () => "tracked"); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].name).toBe("after.reenable"); + }); + }); }); diff --git a/src/telemetry.ts b/src/telemetry.ts index d44fb99..b023250 100644 --- a/src/telemetry.ts +++ b/src/telemetry.ts @@ -53,6 +53,30 @@ const noopTracer: Tracer = { }, }; +// ---------- Global toggle ---------- + +let telemetryEnabled = false; + +/** + * Enable or disable eval-kit's OpenTelemetry instrumentation globally. + * + * When disabled, all tracing functions return no-ops — identical to the + * behaviour when `@opentelemetry/api` is not installed. This lets you + * use OTel in your application without eval-kit emitting spans. + * + * Telemetry is **disabled** by default. + */ +export function enableTelemetry(enabled: boolean): void { + telemetryEnabled = enabled; +} + +/** + * Returns the current telemetry enabled state. + */ +export function isTelemetryEnabled(): boolean { + return telemetryEnabled; +} + // ---------- Lazy tracer resolution ---------- let resolvedTracer: Tracer | null = null; @@ -113,18 +137,20 @@ export const SpanStatusCode = { /** * Get the eval-kit tracer (async). Triggers resolution on first call. + * Returns the no-op tracer when telemetry is disabled. */ export async function getTracer(): Promise { + if (!telemetryEnabled) return noopTracer; return resolveTracer(); } /** * Get the cached tracer synchronously. Returns no-op if the tracer - * hasn't been resolved yet (i.e., no prior withSpan/getTracer call). - * Use this in hot paths where the tracer is guaranteed to be resolved - * by a parent span. + * hasn't been resolved yet (i.e., no prior withSpan/getTracer call) + * or if telemetry is disabled. */ export function getCachedTracer(): Tracer { + if (!telemetryEnabled) return noopTracer; return resolvedTracer ?? noopTracer; } @@ -146,6 +172,8 @@ export async function withSpan( options: WithSpanOptions, fn: (span: EvalKitSpan) => Promise, ): Promise { + if (!telemetryEnabled) return fn(noopSpan); + const tracer = await resolveTracer(); return tracer.startActiveSpan( name, From 5ec989045a5b01c554a39839e06d070ea86cea71 Mon Sep 17 00:00:00 2001 From: yltw27 Date: Tue, 3 Mar 2026 17:41:14 +0800 Subject: [PATCH 6/6] chore: add changeset for OpenTelemetry tracing feature Co-Authored-By: Claude Opus 4.6 --- .changeset/add-telemetry-toggle.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .changeset/add-telemetry-toggle.md diff --git a/.changeset/add-telemetry-toggle.md b/.changeset/add-telemetry-toggle.md new file mode 100644 index 0000000..4d6d28a --- /dev/null +++ b/.changeset/add-telemetry-toggle.md @@ -0,0 +1,11 @@ +--- +"@loveholidays/eval-kit": minor +--- + +Add optional OpenTelemetry tracing support for evaluations, batch processing, and async metrics. + +- Emit spans for `Evaluator.evaluate`, `BatchEvaluator.evaluate`, row processing, retries, and BERT/perplexity metrics +- `@opentelemetry/api` is an optional peer dependency — zero overhead when not installed +- Telemetry is disabled by default; call `enableTelemetry(true)` to opt in +- `isTelemetryEnabled()` getter for reading the current state +- Exported `withSpan` helper for custom evaluator instrumentation