diff --git a/.changeset/add-telemetry-toggle.md b/.changeset/add-telemetry-toggle.md
new file mode 100644
index 0000000..4d6d28a
--- /dev/null
+++ b/.changeset/add-telemetry-toggle.md
@@ -0,0 +1,11 @@
+---
+"@loveholidays/eval-kit": minor
+---
+
+Add optional OpenTelemetry tracing support for evaluations, batch processing, and async metrics.
+
+- Emit spans for `Evaluator.evaluate`, `BatchEvaluator.evaluate`, row processing, retries, and BERT/perplexity metrics
+- `@opentelemetry/api` is an optional peer dependency — zero overhead when not installed
+- Telemetry is disabled by default; call `enableTelemetry(true)` to opt in
+- `isTelemetryEnabled()` getter for reading the current state
+- Exported `withSpan` helper for custom evaluator instrumentation
diff --git a/README.md b/README.md
index e0197df..d45ebde 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ A TypeScript SDK for evaluating content quality using traditional metrics and AI
 - **Traditional Metrics**: BLEU, TER, BERTScore, Coherence, Perplexity
 - **AI-Powered Evaluation**: LLM-based evaluator with prompt templating (via Vercel AI SDK)
 - **Batch Processing**: Concurrent execution, progress tracking, retry logic, CSV/JSON export
+- **OpenTelemetry Tracing**: Optional distributed tracing with zero overhead when disabled
 
 ## Installation
 
@@ -87,6 +88,31 @@ await batchEvaluator.export({
 });
 ```
 
+### OpenTelemetry Tracing (Optional)
+
+eval-kit has built-in OpenTelemetry support. Install the optional peer dependency to enable distributed tracing with your existing observability stack (Jaeger, Grafana Tempo, Datadog, etc.):
+
+```bash
+npm install @opentelemetry/api @opentelemetry/sdk-trace-node
+```
+
+Configure your OTel SDK as usual, then opt in with `enableTelemetry(true)` — eval-kit will emit spans from that point on:
+
+```typescript
+import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import { SimpleSpanProcessor, ConsoleSpanExporter } from '@opentelemetry/sdk-trace-base';
+import { enableTelemetry } from '@loveholidays/eval-kit';
+
+// Set up OTel before using eval-kit
+const provider = new NodeTracerProvider();
+provider.addSpanProcessor(new SimpleSpanProcessor(new ConsoleSpanExporter()));
+provider.register();
+
+// eval-kit tracing is disabled by default — opt in once at startup
+enableTelemetry(true);
+
+// Now all eval-kit operations produce traces
+const result = await batchEvaluator.evaluate({ filePath: './data.csv' });
+```
+
+When `@opentelemetry/api` is not installed, all tracing is a no-op with zero overhead. See the [Telemetry Guide](./docs/TELEMETRY.md) for span details and custom evaluator instrumentation.
+
 ## Documentation
 
 | Guide | Description |
@@ -95,6 +121,7 @@
 | [Evaluator](./docs/EVALUATOR.md) | AI-powered evaluation and scoring |
 | [Batch Evaluation](./docs/BATCH_EVALUATION_GUIDE.md) | Concurrent processing, progress tracking |
 | [Export](./docs/EXPORT_GUIDE.md) | CSV and JSON export options |
+| [Telemetry](./docs/TELEMETRY.md) | OpenTelemetry tracing and observability |
 
 ## Supported LLM Providers
diff --git a/docs/TELEMETRY.md b/docs/TELEMETRY.md
new file mode 100644
index 0000000..380259e
--- /dev/null
+++ b/docs/TELEMETRY.md
@@ -0,0 +1,217 @@
+# Telemetry Guide
+
+eval-kit provides built-in [OpenTelemetry](https://opentelemetry.io/) tracing. When enabled, it emits spans for evaluations, batch processing, retries, and async metrics — giving you visibility into per-row latency, token usage, retry behavior, and where time is spent.
+
+## Setup
+
+### 1. Install dependencies
+
+`@opentelemetry/api` is an optional peer dependency. Install it along with an SDK and exporter:
+
+```bash
+npm install @opentelemetry/api @opentelemetry/sdk-trace-node @opentelemetry/sdk-trace-base
+```
+
+### 2. Configure the OTel SDK
+
+Set up a tracer provider **before** calling any eval-kit functions:
+
+```typescript
+import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import { SimpleSpanProcessor, ConsoleSpanExporter } from '@opentelemetry/sdk-trace-base';
+
+const provider = new NodeTracerProvider();
+provider.addSpanProcessor(new SimpleSpanProcessor(new ConsoleSpanExporter()));
+provider.register();
+```
+
+For production, replace `ConsoleSpanExporter` with your backend exporter (Jaeger, OTLP, Zipkin, etc.).
+
+### 3. Opt in, then use eval-kit as normal
+
+eval-kit's telemetry is **disabled by default**. Enable it once at startup (see the next section); after that, eval-kit detects the registered provider and emits spans automatically, with no other code changes needed.
+
+## Enabling and Disabling Telemetry
+
+```typescript
+import { enableTelemetry } from '@loveholidays/eval-kit';
+
+// Enable eval-kit tracing
+enableTelemetry(true);
+
+// Disable again if needed (OTel remains active for the rest of your app)
+enableTelemetry(false);
+```
+
+When disabled, all tracing functions return no-ops with zero overhead — the same behaviour as when `@opentelemetry/api` is not installed. Use `isTelemetryEnabled()` to read the current state.
+
+## Zero Overhead When Not Installed
+
+When `@opentelemetry/api` is not installed, eval-kit uses no-op stubs internally. All tracing calls become empty function calls that are cheap enough for the JS engine to optimize away, so in practice there is no measurable performance impact.
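+
+For intuition, the detection works roughly along these lines. This is a simplified sketch, not eval-kit's actual source, and `loadTracer` is a hypothetical name:
+
+```typescript
+import type { Tracer } from '@opentelemetry/api';
+
+// Sketch: resolve the optional peer dependency at runtime.
+// If the import fails, callers fall back to no-op stubs.
+async function loadTracer(): Promise<Tracer | undefined> {
+  try {
+    const api = await import('@opentelemetry/api');
+    return api.trace.getTracer('eval-kit');
+  } catch {
+    return undefined; // @opentelemetry/api not installed
+  }
+}
+```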
+
+## Span Hierarchy
+
+### Batch evaluation
+
+```
+eval-kit.batch.evaluate
+├── eval-kit.batch.parse_input (file-based input only)
+├── eval-kit.batch.process_row (per row)
+│   ├── [retry event] (on retry attempts)
+│   └── eval-kit.batch.run_evaluators
+│       └── eval-kit.evaluator.evaluate (per evaluator)
+└── eval-kit.batch.export (when export() is called)
+```
+
+### Single evaluation
+
+```
+eval-kit.evaluator.evaluate
+```
+
+### Async metrics (standalone)
+
+```
+eval-kit.metric.bert_score
+eval-kit.metric.perplexity
+```
+
+## Span Attributes
+
+### `eval-kit.evaluator.evaluate`
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `eval_kit.evaluator.name` | string | Evaluator name |
+| `eval_kit.model.id` | string | Model identifier |
+| `eval_kit.score_config.type` | string | `"numeric"` or `"categorical"` |
+| `eval_kit.input.candidate_text_length` | number | Length of input text |
+| `eval_kit.result.score` | number/string | Evaluation score |
+| `eval_kit.result.execution_time_ms` | number | Wall-clock time |
+| `eval_kit.result.token_usage.input` | number | Input tokens consumed |
+| `eval_kit.result.token_usage.output` | number | Output tokens generated |
+| `eval_kit.result.token_usage.total` | number | Total tokens |
+| `eval_kit.result.error` | string | Error message (on failure) |
+
+### `eval-kit.batch.evaluate`
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `eval_kit.batch.id` | string | Unique batch ID |
+| `eval_kit.batch.concurrency` | number | Max concurrent rows |
+| `eval_kit.batch.execution_mode` | string | `"parallel"` or `"sequential"` |
+| `eval_kit.batch.total_rows` | number | Total rows in input |
+| `eval_kit.batch.successful_rows` | number | Rows completed successfully |
+| `eval_kit.batch.failed_rows` | number | Rows that failed |
+
+### `eval-kit.batch.process_row`
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `eval_kit.row.id` | string | Row identifier |
+| `eval_kit.row.index` | number | Row index in input |
+| `eval_kit.row.duration_ms` | number | Total time including retries |
+| `eval_kit.row.retry_count` | number | Number of retry attempts |
+| `eval_kit.result.error` | string | Error message (on failure) |
+
+**Retry events** are recorded on this span with name `retry`:
+
+| Event Attribute | Type | Description |
+|----------------|------|-------------|
+| `eval_kit.retry.attempt` | number | Retry attempt number (1-based) |
+| `eval_kit.retry.delay_ms` | number | Delay before retry |
+| `eval_kit.retry.error` | string | Error that triggered the retry |
+
+### `eval-kit.batch.run_evaluators`
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `eval_kit.evaluator_count` | number | Number of evaluators |
+| `eval_kit.execution_mode` | string | `"parallel"` or `"sequential"` |
+
+### `eval-kit.batch.parse_input`
+
+Only created for file-based input (not in-memory data).
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `eval_kit.parse.input_format` | string | `"file"` |
+| `eval_kit.parse.row_count` | number | Number of parsed rows |
+
+### `eval-kit.batch.export`
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `eval_kit.export.format` | string | `"csv"` or `"json"` |
+| `eval_kit.export.row_count` | number | Number of exported rows |
+
+### `eval-kit.metric.bert_score` / `eval-kit.metric.perplexity`
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `eval_kit.metric.name` | string | Metric name |
+| `eval_kit.metric.model` | string | Model used |
+| `eval_kit.result.score` | number | Computed score |
+
+A `model_loaded` event is recorded when the model is loaded for the first time (cache miss).
+
+## Custom Evaluator Instrumentation
+
+If you implement `IEvaluator` and want your spans to appear in the trace hierarchy, use the exported `withSpan` helper:
+
+```typescript
+import { withSpan, type IEvaluator, type EvaluatorResult } from '@loveholidays/eval-kit';
+
+const myEvaluator: IEvaluator = {
+  name: 'custom-eval',
+  async evaluate(input) {
+    return withSpan(
+      'my-app.custom-eval',
+      { attributes: { 'my_app.evaluator.name': 'custom-eval' } },
+      async (span) => {
+        // Your evaluation logic here
+        const score = await computeScore(input.candidateText);
+
+        span.setAttribute('my_app.result.score', score);
+        return {
+          evaluatorName: 'custom-eval',
+          score,
+          feedback: 'Custom evaluation',
+          processingStats: { executionTime: 0 },
+        };
+      },
+    );
+  },
+};
+```
+
+Your `my-app.custom-eval` span will automatically appear as a child of `eval-kit.batch.run_evaluators` when used in a batch evaluation, because `withSpan` uses the active async context.
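+
+You can also check `isTelemetryEnabled()` before computing expensive attributes, so the work is skipped when tracing is off. A small sketch under assumed names — `countTokens` and `scoreText` stand in for your own helpers:
+
+```typescript
+import { isTelemetryEnabled, withSpan } from '@loveholidays/eval-kit';
+
+async function scoreWithTracing(text: string): Promise<number> {
+  return withSpan('my-app.scoring', {}, async (span) => {
+    if (isTelemetryEnabled()) {
+      // Only pay for the token count when a span will actually be recorded
+      span.setAttribute('my_app.input.token_count', countTokens(text));
+    }
+    return scoreText(text);
+  });
+}
+```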
+
+## Example: Full Setup with Jaeger
+
+```typescript
+import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
+import { Resource } from '@opentelemetry/resources';
+import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-base';
+import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions';
+import { BatchEvaluator, enableTelemetry, Evaluator } from '@loveholidays/eval-kit';
+
+// Configure OTel
+const provider = new NodeTracerProvider({
+  resource: new Resource({ [ATTR_SERVICE_NAME]: 'my-eval-pipeline' }),
+});
+provider.addSpanProcessor(
+  new BatchSpanProcessor(new OTLPTraceExporter({ url: 'http://localhost:4318/v1/traces' }))
+);
+provider.register();
+
+// Opt in to eval-kit tracing (disabled by default)
+enableTelemetry(true);
+
+// Run evaluation — spans are exported to Jaeger automatically
+// (`evaluator` is any Evaluator instance you have configured)
+const batch = new BatchEvaluator({ evaluators: [evaluator], concurrency: 10 });
+const result = await batch.evaluate({ filePath: './data.csv' });
+
+// Flush spans before exit
+await provider.shutdown();
+```
diff --git a/package.json b/package.json
index a5fe3ee..ac309ed 100644
--- a/package.json
+++ b/package.json
@@ -63,9 +63,20 @@
       "lodash": ">=4.17.23"
     }
   },
+  "peerDependencies": {
+    "@opentelemetry/api": "^1.0.0"
+  },
+  "peerDependenciesMeta": {
+    "@opentelemetry/api": {
+      "optional": true
+    }
+  },
   "devDependencies": {
     "@biomejs/biome": "^2.3.7",
     "@changesets/cli": "^2.29.8",
+    "@opentelemetry/api": "^1.9.0",
+    "@opentelemetry/sdk-trace-base": "^1.30.0",
+    "@opentelemetry/sdk-trace-node": "^1.30.0",
     "@types/jest": "^30.0.0",
     "@types/node": "^24.10.1",
     "jest": "^30.2.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index fdedad0..92ad58a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -37,6 +37,15 @@ importers:
       '@changesets/cli':
         specifier: ^2.29.8
         version: 2.29.8(@types/node@24.10.1)
+      '@opentelemetry/api':
+        specifier: ^1.9.0
+        version: 1.9.0
+      '@opentelemetry/sdk-trace-base':
+        specifier: ^1.30.0
+        version: 1.30.1(@opentelemetry/api@1.9.0)
+      '@opentelemetry/sdk-trace-node':
+        specifier: ^1.30.0
+        version: 1.30.1(@opentelemetry/api@1.9.0)
       '@types/jest':
         specifier: ^30.0.0
         version: 30.0.0
@@ -688,6 +697,52 @@ packages:
     resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==}
     engines: {node: '>=8.0.0'}
 
+  '@opentelemetry/context-async-hooks@1.30.1':
+    resolution: {integrity: sha512-s5vvxXPVdjqS3kTLKMeBMvop9hbWkwzBpu+mUO2M7sZtlkyDJGwFe33wRKnbaYDo8ExRVBIIdwIGrqpxHuKttA==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/core@1.30.1':
+    resolution: {integrity: sha512-OOCM2C/QIURhJMuKaekP3TRBxBKxG/TWWA0TL2J6nXUtDnuCtccy49LUJF8xPFXMX+0LMcxFpCo8M9cGY1W6rQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/propagator-b3@1.30.1':
+    resolution: {integrity: sha512-oATwWWDIJzybAZ4pO76ATN5N6FFbOA1otibAVlS8v90B4S1wClnhRUk7K+2CHAwN1JKYuj4jh/lpCEG5BAqFuQ==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/propagator-jaeger@1.30.1':
+    resolution: {integrity: sha512-Pj/BfnYEKIOImirH76M4hDaBSx6HyZ2CXUqk+Kj02m6BB80c/yo4BdWkn/1gDFfU+YPY+bPR2U0DKBfdxCKwmg==}
+    engines: {node: '>=14'}
+    peerDependencies:
+      '@opentelemetry/api': '>=1.0.0 <1.10.0'
+
+  '@opentelemetry/resources@1.30.1':
+    resolution: {integrity: sha512-5UxZqiAgLYGFjS4s9qm5mBVo433u+dSPUFWVWXmLAD4wB65oMCoXaJP1KJa9DIYYMeHu3z4BZcStG3LC593cWA==}
+    engines: {node: '>=14'}
+    peerDependencies:
'@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-base@1.30.1': + resolution: {integrity: sha512-jVPgBbH1gCy2Lb7X0AVQ8XAfgg0pJ4nvl8/IiQA6nxOsPvS+0zMJaFSs2ltXe0J6C8dqjcnpyqINDJmU30+uOg==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/sdk-trace-node@1.30.1': + resolution: {integrity: sha512-cBjYOINt1JxXdpw1e5MlHmFRc5fgj4GW/86vsKFxJCJ8AL4PdVtYH41gWwl4qd4uQjqEL1oJVrXkSy5cnduAnQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/semantic-conventions@1.28.0': + resolution: {integrity: sha512-lp4qAiMTD4sNWW4DbKLBkfiMZ4jbAboJIGOQr5DvciMRI494OapieI9qiODpOt0XBr1LjIDy1xAGAnVs5supTA==} + engines: {node: '>=14'} + '@pkgjs/parseargs@0.11.0': resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} @@ -3403,6 +3458,50 @@ snapshots: '@opentelemetry/api@1.9.0': {} + '@opentelemetry/context-async-hooks@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + + '@opentelemetry/core@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/semantic-conventions': 1.28.0 + + '@opentelemetry/propagator-b3@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + + '@opentelemetry/propagator-jaeger@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + + '@opentelemetry/resources@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.28.0 + + '@opentelemetry/sdk-trace-base@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/semantic-conventions': 1.28.0 + + '@opentelemetry/sdk-trace-node@1.30.1(@opentelemetry/api@1.9.0)': + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/context-async-hooks': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/core': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-b3': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/propagator-jaeger': 1.30.1(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-trace-base': 1.30.1(@opentelemetry/api@1.9.0) + semver: 7.7.3 + + '@opentelemetry/semantic-conventions@1.28.0': {} + '@pkgjs/parseargs@0.11.0': optional: true diff --git a/src/batch/batch-evaluator-telemetry.spec.ts b/src/batch/batch-evaluator-telemetry.spec.ts new file mode 100644 index 0000000..f3eb1ae --- /dev/null +++ b/src/batch/batch-evaluator-telemetry.spec.ts @@ -0,0 +1,187 @@ +import { beforeEach, describe, expect, it } from "@jest/globals"; +import type { ReadableSpan } from "@opentelemetry/sdk-trace-base"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; +import type { + EvaluationInput, + EvaluatorResult, + IEvaluator, +} from "../types/evaluator.js"; +import { BatchEvaluator } from "./batch-evaluator.js"; + +// Set up OTel SDK for span capture +const exporter = new InMemorySpanExporter(); +const provider = new 
NodeTracerProvider();
+provider.addSpanProcessor(new SimpleSpanProcessor(exporter));
+provider.register();
+
+const createMockEvaluator = (name: string): IEvaluator => ({
+  name,
+  evaluate: async (_input: EvaluationInput): Promise<EvaluatorResult> => ({
+    evaluatorName: name,
+    model: "mock-model",
+    score: 85,
+    feedback: "Good",
+    processingStats: {
+      executionTime: 10,
+      tokenUsage: { inputTokens: 50, outputTokens: 10, totalTokens: 60 },
+    },
+  }),
+});
+
+async function runBatch(
+  evaluators: IEvaluator[],
+  data: { candidateText: string; id?: string }[],
+  config?: {
+    concurrency?: number;
+    retryConfig?: {
+      maxRetries: number;
+      retryDelay: number;
+      exponentialBackoff?: boolean;
+    };
+  },
+) {
+  const batch = new BatchEvaluator({
+    evaluators,
+    concurrency: config?.concurrency ?? 1,
+    retryConfig: config?.retryConfig,
+  });
+  await batch.evaluate({ data });
+  return exporter.getFinishedSpans();
+}
+
+function getSpan(spans: ReadableSpan[], name: string): ReadableSpan {
+  const span = spans.find((s) => s.name === name);
+  expect(span).toBeDefined();
+  return span as ReadableSpan;
+}
+
+describe("BatchEvaluator telemetry", () => {
+  beforeEach(() => {
+    exporter.reset();
+    _resetTracer();
+    enableTelemetry(true);
+  });
+
+  it("should create span hierarchy: batch → process_row → run_evaluators", async () => {
+    const spans = await runBatch(
+      [createMockEvaluator("fluency")],
+      [{ candidateText: "Hello world" }],
+    );
+
+    expect(spans.map((s) => s.name).sort()).toEqual([
+      "eval-kit.batch.evaluate",
+      "eval-kit.batch.process_row",
+      "eval-kit.batch.run_evaluators",
+    ]);
+
+    const batchSpan = getSpan(spans, "eval-kit.batch.evaluate");
+    const rowSpan = getSpan(spans, "eval-kit.batch.process_row");
+    const evalSpan = getSpan(spans, "eval-kit.batch.run_evaluators");
+
+    expect(rowSpan.parentSpanId).toBe(batchSpan.spanContext().spanId);
+    expect(evalSpan.parentSpanId).toBe(rowSpan.spanContext().spanId);
+  });
+
+  it("should set correct attributes on batch span", async () => {
+    const spans = await runBatch(
+      [createMockEvaluator("fluency")],
+      [{ candidateText: "Row 1" }, { candidateText: "Row 2" }],
+      { concurrency: 3 },
+    );
+
+    const span = getSpan(spans, "eval-kit.batch.evaluate");
+    expect(span.attributes["eval_kit.batch.concurrency"]).toBe(3);
+    expect(span.attributes["eval_kit.batch.total_rows"]).toBe(2);
+    expect(span.attributes["eval_kit.batch.successful_rows"]).toBe(2);
+    expect(span.attributes["eval_kit.batch.failed_rows"]).toBe(0);
+    expect(span.status.code).toBe(SpanStatusCode.OK);
+  });
+
+  it("should set correct attributes on process_row span", async () => {
+    const spans = await runBatch(
+      [createMockEvaluator("accuracy")],
+      [{ candidateText: "Test row", id: "custom-id" }],
+    );
+
+    const span = getSpan(spans, "eval-kit.batch.process_row");
+    expect(span.attributes["eval_kit.row.id"]).toBe("custom-id");
+    expect(span.attributes["eval_kit.row.index"]).toBe(0);
+    expect(span.attributes["eval_kit.row.retry_count"]).toBe(0);
+    expect(span.attributes["eval_kit.row.duration_ms"]).toBeGreaterThanOrEqual(
+      0,
+    );
+    expect(span.status.code).toBe(SpanStatusCode.OK);
+  });
+
+  it("should record retry events on process_row span", async () => {
+    let callCount = 0;
+    const flaky: IEvaluator = {
+      name: "flaky",
+      evaluate: async () => {
+        callCount++;
+        if (callCount <= 2) throw new Error("ECONNRESET");
+        return {
+          evaluatorName: "flaky",
+          score: 80,
+          feedback: "OK",
+          processingStats: { executionTime: 5 },
+        };
+      },
+    };
+
+    const spans = await runBatch([flaky], [{ candidateText: "Test"
}], { + retryConfig: { maxRetries: 3, retryDelay: 1, exponentialBackoff: false }, + }); + + const span = getSpan(spans, "eval-kit.batch.process_row"); + const retryEvents = span.events.filter((e) => e.name === "retry"); + + expect(retryEvents).toHaveLength(2); + expect(retryEvents[0].attributes?.["eval_kit.retry.attempt"]).toBe(1); + expect(retryEvents[0].attributes?.["eval_kit.retry.error"]).toBe( + "ECONNRESET", + ); + expect(retryEvents[1].attributes?.["eval_kit.retry.attempt"]).toBe(2); + expect(span.attributes["eval_kit.row.retry_count"]).toBe(2); + expect(span.status.code).toBe(SpanStatusCode.OK); + }); + + it("should set error status on process_row span when all retries exhausted", async () => { + const failing: IEvaluator = { + name: "failing", + evaluate: async () => { + throw new Error("ECONNRESET"); + }, + }; + + const spans = await runBatch([failing], [{ candidateText: "Test" }], { + retryConfig: { maxRetries: 1, retryDelay: 1 }, + }); + + const span = getSpan(spans, "eval-kit.batch.process_row"); + expect(span.status.code).toBe(SpanStatusCode.ERROR); + expect(span.attributes["eval_kit.result.error"]).toBe("ECONNRESET"); + expect(span.attributes["eval_kit.row.retry_count"]).toBe(1); + }); + + it("should skip parse_input span for in-memory data", async () => { + const spans = await runBatch( + [createMockEvaluator("test")], + [ + { candidateText: "Row 1" }, + { candidateText: "Row 2" }, + { candidateText: "Row 3" }, + ], + ); + + const parseSpan = spans.find( + (s) => s.name === "eval-kit.batch.parse_input", + ); + expect(parseSpan).toBeUndefined(); + }); +}); diff --git a/src/batch/batch-evaluator.ts b/src/batch/batch-evaluator.ts index 87a0117..d71b210 100644 --- a/src/batch/batch-evaluator.ts +++ b/src/batch/batch-evaluator.ts @@ -1,4 +1,10 @@ import { randomUUID } from "node:crypto"; +import { + type EvalKitSpan, + getCachedTracer, + SpanStatusCode, + withSpan, +} from "../telemetry.js"; import type { EvaluatorResult, IEvaluator } from "../types/evaluator.js"; import { ConcurrencyManager } from "./concurrency-manager.js"; import { CsvExporter } from "./exporters/csv-exporter.js"; @@ -43,80 +49,125 @@ export class BatchEvaluator { * Run batch evaluation on input data */ async evaluate(inputConfig: BatchInputConfig): Promise { - // Parse input - const allRows = await this.parseInput(inputConfig); + return withSpan( + "eval-kit.batch.evaluate", + { + attributes: { + "eval_kit.batch.id": this.batchId, + "eval_kit.batch.concurrency": this.config.concurrency ?? 5, + "eval_kit.batch.execution_mode": + this.config.evaluatorExecutionMode ?? "parallel", + }, + }, + async (span) => { + // Parse input + const allRows = await this.parseInput(inputConfig); - // Handle startIndex for resuming from a specific position - const startIndex = inputConfig.startIndex ?? 0; - const rows = startIndex > 0 ? allRows.slice(startIndex) : allRows; + span.setAttribute("eval_kit.batch.total_rows", allRows.length); - // Mark rows before startIndex as already processed (for accurate progress tracking) - for (let i = 0; i < startIndex; i++) { - this.processedRowIndices.add(i); - } + // Handle startIndex for resuming from a specific position + const startIndex = inputConfig.startIndex ?? 0; + const rows = startIndex > 0 ? 
allRows.slice(startIndex) : allRows; - // Initialize progress tracker (use total rows for accurate percentage) - this.progressTracker = new ProgressTracker({ - totalRows: allRows.length, - emitInterval: this.config.progressInterval, - onProgress: this.config.onProgress, - }); - this.progressTracker.start(); - - // Fast-forward progress tracker for skipped rows - if (startIndex > 0) { - this.progressTracker.skipRows(startIndex); - } + // Mark rows before startIndex as already processed (for accurate progress tracking) + for (let i = 0; i < startIndex; i++) { + this.processedRowIndices.add(i); + } - // Process rows with controlled concurrency - // We batch into chunks to avoid creating all promises at once - const maxConcurrency = this.config.concurrency ?? 5; - const batchSize = maxConcurrency * 2; // Process in batches of 2x concurrency + // Initialize progress tracker (use total rows for accurate percentage) + this.progressTracker = new ProgressTracker({ + totalRows: allRows.length, + emitInterval: this.config.progressInterval, + onProgress: this.config.onProgress, + }); + this.progressTracker.start(); + + // Fast-forward progress tracker for skipped rows + if (startIndex > 0) { + this.progressTracker.skipRows(startIndex); + } - for (let i = 0; i < rows.length; i += batchSize) { - const batch = rows.slice(i, i + batchSize); - const batchPromises = batch.map((row, batchIndex) => - // Adjust index to account for startIndex offset - this.processRow(row, startIndex + i + batchIndex), - ); - await Promise.all(batchPromises); - } + // Process rows with controlled concurrency + // We batch into chunks to avoid creating all promises at once + const maxConcurrency = this.config.concurrency ?? 5; + const batchSize = maxConcurrency * 2; // Process in batches of 2x concurrency - // Mark completion - this.progressTracker.complete(); + for (let i = 0; i < rows.length; i += batchSize) { + const batch = rows.slice(i, i + batchSize); + const batchPromises = batch.map((row, batchIndex) => + // Adjust index to account for startIndex offset + this.processRow(row, startIndex + i + batchIndex), + ); + await Promise.all(batchPromises); + } - // Return results - return this.buildResult(); + // Mark completion + this.progressTracker.complete(); + + // Return results + const result = this.buildResult(); + span.setAttribute( + "eval_kit.batch.successful_rows", + result.successfulRows, + ); + span.setAttribute("eval_kit.batch.failed_rows", result.failedRows); + return result; + }, + ); } /** * Export results to a destination */ async export(exportConfig: BatchExportConfig): Promise { - const exporter = this.getExporter(exportConfig.format); - await exporter.export(this.results, exportConfig); + return withSpan( + "eval-kit.batch.export", + { + attributes: { + "eval_kit.export.format": exportConfig.format, + "eval_kit.export.row_count": this.results.length, + }, + }, + async () => { + const exporter = this.getExporter(exportConfig.format); + await exporter.export(this.results, exportConfig); + }, + ); } /** * Parse input - supports both file-based and in-memory data */ private async parseInput(config: BatchInputConfig): Promise { - // Check if it's in-memory data config + // In-memory data — no I/O, skip span if ("data" in config) { return config.data; } - // File-based config - const fileConfig = config as BatchInputFileConfig; - let format = fileConfig.format; + // File-based config — actual I/O worth tracing + return withSpan( + "eval-kit.batch.parse_input", + { + attributes: { + "eval_kit.parse.input_format": 
"file", + }, + }, + async (span) => { + const fileConfig = config as BatchInputFileConfig; + let format = fileConfig.format; - // Handle "auto" format detection - if (!format || format === "auto") { - format = this.detectFormat(fileConfig.filePath); - } + // Handle "auto" format detection + if (!format || format === "auto") { + format = this.detectFormat(fileConfig.filePath); + } + + const parser = this.getParser(format); + const rows = await parser.parse(fileConfig); - const parser = this.getParser(format); - return parser.parse(fileConfig); + span.setAttribute("eval_kit.parse.row_count", rows.length); + return rows; + }, + ); } /** @@ -169,158 +220,245 @@ export class BatchEvaluator { await this.concurrencyManager.run(async () => { const rowId = row.id ?? `row-${index}`; - const startTime = Date.now(); - let retryCount = 0; - const maxRetries = this.config.retryConfig?.maxRetries ?? 3; - - // Merge default input fields with row data - // Row data takes precedence over defaults - const inputData: BatchInputRow = { - ...this.config.defaultInput, - ...row, - }; - - // Loop allows: 1 initial attempt + maxRetries retry attempts - // With maxRetries=3: attempts at retryCount 0,1,2,3 = 4 total attempts - while (true) { - try { - // Run evaluators - const evaluatorResults = await this.runEvaluators(inputData); - - // Calculate duration - const durationMs = Date.now() - startTime; - - // Calculate total tokens used - const tokensUsed = evaluatorResults.reduce( - (sum, r) => sum + (r.processingStats.tokenUsage?.totalTokens ?? 0), - 0, - ); + const tracer = getCachedTracer(); + + await tracer.startActiveSpan( + "eval-kit.batch.process_row", + { + attributes: { "eval_kit.row.id": rowId, "eval_kit.row.index": index }, + }, + (span: EvalKitSpan) => this.runRowWithSpan(row, index, span), + ); + }); + } - // Create result (use merged inputData so defaults are included) - const result: BatchEvaluationResult = { - rowId, - rowIndex: index, - input: inputData, - results: evaluatorResults, - timestamp: new Date().toISOString(), - durationMs, - retryCount, - }; - - // Call onResult callback for streaming results - if (this.config.onResult) { - const callbackResult = this.config.onResult(result); - if (callbackResult instanceof Promise) { - await callbackResult; - } - } + private async runRowWithSpan( + row: BatchInputRow, + index: number, + span: EvalKitSpan, + ): Promise { + const startTime = Date.now(); + let spanError: string | undefined; + const inputData = { ...this.config.defaultInput, ...row }; + const rowId = row.id ?? `row-${index}`; + + try { + await this.executeRowWithRetry({ + inputData, + row, + index, + rowId, + span, + startTime, + }); + } catch (error) { + spanError = error instanceof Error ? error.message : String(error); + span.recordException(error instanceof Error ? error : spanError); + } finally { + span.setAttribute("eval_kit.row.duration_ms", Date.now() - startTime); + if (spanError) { + span.setAttribute("eval_kit.result.error", spanError); + span.setStatus({ code: SpanStatusCode.ERROR, message: spanError }); + } else { + span.setStatus({ code: SpanStatusCode.OK }); + } + span.end(); + } + } - // Store result in memory - this.results.push(result); - this.processedRowIndices.add(index); + private async executeRowWithRetry(ctx: { + inputData: BatchInputRow; + row: BatchInputRow; + index: number; + rowId: string; + span: EvalKitSpan; + startTime: number; + }): Promise { + let retryCount = 0; + const maxRetries = this.config.retryConfig?.maxRetries ?? 
3; + let lastError: unknown; + + while (true) { + try { + await this.executeRowEvaluation({ ...ctx, retryCount }); + ctx.span.setAttribute("eval_kit.row.retry_count", retryCount); + return; + } catch (error) { + lastError = error; + const errorCtx = { + error, + span: ctx.span, + startTime: ctx.startTime, + retryCount, + maxRetries, + rowId: ctx.rowId, + index: ctx.index, + row: ctx.row, + }; + const shouldRetry = await this.handleRowError(errorCtx); + if (!shouldRetry) { + ctx.span.setAttribute("eval_kit.row.retry_count", retryCount); + throw lastError; + } + retryCount++; + } + } + } - // Update progress - this.progressTracker?.recordSuccess(durationMs, tokensUsed); + private async handleRowError(ctx: { + error: unknown; + span: EvalKitSpan; + startTime: number; + retryCount: number; + maxRetries: number; + rowId: string; + index: number; + row: BatchInputRow; + }): Promise { + const errorMessage = + ctx.error instanceof Error ? ctx.error.message : String(ctx.error); + const shouldRetry = this.shouldRetry( + errorMessage, + ctx.retryCount, + ctx.maxRetries, + ); - return; // Success, exit retry loop - } catch (error) { - const errorMessage = - error instanceof Error ? error.message : String(error); + if (shouldRetry) { + const nextAttempt = ctx.retryCount + 1; + this.progressTracker?.recordRetry(errorMessage, nextAttempt); + const delay = this.calculateRetryDelay(nextAttempt); - // Check if we should retry - const shouldRetry = this.shouldRetry( - errorMessage, - retryCount, - maxRetries, - ); + ctx.span.addEvent("retry", { + "eval_kit.retry.attempt": nextAttempt, + "eval_kit.retry.delay_ms": delay, + "eval_kit.retry.error": errorMessage, + }); - if (shouldRetry) { - retryCount++; - this.progressTracker?.recordRetry(errorMessage, retryCount); - - // Calculate delay with exponential backoff - const baseDelay = this.config.retryConfig?.retryDelay ?? 1000; - const delay = this.config.retryConfig?.exponentialBackoff - ? 
baseDelay * 2 ** (retryCount - 1) - : baseDelay; - - await this.sleep(delay); - } else { - // Max retries reached or non-retryable error - const durationMs = Date.now() - startTime; - - const result: BatchEvaluationResult = { - rowId, - rowIndex: index, - input: row, - results: [], - timestamp: new Date().toISOString(), - durationMs, - retryCount, - error: errorMessage, - }; - - this.results.push(result); - this.processedRowIndices.add(index); - this.progressTracker?.recordFailure(durationMs); - - // Stop on error if configured - if (this.config.stopOnError) { - throw new Error( - `Stopping batch evaluation due to error: ${errorMessage}`, - ); - } - - return; // Exit retry loop - } - } - } + await this.sleep(delay); + return true; + } + + const durationMs = Date.now() - ctx.startTime; + this.results.push({ + rowId: ctx.rowId, + rowIndex: ctx.index, + input: ctx.row, + results: [], + timestamp: new Date().toISOString(), + durationMs, + retryCount: ctx.retryCount, + error: errorMessage, }); + + this.processedRowIndices.add(ctx.index); + this.progressTracker?.recordFailure(durationMs); + + if (this.config.stopOnError) { + throw new Error( + `Stopping batch evaluation due to error: ${errorMessage}`, + ); + } + + return false; } - /** - * Run all evaluators on a single row - */ - private async runEvaluators(row: BatchInputRow): Promise { - const input = { - candidateText: row.candidateText, - prompt: row.prompt, - referenceText: row.referenceText, - sourceText: row.sourceText, - contentType: row.contentType, - language: row.language, - }; + private calculateRetryDelay(attempt: number): number { + const baseDelay = this.config.retryConfig?.retryDelay ?? 1000; + return this.config.retryConfig?.exponentialBackoff + ? baseDelay * 2 ** (attempt - 1) + : baseDelay; + } - // Apply timeout if configured - const timeout = this.config.timeout; - const evaluateWithTimeout = async (evaluator: IEvaluator) => { - if (timeout) { - return Promise.race([ - evaluator.evaluate(input), - this.timeoutPromise( - timeout, - `Evaluator ${evaluator.name} timed out after ${timeout}ms`, - ), - ]); - } - return evaluator.evaluate(input); + private async executeRowEvaluation(ctx: { + inputData: BatchInputRow; + index: number; + rowId: string; + startTime: number; + retryCount: number; + }): Promise { + const evaluatorResults = await this.runEvaluators(ctx.inputData); + const durationMs = Date.now() - ctx.startTime; + const tokensUsed = evaluatorResults.reduce( + (sum, r) => sum + (r.processingStats.tokenUsage?.totalTokens ?? 
0), + 0, + ); + + const result: BatchEvaluationResult = { + rowId: ctx.rowId, + rowIndex: ctx.index, + input: ctx.inputData, + results: evaluatorResults, + timestamp: new Date().toISOString(), + durationMs, + retryCount: ctx.retryCount, }; - // Run evaluators in parallel or sequential mode - if (this.config.evaluatorExecutionMode === "sequential") { - const results: EvaluatorResult[] = []; - for (const evaluator of this.evaluators) { - const result = await evaluateWithTimeout(evaluator); - results.push(result as EvaluatorResult); + if (this.config.onResult) { + const callbackResult = this.config.onResult(result); + if (callbackResult instanceof Promise) { + await callbackResult; } - return results; - } else { - // Parallel mode (default) - const results = await Promise.all( - this.evaluators.map((evaluator) => evaluateWithTimeout(evaluator)), - ); - return results as EvaluatorResult[]; } + + this.results.push(result); + this.processedRowIndices.add(ctx.index); + this.progressTracker?.recordSuccess(durationMs, tokensUsed); + } + + /** + * Run all evaluators on a single row + */ + private async runEvaluators(row: BatchInputRow): Promise { + return withSpan( + "eval-kit.batch.run_evaluators", + { + attributes: { + "eval_kit.evaluator_count": this.evaluators.length, + "eval_kit.execution_mode": + this.config.evaluatorExecutionMode ?? "parallel", + }, + }, + async () => { + const input = { + candidateText: row.candidateText, + prompt: row.prompt, + referenceText: row.referenceText, + sourceText: row.sourceText, + contentType: row.contentType, + language: row.language, + }; + + // Apply timeout if configured + const timeout = this.config.timeout; + const evaluateWithTimeout = async (evaluator: IEvaluator) => { + if (timeout) { + return Promise.race([ + evaluator.evaluate(input), + this.timeoutPromise( + timeout, + `Evaluator ${evaluator.name} timed out after ${timeout}ms`, + ), + ]); + } + return evaluator.evaluate(input); + }; + + // Run evaluators in parallel or sequential mode + if (this.config.evaluatorExecutionMode === "sequential") { + const results: EvaluatorResult[] = []; + for (const evaluator of this.evaluators) { + const result = await evaluateWithTimeout(evaluator); + results.push(result as EvaluatorResult); + } + return results; + } + // Parallel mode (default) + const results = await Promise.all( + this.evaluators.map((evaluator) => evaluateWithTimeout(evaluator)), + ); + return results as EvaluatorResult[]; + }, + ); } /** diff --git a/src/batch/exporters/csv-exporter.ts b/src/batch/exporters/csv-exporter.ts index b7fe32b..4c3c43f 100644 --- a/src/batch/exporters/csv-exporter.ts +++ b/src/batch/exporters/csv-exporter.ts @@ -179,7 +179,7 @@ export class CsvExporter { } // Exclude logic - if (excludeFields && excludeFields.includes(key)) { + if (excludeFields?.includes(key)) { continue; } diff --git a/src/batch/exporters/json-exporter.ts b/src/batch/exporters/json-exporter.ts index 0952b30..b54ff17 100644 --- a/src/batch/exporters/json-exporter.ts +++ b/src/batch/exporters/json-exporter.ts @@ -72,7 +72,7 @@ export class JsonExporter { } // Exclude logic - if (excludeFields && excludeFields.includes(key)) { + if (excludeFields?.includes(key)) { continue; } diff --git a/src/batch/progress-tracker.ts b/src/batch/progress-tracker.ts index 7bb5cf1..f92e64d 100644 --- a/src/batch/progress-tracker.ts +++ b/src/batch/progress-tracker.ts @@ -14,7 +14,6 @@ export class ProgressTracker { private processedRows = 0; private successfulRows = 0; private failedRows = 0; - private startTime: number 
= 0; private processingTimes: number[] = []; private lastEmitTime = 0; private totalTokens = 0; @@ -29,7 +28,6 @@ export class ProgressTracker { * Start tracking progress */ start(): void { - this.startTime = Date.now(); this.emit("started"); } diff --git a/src/evaluators/evaluator-telemetry.spec.ts b/src/evaluators/evaluator-telemetry.spec.ts new file mode 100644 index 0000000..11396f1 --- /dev/null +++ b/src/evaluators/evaluator-telemetry.spec.ts @@ -0,0 +1,139 @@ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import type { LanguageModel } from "ai"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; + +// Set up OTel SDK for span capture +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +// Mock the ai module +const mockGenerateObject = jest.fn(); +jest.unstable_mockModule("ai", () => ({ + generateObject: mockGenerateObject, +})); + +const { Evaluator } = await import("./evaluator.js"); + +const createMockModel = (): LanguageModel => + ({ + specificationVersion: "v1", + provider: "mock", + modelId: "mock-model-v1", + defaultObjectGenerationMode: "json", + }) as unknown as LanguageModel; + +describe("Evaluator telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + enableTelemetry(true); + mockGenerateObject.mockClear(); + }); + + it("should create a span with correct name and initial attributes on success", async () => { + mockGenerateObject.mockResolvedValue({ + object: { score: 85, feedback: "Good quality" }, + usage: { inputTokens: 100, outputTokens: 20, totalTokens: 120 }, + }); + + const evaluator = new Evaluator({ + name: "fluency", + model: createMockModel(), + evaluationPrompt: "Rate: {{candidateText}}", + }); + + const result = await evaluator.evaluate({ + candidateText: "Hello world", + }); + + expect(result.score).toBe(85); + expect(result.error).toBeUndefined(); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + + const span = spans[0]; + expect(span.name).toBe("eval-kit.evaluator.evaluate"); + expect(span.attributes["eval_kit.evaluator.name"]).toBe("fluency"); + expect(span.attributes["eval_kit.model.id"]).toBe("mock-model-v1"); + expect(span.attributes["eval_kit.score_config.type"]).toBe("numeric"); + expect(span.attributes["eval_kit.input.candidate_text_length"]).toBe(11); + + // Completion attributes + expect(span.attributes["eval_kit.result.score"]).toBe(85); + expect(span.attributes["eval_kit.result.token_usage.input"]).toBe(100); + expect(span.attributes["eval_kit.result.token_usage.output"]).toBe(20); + expect(span.attributes["eval_kit.result.token_usage.total"]).toBe(120); + expect( + span.attributes["eval_kit.result.execution_time_ms"], + ).toBeGreaterThanOrEqual(0); + + expect(span.status.code).toBe(SpanStatusCode.OK); + }); + + it("should record error attributes when evaluation fails", async () => { + mockGenerateObject.mockRejectedValue(new Error("API rate limited")); + + const evaluator = new Evaluator({ + name: "accuracy", + model: createMockModel(), + evaluationPrompt: "Rate: {{candidateText}}", + }); + + const result = await evaluator.evaluate({ + candidateText: "Test text", + }); + + // Evaluator catches errors and returns a result + 
expect(result.error).toBeDefined(); + expect(result.score).toBe(0); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + + const span = spans[0]; + expect(span.status.code).toBe(SpanStatusCode.ERROR); + expect(span.attributes["eval_kit.result.error"]).toBe("API rate limited"); + expect( + span.attributes["eval_kit.result.execution_time_ms"], + ).toBeGreaterThanOrEqual(0); + + // Exception event recorded + expect(span.events.length).toBeGreaterThanOrEqual(1); + const exceptionEvent = span.events.find((e) => e.name === "exception"); + expect(exceptionEvent).toBeDefined(); + }); + + it("should not break existing behavior when OTel is present", async () => { + mockGenerateObject.mockResolvedValue({ + object: { score: "excellent", feedback: "Top quality" }, + usage: undefined, + }); + + const evaluator = new Evaluator({ + name: "quality", + model: createMockModel(), + evaluationPrompt: "Rate: {{candidateText}}", + scoreConfig: { + type: "categorical", + categories: ["poor", "fair", "good", "excellent"], + }, + }); + + const result = await evaluator.evaluate({ + candidateText: "Test", + }); + + expect(result.evaluatorName).toBe("quality"); + expect(result.score).toBe("excellent"); + expect(result.feedback).toBe("Top quality"); + expect(result.processingStats.executionTime).toBeGreaterThanOrEqual(0); + }); +}); diff --git a/src/evaluators/evaluator.ts b/src/evaluators/evaluator.ts index be4fb73..571c337 100644 --- a/src/evaluators/evaluator.ts +++ b/src/evaluators/evaluator.ts @@ -1,5 +1,11 @@ import { generateObject } from "ai"; import { z } from "zod"; +import { + type EvalKitSpan, + getTracer, + isTelemetryEnabled, + SpanStatusCode, +} from "../telemetry.js"; import type { EvaluationInput, EvaluatorConfig, @@ -44,66 +50,152 @@ export class Evaluator { } async evaluate(input: EvaluationInput): Promise { + const tracer = await getTracer(); + const modelId = (this.model as { modelId?: string }).modelId; + + return tracer.startActiveSpan( + "eval-kit.evaluator.evaluate", + { + attributes: { + "eval_kit.evaluator.name": this.name, + "eval_kit.model.id": modelId ?? 
"unknown", + "eval_kit.score_config.type": this.scoreConfig.type, + "eval_kit.input.candidate_text_length": input.candidateText.length, + }, + }, + (span: EvalKitSpan) => this.runEvaluation(input, modelId, span), + ); + } + + private async runEvaluation( + input: EvaluationInput, + modelId: string | undefined, + span: EvalKitSpan, + ): Promise { const startTime = Date.now(); + let spanError: string | undefined; + let evaluatorResult: EvaluatorResult; try { - const variables = this.prepareVariables(input); - const scoreInstructions = this.formatScoreInstructions(); - const prompt = `${this.templateRenderer.render(this.evaluationPrompt, variables)}\n\n${scoreInstructions}`; - - const schema = this.createSchema(); - - const result = (await generateObject({ - model: this.model, - schema, - prompt, - temperature: this.modelSettings?.temperature, - maxOutputTokens: this.modelSettings?.maxOutputTokens, - topP: this.modelSettings?.topP, - topK: this.modelSettings?.topK, - presencePenalty: this.modelSettings?.presencePenalty, - frequencyPenalty: this.modelSettings?.frequencyPenalty, - seed: this.modelSettings?.seed, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any)) as unknown as { - object: { score: number | string; feedback: string }; - usage: unknown; - }; - - const executionTime = Date.now() - startTime; - const tokenUsage = this.extractTokenUsage(result.usage); - - return { - evaluatorName: this.name, - model: (this.model as { modelId?: string }).modelId, - score: result.object.score, - feedback: result.object.feedback, - processingStats: { - executionTime, - tokenUsage, - }, - }; + evaluatorResult = await this.executeEvaluation(input, modelId, span); } catch (error) { - const executionTime = Date.now() - startTime; - const errorMessage = - error instanceof Error ? error.message : String(error); - const errorDetails = - error instanceof Error && "cause" in error - ? String(error.cause) - : undefined; - - return { - evaluatorName: this.name, - model: (this.model as { modelId?: string }).modelId, - score: 0, - feedback: `Evaluation failed: ${errorMessage}`, - processingStats: { - executionTime, - }, - error: errorDetails - ? `${errorMessage} (${errorDetails})` - : errorMessage, - }; + evaluatorResult = this.buildErrorResult(error, modelId, startTime); + spanError = evaluatorResult.error; + span.recordException( + error instanceof Error ? error : (spanError ?? String(error)), + ); + } finally { + span.setAttribute( + "eval_kit.result.execution_time_ms", + Date.now() - startTime, + ); + if (spanError) { + span.setAttribute("eval_kit.result.error", spanError); + span.setStatus({ code: SpanStatusCode.ERROR, message: spanError }); + } else { + span.setStatus({ code: SpanStatusCode.OK }); + } + span.end(); + } + + return evaluatorResult; + } + + private buildErrorResult( + error: unknown, + modelId: string | undefined, + startTime: number, + ): EvaluatorResult { + const errorMessage = error instanceof Error ? error.message : String(error); + const errorDetails = + error instanceof Error && "cause" in error + ? String(error.cause) + : undefined; + const fullError = errorDetails + ? 
`${errorMessage} (${errorDetails})` + : errorMessage; + + return { + evaluatorName: this.name, + model: modelId, + score: 0, + feedback: `Evaluation failed: ${errorMessage}`, + processingStats: { executionTime: Date.now() - startTime }, + error: fullError, + }; + } + + private async executeEvaluation( + input: EvaluationInput, + modelId: string | undefined, + span: EvalKitSpan, + ): Promise { + const startTime = Date.now(); + const prompt = this.buildPrompt(input); + const schema = this.createSchema(); + + const result = (await generateObject({ + model: this.model, + schema, + prompt, + temperature: this.modelSettings?.temperature, + maxOutputTokens: this.modelSettings?.maxOutputTokens, + topP: this.modelSettings?.topP, + topK: this.modelSettings?.topK, + presencePenalty: this.modelSettings?.presencePenalty, + frequencyPenalty: this.modelSettings?.frequencyPenalty, + seed: this.modelSettings?.seed, + experimental_telemetry: { isEnabled: isTelemetryEnabled() }, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } as any)) as unknown as { + object: { score: number | string; feedback: string }; + usage: unknown; + }; + + const executionTime = Date.now() - startTime; + const tokenUsage = this.extractTokenUsage(result.usage); + + this.setTokenAttributes(span, tokenUsage); + span.setAttribute("eval_kit.result.score", result.object.score); + + return { + evaluatorName: this.name, + model: modelId, + score: result.object.score, + feedback: result.object.feedback, + processingStats: { executionTime, tokenUsage }, + }; + } + + private buildPrompt(input: EvaluationInput): string { + const variables = this.prepareVariables(input); + const scoreInstructions = this.formatScoreInstructions(); + return `${this.templateRenderer.render(this.evaluationPrompt, variables)}\n\n${scoreInstructions}`; + } + + private setTokenAttributes( + span: EvalKitSpan, + tokenUsage: + | { inputTokens?: number; outputTokens?: number; totalTokens?: number } + | undefined, + ): void { + if (tokenUsage?.inputTokens !== undefined) { + span.setAttribute( + "eval_kit.result.token_usage.input", + tokenUsage.inputTokens, + ); + } + if (tokenUsage?.outputTokens !== undefined) { + span.setAttribute( + "eval_kit.result.token_usage.output", + tokenUsage.outputTokens, + ); + } + if (tokenUsage?.totalTokens !== undefined) { + span.setAttribute( + "eval_kit.result.token_usage.total", + tokenUsage.totalTokens, + ); } } diff --git a/src/index.ts b/src/index.ts index db6d222..d91f6a7 100644 --- a/src/index.ts +++ b/src/index.ts @@ -62,6 +62,14 @@ export type { } from "./batch/types.js"; // Evaluator export { Evaluator } from "./evaluators/evaluator.js"; +// Telemetry +export { + type EvalKitSpan, + enableTelemetry, + isTelemetryEnabled, + type WithSpanOptions, + withSpan, +} from "./telemetry.js"; export type { CategoricalScoreConfig, EvaluationInput, diff --git a/src/metrics/bert-score-telemetry.spec.ts b/src/metrics/bert-score-telemetry.spec.ts new file mode 100644 index 0000000..c6f7b6a --- /dev/null +++ b/src/metrics/bert-score-telemetry.spec.ts @@ -0,0 +1,93 @@ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; + +// Set up OTel SDK +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new 
SimpleSpanProcessor(exporter)); +provider.register(); + +// Mock @xenova/transformers to avoid ESM/model download issues +const mockPipeline = jest.fn(); +const mockCosSim = jest.fn(); +jest.unstable_mockModule("@xenova/transformers", () => ({ + pipeline: mockPipeline, + cos_sim: mockCosSim, +})); + +const { calculateBertScore, clearBertCache } = await import("./bert-score.js"); + +describe("bert-score telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + enableTelemetry(true); + clearBertCache(); + mockPipeline.mockClear(); + mockCosSim.mockClear(); + }); + + it("should create a span with correct name and attributes", async () => { + const mockExtractor = jest.fn().mockResolvedValue({ + data: new Float32Array([0.1, 0.2, 0.3]), + }); + mockPipeline.mockResolvedValue(mockExtractor); + mockCosSim.mockReturnValue(0.85); + + const result = await calculateBertScore("hello world", "hello there", { + model: "test-model", + scoreType: "f1", + }); + + expect(result.score).toBeGreaterThanOrEqual(0); + + const spans = exporter.getFinishedSpans(); + const span = spans.find((s) => s.name === "eval-kit.metric.bert_score"); + expect(span).toBeDefined(); + expect(span?.attributes["eval_kit.metric.name"]).toBe("bert_score"); + expect(span?.attributes["eval_kit.metric.model"]).toBe("test-model"); + expect(span?.attributes["eval_kit.metric.score_type"]).toBe("f1"); + expect(span?.attributes["eval_kit.result.score"]).toBeDefined(); + expect(span?.status.code).toBe(SpanStatusCode.OK); + }); + + it("should add model_loaded event on cache miss", async () => { + const mockExtractor = jest.fn().mockResolvedValue({ + data: new Float32Array([0.1, 0.2, 0.3]), + }); + mockPipeline.mockResolvedValue(mockExtractor); + mockCosSim.mockReturnValue(0.9); + + await calculateBertScore("a", "b", { model: "fresh-model" }); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.bert_score"); + + const loadEvent = span?.events.find((e) => e.name === "model_loaded"); + expect(loadEvent).toBeDefined(); + expect(loadEvent?.attributes?.["eval_kit.metric.model"]).toBe( + "fresh-model", + ); + }); + + it("should record error when pipeline fails", async () => { + mockPipeline.mockRejectedValue(new Error("Model download failed")); + + await expect(calculateBertScore("a", "b")).rejects.toThrow( + "Model download failed", + ); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.bert_score"); + + expect(span?.status.code).toBe(SpanStatusCode.ERROR); + expect(span?.events.some((e) => e.name === "exception")).toBe(true); + }); +}); diff --git a/src/metrics/bert-score.ts b/src/metrics/bert-score.ts index 24c250d..8c4ac83 100644 --- a/src/metrics/bert-score.ts +++ b/src/metrics/bert-score.ts @@ -1,5 +1,6 @@ -import type { FeatureExtractionPipeline, Tensor } from "@xenova/transformers"; +import type { FeatureExtractionPipeline } from "@xenova/transformers"; import { cos_sim, pipeline } from "@xenova/transformers"; +import { withSpan } from "../telemetry.js"; import { tokenizeWords } from "../utils/tokenization.js"; export interface BertScoreOptions { @@ -77,60 +78,81 @@ export const calculateBertScore = async ( ): Promise => { const { model = "Xenova/all-MiniLM-L6-v2", scoreType = "f1" } = options; - const extractor = await getPipeline(model); - - const candidateTokens = tokenizeWords(candidate); - const referenceTokens = tokenizeWords(reference); + return withSpan( + "eval-kit.metric.bert_score", + { + attributes: { + "eval_kit.metric.name": 
"bert_score", + "eval_kit.metric.model": model, + "eval_kit.metric.score_type": scoreType, + }, + }, + async (span) => { + const wasCached = cachedPipeline !== null && cachedModelName === model; + const extractor = await getPipeline(model); + if (!wasCached) { + span.addEvent("model_loaded", { + "eval_kit.metric.model": model, + }); + } - const candidateEmbeddings = await getTokenEmbeddings( - candidateTokens, - extractor, - ); - const referenceEmbeddings = await getTokenEmbeddings( - referenceTokens, - extractor, - ); + const candidateTokens = tokenizeWords(candidate); + const referenceTokens = tokenizeWords(reference); + + const candidateEmbeddings = await getTokenEmbeddings( + candidateTokens, + extractor, + ); + const referenceEmbeddings = await getTokenEmbeddings( + referenceTokens, + extractor, + ); + + const precisionSimilarities = computeMaxSimilarities( + candidateEmbeddings, + referenceEmbeddings, + ); + const recallSimilarities = computeMaxSimilarities( + referenceEmbeddings, + candidateEmbeddings, + ); + + const precision = + precisionSimilarities.length > 0 + ? precisionSimilarities.reduce((sum, sim) => sum + sim, 0) / + precisionSimilarities.length + : 0; + + const recall = + recallSimilarities.length > 0 + ? recallSimilarities.reduce((sum, sim) => sum + sim, 0) / + recallSimilarities.length + : 0; + + const f1 = + precision + recall > 0 + ? (2 * precision * recall) / (precision + recall) + : 0; + + let finalScore = f1; + if (scoreType === "precision") { + finalScore = precision; + } else if (scoreType === "recall") { + finalScore = recall; + } - const precisionSimilarities = computeMaxSimilarities( - candidateEmbeddings, - referenceEmbeddings, + const result = { + score: Math.round(finalScore * 10000) / 100, + precision: Math.round(precision * 10000) / 100, + recall: Math.round(recall * 10000) / 100, + f1: Math.round(f1 * 10000) / 100, + modelUsed: model, + }; + + span.setAttribute("eval_kit.result.score", result.score); + return result; + }, ); - const recallSimilarities = computeMaxSimilarities( - referenceEmbeddings, - candidateEmbeddings, - ); - - const precision = - precisionSimilarities.length > 0 - ? precisionSimilarities.reduce((sum, sim) => sum + sim, 0) / - precisionSimilarities.length - : 0; - - const recall = - recallSimilarities.length > 0 - ? recallSimilarities.reduce((sum, sim) => sum + sim, 0) / - recallSimilarities.length - : 0; - - const f1 = - precision + recall > 0 - ? 
(2 * precision * recall) / (precision + recall) - : 0; - - let finalScore = f1; - if (scoreType === "precision") { - finalScore = precision; - } else if (scoreType === "recall") { - finalScore = recall; - } - - return { - score: Math.round(finalScore * 10000) / 100, - precision: Math.round(precision * 10000) / 100, - recall: Math.round(recall * 10000) / 100, - f1: Math.round(f1 * 10000) / 100, - modelUsed: model, - }; }; export const clearBertCache = (): void => { diff --git a/src/metrics/perplexity-telemetry.spec.ts b/src/metrics/perplexity-telemetry.spec.ts new file mode 100644 index 0000000..fea1273 --- /dev/null +++ b/src/metrics/perplexity-telemetry.spec.ts @@ -0,0 +1,108 @@ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { _resetTracer, enableTelemetry, SpanStatusCode } from "../telemetry.js"; + +// Set up OTel SDK +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +// Mock @xenova/transformers to avoid ESM/model download issues +const mockFromPretrainedTokenizer = jest.fn(); +const mockFromPretrainedModel = jest.fn(); +jest.unstable_mockModule("@xenova/transformers", () => ({ + AutoTokenizer: { from_pretrained: mockFromPretrainedTokenizer }, + AutoModelForCausalLM: { from_pretrained: mockFromPretrainedModel }, + Tensor: class MockTensor { + constructor( + public type: string, + public data: BigInt64Array, + public dims: number[], + ) {} + }, +})); + +const { calculatePerplexity, clearPerplexityCache } = await import( + "./perplexity.js" +); + +describe("perplexity telemetry", () => { + beforeEach(() => { + exporter.reset(); + _resetTracer(); + enableTelemetry(true); + clearPerplexityCache(); + mockFromPretrainedTokenizer.mockClear(); + mockFromPretrainedModel.mockClear(); + }); + + it("should create a span with correct name and attributes", async () => { + // Mock tokenizer that returns a short sequence (triggers early return) + const mockTokenizer = jest.fn().mockResolvedValue({ + input_ids: { + data: new BigInt64Array([BigInt(1)]), + }, + }); + mockFromPretrainedTokenizer.mockResolvedValue(mockTokenizer); + mockFromPretrainedModel.mockResolvedValue({}); + + const result = await calculatePerplexity("hi", { + model: "test-model", + }); + + // Short text triggers early return with perplexity 1.0 + expect(result.perplexity).toBe(1.0); + expect(result.score).toBe(100); + + const spans = exporter.getFinishedSpans(); + const span = spans.find((s) => s.name === "eval-kit.metric.perplexity"); + expect(span).toBeDefined(); + expect(span?.attributes["eval_kit.metric.name"]).toBe("perplexity"); + expect(span?.attributes["eval_kit.metric.model"]).toBe("test-model"); + expect(span?.attributes["eval_kit.result.score"]).toBe(100); + expect(span?.status.code).toBe(SpanStatusCode.OK); + }); + + it("should add model_loaded event on cache miss", async () => { + const mockTokenizer = jest.fn().mockResolvedValue({ + input_ids: { + data: new BigInt64Array([BigInt(1)]), + }, + }); + mockFromPretrainedTokenizer.mockResolvedValue(mockTokenizer); + mockFromPretrainedModel.mockResolvedValue({}); + + await calculatePerplexity("hi", { model: "fresh-model" }); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.perplexity"); + + const loadEvent = 
span?.events.find((e) => e.name === "model_loaded"); + expect(loadEvent).toBeDefined(); + expect(loadEvent?.attributes?.["eval_kit.metric.model"]).toBe( + "fresh-model", + ); + }); + + it("should record error when model loading fails", async () => { + mockFromPretrainedTokenizer.mockRejectedValue(new Error("Network error")); + mockFromPretrainedModel.mockRejectedValue(new Error("Network error")); + + await expect(calculatePerplexity("test text")).rejects.toThrow( + "Network error", + ); + + const span = exporter + .getFinishedSpans() + .find((s) => s.name === "eval-kit.metric.perplexity"); + + expect(span?.status.code).toBe(SpanStatusCode.ERROR); + expect(span?.events.some((e) => e.name === "exception")).toBe(true); + }); +}); diff --git a/src/metrics/perplexity.ts b/src/metrics/perplexity.ts index 661f9c3..92c117b 100644 --- a/src/metrics/perplexity.ts +++ b/src/metrics/perplexity.ts @@ -5,6 +5,7 @@ import { type PreTrainedTokenizer, Tensor, } from "@xenova/transformers"; +import { withSpan } from "../telemetry.js"; export interface PerplexityOptions { model?: string; @@ -68,7 +69,7 @@ const normalizePerplexityToScore = (perplexity: number): number => { return Math.max(0, 10 - (perplexity - 300) / 100); }; -const generateFeedback = (perplexity: number, score: number): string => { +const generateFeedback = (perplexity: number, _score: number): string => { if (perplexity < 20) { return `Excellent text quality with very natural, human-like language (perplexity: ${perplexity.toFixed(1)})`; } @@ -90,83 +91,109 @@ export const calculatePerplexity = async ( ): Promise => { const { model: modelName = "Xenova/gpt2", stride = 512 } = options; - const { tokenizer, model } = await getModelAndTokenizer(modelName); - - const encoded = await tokenizer(text, { - return_tensors: "pt", - truncation: false, - add_special_tokens: true, - }); - - const inputIds = Array.from(encoded.input_ids.data as BigInt64Array).map( - (x) => Number(x), + return withSpan( + "eval-kit.metric.perplexity", + { + attributes: { + "eval_kit.metric.name": "perplexity", + "eval_kit.metric.model": modelName, + }, + }, + async (span) => { + const wasCached = cachedModel !== null && cachedModelName === modelName; + const { tokenizer, model } = await getModelAndTokenizer(modelName); + if (!wasCached) { + span.addEvent("model_loaded", { + "eval_kit.metric.model": modelName, + }); + } + + const encoded = await tokenizer(text, { + return_tensors: "pt", + truncation: false, + add_special_tokens: true, + }); + + const inputIds = Array.from(encoded.input_ids.data as BigInt64Array).map( + (x) => Number(x), + ); + + if (inputIds.length <= 1) { + span.setAttribute("eval_kit.metric.token_count", inputIds.length); + span.setAttribute("eval_kit.result.score", 100); + return { + perplexity: 1.0, + score: 100, + tokenCount: inputIds.length, + averageLogProb: 0, + modelUsed: modelName, + feedback: "Text too short to calculate perplexity meaningfully", + }; + } + + let totalLogProb = 0; + let totalTokens = 0; + + for (let i = 0; i < inputIds.length; i += stride) { + const end = Math.min(i + stride + 1, inputIds.length); + const batch = inputIds.slice(i, end); + + if (batch.length <= 1) continue; + + const batchTensor = { + input_ids: new Tensor( + "int64", + new BigInt64Array(batch.map((x) => BigInt(x))), + [1, batch.length], + ), + }; + + const outputs = await model(batchTensor); + const logits = outputs.logits; + + if (!logits || !logits.data) continue; + + // Get vocab size from logits shape: [batch_size, seq_len, vocab_size] + const vocabSize = 
logits.dims?.[2] || 50257; + const logitsArray = Array.from(logits.data as Float32Array); + + for (let j = 1; j < batch.length; j++) { + const startIdx = (j - 1) * vocabSize; + const endIdx = startIdx + vocabSize; + const prevLogits = logitsArray.slice(startIdx, endIdx); + + const probs = softmax(prevLogits); + const targetTokenId = batch[j]; + const prob = probs[targetTokenId] || 1e-10; + + totalLogProb += Math.log(prob); + totalTokens++; + } + + if (end >= inputIds.length) break; + } + + const averageLogProb = totalTokens > 0 ? totalLogProb / totalTokens : 0; + const perplexity = Math.exp(-averageLogProb); + const score = normalizePerplexityToScore(perplexity); + + span.setAttribute("eval_kit.metric.token_count", totalTokens); + span.setAttribute( + "eval_kit.result.perplexity", + Math.round(perplexity * 100) / 100, + ); + span.setAttribute("eval_kit.result.score", Math.round(score * 100) / 100); + + return { + perplexity: Math.round(perplexity * 100) / 100, + score: Math.round(score * 100) / 100, + tokenCount: totalTokens, + averageLogProb: Math.round(averageLogProb * 10000) / 10000, + modelUsed: modelName, + feedback: generateFeedback(perplexity, score), + }; + }, ); - - if (inputIds.length <= 1) { - return { - perplexity: 1.0, - score: 100, - tokenCount: inputIds.length, - averageLogProb: 0, - modelUsed: modelName, - feedback: "Text too short to calculate perplexity meaningfully", - }; - } - - let totalLogProb = 0; - let totalTokens = 0; - - for (let i = 0; i < inputIds.length; i += stride) { - const end = Math.min(i + stride + 1, inputIds.length); - const batch = inputIds.slice(i, end); - - if (batch.length <= 1) continue; - - const batchTensor = { - input_ids: new Tensor( - "int64", - new BigInt64Array(batch.map((x) => BigInt(x))), - [1, batch.length], - ), - }; - - const outputs = await model(batchTensor); - const logits = outputs.logits; - - if (!logits || !logits.data) continue; - - // Get vocab size from logits shape: [batch_size, seq_len, vocab_size] - const vocabSize = logits.dims?.[2] || 50257; - const logitsArray = Array.from(logits.data as Float32Array); - - for (let j = 1; j < batch.length; j++) { - const startIdx = (j - 1) * vocabSize; - const endIdx = startIdx + vocabSize; - const prevLogits = logitsArray.slice(startIdx, endIdx); - - const probs = softmax(prevLogits); - const targetTokenId = batch[j]; - const prob = probs[targetTokenId] || 1e-10; - - totalLogProb += Math.log(prob); - totalTokens++; - } - - if (end >= inputIds.length) break; - } - - const averageLogProb = totalTokens > 0 ? totalLogProb / totalTokens : 0; - const perplexity = Math.exp(-averageLogProb); - const score = normalizePerplexityToScore(perplexity); - - return { - perplexity: Math.round(perplexity * 100) / 100, - score: Math.round(score * 100) / 100, - tokenCount: totalTokens, - averageLogProb: Math.round(averageLogProb * 10000) / 10000, - modelUsed: modelName, - feedback: generateFeedback(perplexity, score), - }; }; export const clearPerplexityCache = (): void => { diff --git a/src/telemetry-noop.spec.ts b/src/telemetry-noop.spec.ts new file mode 100644 index 0000000..9c09b4a --- /dev/null +++ b/src/telemetry-noop.spec.ts @@ -0,0 +1,107 @@ +/** + * Tests for the noop fallback path — the default experience when + * @opentelemetry/api is not installed. + */ +import { beforeEach, describe, expect, it, jest } from "@jest/globals"; + +// Mock the dynamic import to simulate @opentelemetry/api not being installed. +// This MUST be before importing telemetry. 
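+// (jest.unstable_mockModule only affects modules loaded after it runs, which
+// is why telemetry.js is pulled in via a top-level `await import` below.)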
+jest.unstable_mockModule("@opentelemetry/api", () => { + throw new Error("Cannot find module '@opentelemetry/api'"); +}); + +const { _resetTracer, withSpan, getTracer, getCachedTracer } = await import( + "./telemetry.js" +); + +describe("telemetry noop fallback", () => { + beforeEach(() => { + _resetTracer(); + }); + + describe("withSpan", () => { + it("should execute callback and return result when OTel is absent", async () => { + const result = await withSpan( + "test.operation", + { attributes: { "test.key": "value" } }, + async () => 42, + ); + + expect(result).toBe(42); + }); + + it("should still propagate errors from callback", async () => { + await expect( + withSpan("test.failing", {}, async () => { + throw new Error("callback error"); + }), + ).rejects.toThrow("callback error"); + }); + + it("should provide a noop span that accepts all operations without error", async () => { + await withSpan("test.noop", {}, async (span) => { + // All of these should be silent no-ops + span.setAttribute("key", "value"); + span.setAttribute("num", 123); + span.setAttribute("bool", true); + span.setStatus({ code: 1 }); + span.setStatus({ code: 2, message: "error" }); + span.addEvent("event", { detail: "test" }); + span.recordException(new Error("test")); + span.recordException("string error"); + span.end(); + }); + }); + + it("should handle nested withSpan calls with noop tracer", async () => { + const result = await withSpan("parent", {}, async () => { + return withSpan("child", {}, async () => "nested-result"); + }); + + expect(result).toBe("nested-result"); + }); + }); + + describe("getTracer", () => { + it("should return a tracer that produces noop spans", async () => { + const tracer = await getTracer(); + expect(tracer).toBeDefined(); + + // The noop tracer's startActiveSpan should execute the callback + const result = tracer.startActiveSpan("test", {}, (span) => { + span.setAttribute("key", "value"); + span.end(); + return "result"; + }); + expect(result).toBe("result"); + }); + }); + + describe("getCachedTracer", () => { + it("should return noop tracer before any async resolution", () => { + _resetTracer(); + const tracer = getCachedTracer(); + expect(tracer).toBeDefined(); + + const result = tracer.startActiveSpan("test", {}, (span) => { + span.setAttribute("key", "value"); + span.end(); + return "sync-result"; + }); + expect(result).toBe("sync-result"); + }); + + it("should still return noop tracer after failed resolution", async () => { + // Trigger resolution (which will fail due to our mock) + await getTracer(); + + const tracer = getCachedTracer(); + const result = tracer.startActiveSpan( + "test", + {}, + () => "post-resolution", + ); + expect(result).toBe("post-resolution"); + }); + }); +}); diff --git a/src/telemetry.spec.ts b/src/telemetry.spec.ts new file mode 100644 index 0000000..56de100 --- /dev/null +++ b/src/telemetry.spec.ts @@ -0,0 +1,141 @@ +import { afterEach, beforeEach, describe, expect, it } from "@jest/globals"; +import { + InMemorySpanExporter, + SimpleSpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { + _resetTracer, + enableTelemetry, + SpanStatusCode, + withSpan, +} from "./telemetry.js"; + +// Set up a real OTel SDK so spans are captured +const exporter = new InMemorySpanExporter(); +const provider = new NodeTracerProvider(); +provider.addSpanProcessor(new SimpleSpanProcessor(exporter)); +provider.register(); + +describe("telemetry", () => { + beforeEach(() => { + 
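+    // Start every test from a clean slate: drop captured spans, the cached
+    // tracer, and re-enable the global toggle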
exporter.reset(); + _resetTracer(); + enableTelemetry(true); + }); + + describe("withSpan", () => { + it("should create a span with correct name and attributes", async () => { + const result = await withSpan( + "test.operation", + { + attributes: { + "test.key": "value", + "test.number": 42, + }, + }, + async () => "hello", + ); + + expect(result).toBe("hello"); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].name).toBe("test.operation"); + expect(spans[0].attributes["test.key"]).toBe("value"); + expect(spans[0].attributes["test.number"]).toBe(42); + expect(spans[0].status.code).toBe(SpanStatusCode.OK); + }); + + it("should record exception and set error status on failure", async () => { + const testError = new Error("test failure"); + + await expect( + withSpan("test.failing", {}, async () => { + throw testError; + }), + ).rejects.toThrow("test failure"); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].status.code).toBe(SpanStatusCode.ERROR); + expect(spans[0].status.message).toBe("test failure"); + expect(spans[0].events).toHaveLength(1); + expect(spans[0].events[0].name).toBe("exception"); + }); + + it("should allow adding attributes during execution", async () => { + await withSpan("test.dynamic", {}, async (span) => { + span.setAttribute("dynamic.key", "added-later"); + }); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].attributes["dynamic.key"]).toBe("added-later"); + }); + + it("should allow adding events during execution", async () => { + await withSpan("test.events", {}, async (span) => { + span.addEvent("custom_event", { + "event.detail": "something happened", + }); + }); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].events).toHaveLength(1); + expect(spans[0].events[0].name).toBe("custom_event"); + }); + + it("should nest spans correctly via async context", async () => { + await withSpan("parent", {}, async () => { + await withSpan("child", {}, async () => { + // child span + }); + }); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(2); + + const child = spans.find((s) => s.name === "child"); + const parent = spans.find((s) => s.name === "parent"); + expect(child).toBeDefined(); + expect(parent).toBeDefined(); + // Child's parent span ID should match parent's span ID + expect(child?.parentSpanId).toBe(parent?.spanContext().spanId); + }); + }); + + describe("enableTelemetry", () => { + afterEach(() => { + // Always re-enable so other tests aren't affected + enableTelemetry(true); + }); + + it("should execute callback but emit no span when disabled", async () => { + enableTelemetry(false); + _resetTracer(); + + const result = await withSpan("test.disabled", {}, async () => "hello"); + + expect(result).toBe("hello"); + expect(exporter.getFinishedSpans()).toHaveLength(0); + }); + + it("should restore span creation after re-enabling", async () => { + enableTelemetry(false); + _resetTracer(); + + await withSpan("while.disabled", {}, async () => "ignored"); + + enableTelemetry(true); + _resetTracer(); + + await withSpan("after.reenable", {}, async () => "tracked"); + + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].name).toBe("after.reenable"); + }); + }); +}); diff --git a/src/telemetry.ts b/src/telemetry.ts new file mode 100644 index 0000000..b023250 --- /dev/null +++ b/src/telemetry.ts @@ -0,0 +1,206 @@ +/** + 
+ * OpenTelemetry instrumentation for eval-kit.
+ *
+ * Uses dynamic import so @opentelemetry/api is an optional peer dependency.
+ * When the package is not installed or no SDK is configured, all tracing
+ * operations are no-ops with zero overhead.
+ */
+
+declare const __EVAL_KIT_VERSION__: string;
+
+// ---------- Type stubs (mirror the OTel API surface we use) ----------
+
+interface SpanAttributes {
+  [key: string]: string | number | boolean | undefined;
+}
+
+/**
+ * Minimal span interface matching the subset of @opentelemetry/api Span we use.
+ */
+export interface EvalKitSpan {
+  setAttribute(key: string, value: string | number | boolean): void;
+  setStatus(status: { code: number; message?: string }): void;
+  recordException(error: Error | string): void;
+  addEvent(name: string, attributes?: SpanAttributes): void;
+  end(): void;
+}
+
+interface Tracer {
+  startActiveSpan<T>(
+    name: string,
+    options: { attributes?: SpanAttributes },
+    fn: (span: EvalKitSpan) => T,
+  ): T;
+}
+
+// ---------- No-op implementations ----------
+
+const noopSpan: EvalKitSpan = {
+  setAttribute() {},
+  setStatus() {},
+  recordException() {},
+  addEvent() {},
+  end() {},
+};
+
+const noopTracer: Tracer = {
+  startActiveSpan<T>(
+    _name: string,
+    _options: { attributes?: SpanAttributes },
+    fn: (span: EvalKitSpan) => T,
+  ): T {
+    return fn(noopSpan);
+  },
+};
+
+// ---------- Global toggle ----------
+
+let telemetryEnabled = false;
+
+/**
+ * Enable or disable eval-kit's OpenTelemetry instrumentation globally.
+ *
+ * When disabled, all tracing functions return no-ops — identical to the
+ * behaviour when `@opentelemetry/api` is not installed. This lets you
+ * use OTel in your application without eval-kit emitting spans.
+ *
+ * Telemetry is **disabled** by default.
+ */
+export function enableTelemetry(enabled: boolean): void {
+  telemetryEnabled = enabled;
+}
+
+/**
+ * Returns the current telemetry enabled state.
+ */
+export function isTelemetryEnabled(): boolean {
+  return telemetryEnabled;
+}
+
+// ---------- Lazy tracer resolution ----------
+
+let resolvedTracer: Tracer | null = null;
+let tracerPromise: Promise<Tracer> | null = null;
+
+function resolveTracer(): Promise<Tracer> {
+  if (!tracerPromise) {
+    tracerPromise = (async () => {
+      try {
+        const api = await import("@opentelemetry/api");
+        const version =
+          typeof __EVAL_KIT_VERSION__ !== "undefined"
+            ? __EVAL_KIT_VERSION__
+            : "unknown";
+        const tracer = api.trace.getTracer("eval-kit", version);
+        // Wrap the OTel tracer with an explicit adapter at the boundary
+        const wrapSpan = (
+          span: import("@opentelemetry/api").Span,
+        ): EvalKitSpan => ({
+          setAttribute: (key, value) => span.setAttribute(key, value),
+          setStatus: (status) => span.setStatus(status),
+          recordException: (error) => span.recordException(error),
+          addEvent: (name, attributes) => span.addEvent(name, attributes),
+          end: () => span.end(),
+        });
+
+        resolvedTracer = {
+          startActiveSpan<T>(
+            name: string,
+            options: { attributes?: SpanAttributes },
+            fn: (span: EvalKitSpan) => T,
+          ): T {
+            return tracer.startActiveSpan(name, options, (span) =>
+              fn(wrapSpan(span)),
+            );
+          },
+        };
+        return resolvedTracer;
+      } catch {
+        // @opentelemetry/api not installed — use no-op
+        resolvedTracer = noopTracer;
+        return noopTracer;
+      }
+    })();
+  }
+  return tracerPromise;
+}
+
+// ---------- Status codes (match OTel SpanStatusCode) ----------
+
+export const SpanStatusCode = {
+  UNSET: 0,
+  OK: 1,
+  ERROR: 2,
+} as const;
+
+// ---------- Public API ----------
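+
+// Illustrative sketch (not part of the public API): wrapping a custom
+// operation with withSpan, assuming a hypothetical `scoreRow` helper.
+// See docs/TELEMETRY.md for the real span names and attributes.
+//
+//   const scored = await withSpan(
+//     "my-app.score_row",
+//     { attributes: { "my_app.row_id": "row-42" } },
+//     async (span) => {
+//       const result = await scoreRow("row-42");
+//       span.setAttribute("my_app.score", result.score);
+//       return result;
+//     },
+//   );
+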
+/**
+ * Get the eval-kit tracer (async). Triggers resolution on first call.
+ * Returns the no-op tracer when telemetry is disabled.
+ */
+export async function getTracer(): Promise<Tracer> {
+  if (!telemetryEnabled) return noopTracer;
+  return resolveTracer();
+}
+
+/**
+ * Get the cached tracer synchronously. Returns no-op if the tracer
+ * hasn't been resolved yet (i.e., no prior withSpan/getTracer call)
+ * or if telemetry is disabled.
+ */
+export function getCachedTracer(): Tracer {
+  if (!telemetryEnabled) return noopTracer;
+  return resolvedTracer ?? noopTracer;
+}
+
+export interface WithSpanOptions {
+  attributes?: SpanAttributes;
+}
+
+/**
+ * Wrap an async operation with an OpenTelemetry span.
+ *
+ * - Creates a child span under the current active span (if any)
+ * - Sets initial attributes from options
+ * - On success: sets status OK, ends span
+ * - On error: records exception, sets status ERROR, ends span, re-throws
+ * - The callback receives the span so it can add attributes/events during execution
+ */
+export async function withSpan<T>(
+  name: string,
+  options: WithSpanOptions,
+  fn: (span: EvalKitSpan) => Promise<T>,
+): Promise<T> {
+  if (!telemetryEnabled) return fn(noopSpan);
+
+  const tracer = await resolveTracer();
+  return tracer.startActiveSpan(
+    name,
+    { attributes: options.attributes },
+    async (span) => {
+      try {
+        const result = await fn(span);
+        span.setStatus({ code: SpanStatusCode.OK });
+        return result;
+      } catch (error) {
+        span.setStatus({
+          code: SpanStatusCode.ERROR,
+          message: error instanceof Error ? error.message : String(error),
+        });
+        span.recordException(error instanceof Error ? error : String(error));
+        throw error;
+      } finally {
+        span.end();
+      }
+    },
+  );
+}
+
+/**
+ * Reset the cached tracer. Only used in tests.
+ */
+export function _resetTracer(): void {
+  resolvedTracer = null;
+  tracerPromise = null;
+}
diff --git a/src/utils/template-engine.ts b/src/utils/template-engine.ts
index 3b7a603..d7b7416 100644
--- a/src/utils/template-engine.ts
+++ b/src/utils/template-engine.ts
@@ -94,12 +94,12 @@ export class TemplateRenderer {
     const conditionals = template.matchAll(/\{\{#if\s+(\w+)\}\}/g);
 
     for (const match of conditionals) {
-      variables.add(match[1]!);
+      variables.add(match[1]);
     }
 
     const substitutions = template.matchAll(/\{\{(\w+)\}\}/g);
     for (const match of substitutions) {
-      variables.add(match[1]!);
+      variables.add(match[1]);
     }
 
     return Array.from(variables);
@@ -113,11 +113,11 @@ export class TemplateRenderer {
       /\{\{#if\s+(\w+)\}\}([\s\S]*?)\{\{\/if\}\}/g,
     );
     for (const match of conditionals) {
-      optional.add(match[1]!);
+      optional.add(match[1]);
 
-      const innerVars = match[2]!.matchAll(/\{\{(\w+)\}\}/g);
+      const innerVars = match[2]?.matchAll(/\{\{(\w+)\}\}/g) ?? [];
       for (const innerMatch of innerVars) {
-        optional.add(innerMatch[1]!);
+        optional.add(innerMatch[1]);
       }
     }
 
@@ -125,7 +125,7 @@ export class TemplateRenderer {
     for (const part of parts) {
       const substitutions = part.matchAll(/\{\{(\w+)\}\}/g);
       for (const match of substitutions) {
-        const varName = match[1]!;
+        const varName = match[1];
         if (!optional.has(varName)) {
           required.add(varName);
         }
diff --git a/src/utils/tfidf.spec.ts b/src/utils/tfidf.spec.ts
index 0f4db9c..5ec32b5 100644
--- a/src/utils/tfidf.spec.ts
+++ b/src/utils/tfidf.spec.ts
@@ -50,7 +50,7 @@ describe("TF-IDF Utils", () => {
       const idf = calculateIDF(sentenceTokens);
 
       // "rare" appears in 1/3 sentences: log((3+1)/(1+1)) = log(2) = 0.693...
- expect(idf.get("rare")).toBeCloseTo(0.693, 2); + expect(idf.get("rare")).toBeCloseTo(Math.LN2, 2); }); it("should handle empty sentence tokens", () => { diff --git a/vite.config.ts b/vite.config.ts index 6b5fccf..a89e906 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -1,7 +1,10 @@ +import { readFileSync } from "node:fs"; import { resolve } from "node:path"; import { defineConfig } from "vite"; import dts from "vite-plugin-dts"; +const pkg = JSON.parse(readFileSync(resolve(__dirname, "package.json"), "utf-8")); + // Externalize function - exclude all dependencies from the bundle const external = (id: string) => { // Externalize node built-ins @@ -16,6 +19,9 @@ const external = (id: string) => { }; export default defineConfig({ + define: { + __EVAL_KIT_VERSION__: JSON.stringify(pkg.version), + }, plugins: [ dts({ include: ["src"],