From b430284ec99fd132629167ff455e0eedad797556 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Thu, 5 Feb 2026 16:48:26 -0800 Subject: [PATCH 01/10] feat: Implement core evaluator files and Vocab implementation --- sdks/typescript/README.md | 165 ++++++++ sdks/typescript/docs/telemetry.md | 128 +++++++ sdks/typescript/src/errors.ts | 241 ++++++++++++ sdks/typescript/src/evaluators/base.ts | 290 ++++++++++++++ sdks/typescript/src/evaluators/index.ts | 7 + sdks/typescript/src/evaluators/vocabulary.ts | 358 ++++++++++++++++++ sdks/typescript/src/features/index.ts | 4 + sdks/typescript/src/features/readability.ts | 55 +++ sdks/typescript/src/index.ts | 62 ++- sdks/typescript/src/logger.ts | 159 ++++++++ .../vocabulary/background-knowledge.ts | 16 + .../src/prompts/vocabulary/index.ts | 3 + .../src/prompts/vocabulary/system.ts | 28 ++ .../typescript/src/prompts/vocabulary/user.ts | 39 ++ .../src/providers/ai-sdk-provider.ts | 153 ++++++++ sdks/typescript/src/providers/base.ts | 73 ++++ sdks/typescript/src/providers/index.ts | 10 + sdks/typescript/src/schemas/index.ts | 10 + sdks/typescript/src/schemas/outputs.ts | 75 ++++ sdks/typescript/src/schemas/vocabulary.ts | 39 ++ sdks/typescript/src/telemetry/client.ts | 61 +++ sdks/typescript/src/telemetry/index.ts | 10 + sdks/typescript/src/telemetry/types.ts | 105 +++++ sdks/typescript/src/telemetry/utils.ts | 63 +++ sdks/typescript/src/utils/prompts.ts | 21 + sdks/typescript/tests/README.md | 221 +++++++++++ .../vocabulary.integration.test.ts | 146 +++++++ .../tests/unit/evaluators/validation.test.ts | 125 ++++++ .../tests/unit/evaluators/vocabulary.test.ts | 249 ++++++++++++ .../tests/unit/features/readability.test.ts | 27 ++ .../tests/unit/telemetry/utils.test.ts | 88 +++++ sdks/typescript/tests/utils/index.ts | 18 + sdks/typescript/tests/utils/test-helpers.ts | 254 +++++++++++++ 33 files changed, 3302 insertions(+), 1 deletion(-) create mode 100644 sdks/typescript/docs/telemetry.md create mode 100644 
sdks/typescript/src/errors.ts create mode 100644 sdks/typescript/src/evaluators/base.ts create mode 100644 sdks/typescript/src/evaluators/index.ts create mode 100644 sdks/typescript/src/evaluators/vocabulary.ts create mode 100644 sdks/typescript/src/features/index.ts create mode 100644 sdks/typescript/src/features/readability.ts create mode 100644 sdks/typescript/src/logger.ts create mode 100644 sdks/typescript/src/prompts/vocabulary/background-knowledge.ts create mode 100644 sdks/typescript/src/prompts/vocabulary/index.ts create mode 100644 sdks/typescript/src/prompts/vocabulary/system.ts create mode 100644 sdks/typescript/src/prompts/vocabulary/user.ts create mode 100644 sdks/typescript/src/providers/ai-sdk-provider.ts create mode 100644 sdks/typescript/src/providers/base.ts create mode 100644 sdks/typescript/src/providers/index.ts create mode 100644 sdks/typescript/src/schemas/index.ts create mode 100644 sdks/typescript/src/schemas/outputs.ts create mode 100644 sdks/typescript/src/schemas/vocabulary.ts create mode 100644 sdks/typescript/src/telemetry/client.ts create mode 100644 sdks/typescript/src/telemetry/index.ts create mode 100644 sdks/typescript/src/telemetry/types.ts create mode 100644 sdks/typescript/src/telemetry/utils.ts create mode 100644 sdks/typescript/src/utils/prompts.ts create mode 100644 sdks/typescript/tests/README.md create mode 100644 sdks/typescript/tests/integration/vocabulary.integration.test.ts create mode 100644 sdks/typescript/tests/unit/evaluators/validation.test.ts create mode 100644 sdks/typescript/tests/unit/evaluators/vocabulary.test.ts create mode 100644 sdks/typescript/tests/unit/features/readability.test.ts create mode 100644 sdks/typescript/tests/unit/telemetry/utils.test.ts create mode 100644 sdks/typescript/tests/utils/index.ts create mode 100644 sdks/typescript/tests/utils/test-helpers.ts diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 3ea9d1c..9be34c3 100644 --- a/sdks/typescript/README.md +++ 
b/sdks/typescript/README.md @@ -1 +1,166 @@ # @learning-commons/evaluators + +TypeScript SDK for Learning Commons educational text complexity evaluators. + +## Installation + +```bash +npm install ai @learning-commons/evaluators +``` + +## Quick Start + +```typescript +import { VocabularyEvaluator } from '@learning-commons/evaluators'; + +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY, + openaiApiKey: process.env.OPENAI_API_KEY +}); + +const result = await evaluator.evaluate("Your text here", "5"); +console.log(result.score); // "moderately complex" +``` + +--- + +## Evaluators + +### 1. Vocabulary Evaluator + +Evaluates vocabulary complexity using the Qual Text Complexity rubric (SAP). + +**Supported Grades:** 3-12 + +**Uses:** Google Gemini 2.5 Pro + OpenAI GPT-4o + +**Constructor:** +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: string; // Required - Google API key + openaiApiKey: string; // Required - OpenAI API key + maxRetries?: number; // Optional - Max retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Optional (default: true) + logger?: Logger; // Optional - Custom logger + logLevel?: LogLevel; // Optional - SILENT | ERROR | WARN | INFO | DEBUG (default: WARN) +}); +``` + +**API:** +```typescript +await evaluator.evaluate(text: string, grade: string) +``` + +**Returns:** +```typescript +{ + score: 'slightly complex' | 'moderately complex' | 'very complex' | 'exceedingly complex'; + reasoning: string; + metadata: { + promptVersion: string; + model: string; + timestamp: Date; + processingTimeMs: number; + }; + _internal: VocabularyComplexity; // Detailed analysis +} +``` + +## Error Handling + +The SDK provides specific error types to help you handle different scenarios: + +```typescript +import { + ValidationError, + APIError, + AuthenticationError, + RateLimitError, + NetworkError, + TimeoutError, +} from '@learning-commons/evaluators'; + +try { + const result = 
await evaluator.evaluate(text, grade); +} catch (error) { + if (error instanceof ValidationError) { + // Invalid input (text too short, invalid grade, etc.) + console.error('Invalid input:', error.message); + } else if (error instanceof AuthenticationError) { + // Invalid API keys + console.error('Check your API keys:', error.message); + } else if (error instanceof RateLimitError) { + // Rate limit exceeded - wait and retry + console.error('Rate limited. Retry after:', error.retryAfter); + } else if (error instanceof NetworkError) { + // Network connectivity issues + console.error('Network error:', error.message); + } else if (error instanceof APIError) { + // Other API errors + console.error('API error:', error.message, 'Status:', error.statusCode); + } +} +``` + +--- + +## Logging + +Control logging verbosity with `logLevel`: + +```typescript +import { VocabularyEvaluator, LogLevel } from '@learning-commons/evaluators'; + +const evaluator = new VocabularyEvaluator({ + googleApiKey: '...', + openaiApiKey: '...', + logLevel: LogLevel.INFO, // SILENT | ERROR | WARN | INFO | DEBUG +}); +``` + +Or provide a custom logger: + +```typescript +import type { Logger } from '@learning-commons/evaluators'; + +const customLogger: Logger = { + debug: (msg, ctx) => myLogger.debug(msg, ctx), + info: (msg, ctx) => myLogger.info(msg, ctx), + warn: (msg, ctx) => myLogger.warn(msg, ctx), + error: (msg, ctx) => myLogger.error(msg, ctx), +}; + +const evaluator = new VocabularyEvaluator({ + googleApiKey: '...', + openaiApiKey: '...', + logger: customLogger, +}); +``` + +--- + +## Telemetry & Privacy + +See [docs/telemetry.md](./docs/telemetry.md) for telemetry configuration and privacy information. 
+ +--- + +## Configuration Options + +All evaluators support these common options: + +```typescript +interface BaseEvaluatorConfig { + maxRetries?: number; // Max API retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Telemetry config (default: true) + logger?: Logger; // Custom logger (optional) + logLevel?: LogLevel; // Console log level (default: WARN) + apiKey?: string; // Learning Commons API key for authenticated telemetry (optional) +} +``` + +--- + +## License + +MIT diff --git a/sdks/typescript/docs/telemetry.md b/sdks/typescript/docs/telemetry.md new file mode 100644 index 0000000..f9f810b --- /dev/null +++ b/sdks/typescript/docs/telemetry.md @@ -0,0 +1,128 @@ +# Telemetry + +## Why We Collect Telemetry + +We use telemetry data to improve evaluator quality, identify edge cases, and optimize performance. This helps us build better tools for our developer partners. + +Telemetry is **anonymous by default**. If you'd like to partner with us to improve your specific use case, you can optionally provide an API key (see Configuration section below). This allows us to connect with you and collaborate more deeply. + +## What We Collect + +**By default, telemetry is enabled** and sends: +- Performance metrics (latency, token usage) +- Metadata (evaluator type, grade, SDK version) +- **Input text** (the text you're evaluating) + +We **never** collect your API keys (only a hashed identifier). 
+ +## Example Telemetry Event + +```json +{ + "timestamp": "2026-02-05T19:30:00.000Z", + "sdk_version": "0.1.0", + "evaluator_type": "vocabulary", + "grade": "5", + "status": "success", + "latency_ms": 3500, + "text_length_chars": 456, + "provider": "google:gemini-2.5-pro+openai:gpt-4o", + "retry_attempts": -1, + "token_usage": { + "input_tokens": 650, + "output_tokens": 350 + }, + "input_text": "The mitochondria is the powerhouse of the cell...", + "metadata": { + "stage_details": [ + { + "stage": "background_knowledge", + "provider": "openai:gpt-4o-2024-11-20", + "latency_ms": 1200, + "retry_attempts": -1, + "token_usage": { + "input_tokens": 250, + "output_tokens": 150 + } + }, + { + "stage": "complexity_evaluation", + "provider": "google:gemini-2.5-pro", + "latency_ms": 2300, + "retry_attempts": -1, + "token_usage": { + "input_tokens": 400, + "output_tokens": 200 + } + } + ] + } +} +``` + +## Field Reference + +| Field | Description | +|-------|-------------| +| `timestamp` | ISO 8601 timestamp when evaluation started | +| `sdk_version` | Version of the SDK (e.g., "0.1.0") | +| `evaluator_type` | Which evaluator ran (e.g., "vocabulary", "sentence-structure") | +| `grade` | Grade level evaluated (e.g., "5", "K") | +| `status` | Evaluation outcome: "success" or "error" | +| `error_code` | Error type if status is "error" (e.g., "Error", "TypeError") | +| `latency_ms` | Total evaluation time in milliseconds | +| `text_length_chars` | Length of input text in characters | +| `provider` | LLM provider(s) used (e.g., "openai:gpt-4o", "google:gemini-2.5-pro+openai:gpt-4o") | +| `retry_attempts` | Number of retries (-1 means unknown, see note below) | +| `token_usage` | Total tokens consumed (input, output, total) | +| `input_text` | The text being evaluated (omitted if `recordInputs: false`) | +| `metadata.stage_details` | Per-stage breakdown for multi-stage evaluators (optional) | + +**Note on `retry_attempts`:** Currently set to `-1` (unknown) as actual retry counts 
are not yet tracked. This field is included for backward compatibility as we plan to add this as a future enhancement. + +## Configuration + +### Default (Anonymous) + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + // telemetry: true (default - anonymous) +}); +``` + +### Partner with Us (Authenticated) + +To help us support your specific use case, provide an API key: + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + apiKey: process.env.LEARNING_COMMONS_API_KEY!, // Contact us for a key +}); +``` + +### Disable Telemetry Completely + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + telemetry: false, // No data sent +}); +``` + +### Disable Input Text Collection + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + telemetry: { + enabled: true, + recordInputs: false, // Only metrics, no text content + }, +}); +``` diff --git a/sdks/typescript/src/errors.ts b/sdks/typescript/src/errors.ts new file mode 100644 index 0000000..2be798c --- /dev/null +++ b/sdks/typescript/src/errors.ts @@ -0,0 +1,241 @@ +/** + * Custom error types for the Evaluators SDK + * + * This module provides a hierarchy of error types to help users + * distinguish between different error scenarios and implement + * appropriate error handling strategies. 
+ */ + +/** + * Base error class for all evaluator errors + */ +export class EvaluatorError extends Error { + constructor( + message: string, + public readonly code?: string + ) { + super(message); + this.name = 'EvaluatorError'; + // Maintains proper stack trace for where error was thrown (only available on V8) + if (Error.captureStackTrace) { + Error.captureStackTrace(this, this.constructor); + } + } +} + +/** + * Validation error - thrown when input validation fails + * These are client-side errors that should NOT be retried + * + * @example + * ```typescript + * try { + * await evaluator.evaluate('', '5'); + * } catch (error) { + * if (error instanceof ValidationError) { + * // Show user-friendly error message + * console.error('Invalid input:', error.message); + * } + * } + * ``` + */ +export class ValidationError extends EvaluatorError { + constructor(message: string) { + super(message, 'VALIDATION_ERROR'); + this.name = 'ValidationError'; + } +} + +/** + * Base API error - thrown when LLM API calls fail + * Contains additional context about the API error + */ +export class APIError extends EvaluatorError { + constructor( + message: string, + public readonly statusCode?: number, + public readonly retryable: boolean = false, + code?: string + ) { + super(message, code); + this.name = 'APIError'; + } +} + +/** + * Authentication error - thrown when API keys are invalid or missing + * HTTP 401 or 403 responses + * Should NOT be retried + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof AuthenticationError) { + * // Prompt user to check API keys + * console.error('Invalid API keys. 
Please check your credentials.'); + * } + * } + * ``` + */ +export class AuthenticationError extends APIError { + constructor(message: string, statusCode?: number) { + super(message, statusCode, false, 'AUTHENTICATION_ERROR'); + this.name = 'AuthenticationError'; + } +} + +/** + * Rate limit error - thrown when API rate limits are exceeded + * HTTP 429 responses + * Should be retried with exponential backoff + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof RateLimitError) { + * // Wait and retry + * await sleep(error.retryAfter || 5000); + * // retry... + * } + * } + * ``` + */ +export class RateLimitError extends APIError { + constructor( + message: string, + public readonly retryAfter?: number // milliseconds + ) { + super(message, 429, true, 'RATE_LIMIT_ERROR'); + this.name = 'RateLimitError'; + } +} + +/** + * Network error - thrown when network requests fail + * Connection timeouts, DNS failures, etc. 
+ * May be retryable depending on the scenario + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof NetworkError) { + * // Check network connection and retry + * console.error('Network error:', error.message); + * } + * } + * ``` + */ +export class NetworkError extends APIError { + constructor(message: string, retryable: boolean = true) { + super(message, undefined, retryable, 'NETWORK_ERROR'); + this.name = 'NetworkError'; + } +} + +/** + * Timeout error - thrown when requests exceed timeout limits + * Should be retried with caution + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof TimeoutError) { + * // Retry with longer timeout or smaller text + * console.error('Request timed out'); + * } + * } + * ``` + */ +export class TimeoutError extends APIError { + constructor(message: string = 'Request timed out') { + super(message, 408, true, 'TIMEOUT_ERROR'); + this.name = 'TimeoutError'; + } +} + +/** + * Parse structured output from LLM provider error + */ +function parseProviderError(error: unknown): { message: string; statusCode?: number; code?: string } { + // Handle Error objects + if (error instanceof Error) { + const message = error.message; + + // Try to extract status code from error message + // Common patterns: "429", "401", "Error 429:", "Status: 429" + const statusMatch = message.match(/\b(4\d{2}|5\d{2})\b/); + const statusCode = statusMatch ? parseInt(statusMatch[1]) : undefined; + + return { + message, + statusCode, + code: error.name !== 'Error' ? 
error.name : undefined, + }; + } + + // Handle unknown error types + return { + message: String(error), + }; +} + +/** + * Wrap a provider error into the appropriate error type + * + * @internal + */ +export function wrapProviderError(error: unknown, defaultMessage: string = 'API request failed'): APIError { + const { message, statusCode, code } = parseProviderError(error); + + // Detect authentication errors (401, 403) + if (statusCode === 401 || statusCode === 403) { + return new AuthenticationError( + message.includes('API key') ? message : 'Invalid API key', + statusCode + ); + } + + // Detect rate limit errors (429) + if (statusCode === 429) { + // Try to extract retry-after if present + const retryAfterMatch = message.match(/retry[- ]after[:\s]+(\d+)/i); + const retryAfter = retryAfterMatch ? parseInt(retryAfterMatch[1]) * 1000 : undefined; + + return new RateLimitError( + message.includes('rate limit') ? message : 'Rate limit exceeded', + retryAfter + ); + } + + // Detect network errors + if ( + message.includes('ECONNREFUSED') || + message.includes('ENOTFOUND') || + message.includes('ETIMEDOUT') || + message.includes('network') || + message.includes('Network') + ) { + return new NetworkError(message); + } + + // Detect timeout errors + if (message.includes('timeout') || message.includes('timed out')) { + return new TimeoutError(message); + } + + // Generic API error for everything else + return new APIError( + message || defaultMessage, + statusCode, + statusCode ? 
statusCode >= 500 : false, // 5xx errors are retryable + code + ); +} diff --git a/sdks/typescript/src/evaluators/base.ts b/sdks/typescript/src/evaluators/base.ts new file mode 100644 index 0000000..65ea428 --- /dev/null +++ b/sdks/typescript/src/evaluators/base.ts @@ -0,0 +1,290 @@ +import { + TelemetryClient, + generateClientId, + getSDKVersion, + type TelemetryMetadata, + type TokenUsage, +} from '../telemetry/index.js'; +import { ValidationError } from '../errors.js'; +import { createLogger, LogLevel, type Logger } from '../logger.js'; + +/** + * Validation constants for input text + */ +export const VALIDATION_LIMITS = { + /** Minimum text length in characters */ + MIN_TEXT_LENGTH: 10, + /** Maximum text length in characters (100K chars ≈ 25K tokens) */ + MAX_TEXT_LENGTH: 100_000, +} as const; + +/** + * Granular telemetry configuration options + */ +export interface TelemetryOptions { + /** Enable telemetry (default: true) */ + enabled?: boolean; + + /** Record input text in telemetry (default: true) */ + recordInputs?: boolean; +} + +/** + * Base configuration for all evaluators + */ +export interface BaseEvaluatorConfig { + /** Google API key (for evaluators using Gemini) */ + googleApiKey?: string; + + /** OpenAI API key (for evaluators using GPT) */ + openaiApiKey?: string; + + /** Learning Commons API key for authenticated telemetry (optional) */ + apiKey?: string; + + /** + * Maximum number of retries for failed API calls (default: 2) + * Set to 0 to disable retries. 
+ * + * Note: With maxRetries=2, a failed call will be attempted up to 3 times total + * (1 initial attempt + 2 retries) + */ + maxRetries?: number; + + /** + * Telemetry configuration (default: all enabled) + * + * Can be: + * - `true`: Enable with defaults (recordInputs: true) + * - `false`: Disable completely + * - `TelemetryOptions`: Granular control + */ + telemetry?: boolean | TelemetryOptions; + + /** + * Custom logger implementation (optional) + * If not provided, uses console logger with specified logLevel + */ + logger?: Logger; + + /** + * Log level for default console logger (default: WARN) + * Only used if custom logger is not provided + * + * - DEBUG: Very verbose, shows all operations + * - INFO: Normal operations + * - WARN: Warnings only (default) + * - ERROR: Errors only + * - SILENT: No logging + */ + logLevel?: LogLevel; +} + +/** + * Abstract base class for all evaluators + * + * Provides common functionality: + * - Telemetry setup and event sending + * - Text validation + * - Grade validation (with overridable default) + * - Metadata creation + */ +export abstract class BaseEvaluator { + protected telemetryClient?: TelemetryClient; + protected logger: Logger; + protected config: Required> & { + telemetry: Required; + }; + + constructor(config: BaseEvaluatorConfig) { + // Initialize logger + this.logger = createLogger(config.logger, config.logLevel ?? LogLevel.WARN); + // Normalize telemetry config + const telemetryConfig = this.normalizeTelemetryConfig(config.telemetry); + + // Set defaults for common config + this.config = { + maxRetries: config.maxRetries ?? 
2, + telemetry: telemetryConfig, + }; + + // Initialize telemetry if enabled + if (this.config.telemetry.enabled) { + // Use all provider keys for client ID generation + const providerKeys = [config.googleApiKey, config.openaiApiKey].filter( + (key): key is string => key !== undefined + ); + + this.telemetryClient = new TelemetryClient({ + endpoint: 'https://api.learningcommons.org/v1/telemetry', + apiKey: config.apiKey, + clientId: generateClientId(...providerKeys), + enabled: true, + }); + } + } + + /** + * Normalize telemetry config to standard format + */ + private normalizeTelemetryConfig( + telemetry: boolean | TelemetryOptions | undefined + ): Required { + // Handle boolean shortcuts + if (telemetry === false) { + return { + enabled: false, + recordInputs: false, + }; + } + + if (telemetry === true || telemetry === undefined) { + return { + enabled: true, + recordInputs: true, + }; + } + + // Handle granular config object + return { + enabled: telemetry.enabled ?? true, + recordInputs: telemetry.recordInputs ?? 
true, + }; + } + + /** + * Get the evaluator type identifier (e.g., "vocabulary", "sentence-structure") + * Must be implemented by concrete evaluators + */ + protected abstract getEvaluatorType(): string; + + /** + * Validate text meets requirements + * Default implementation - can be overridden by concrete evaluators + * + * @throws {Error} If text is invalid + */ + protected validateText(text: string): void { + this.logger.debug('Validating text input', { + evaluator: this.getEvaluatorType(), + operation: 'validateText', + textLength: text.length, + }); + + // Check if text is empty or only whitespace + const trimmedText = text.trim(); + if (!trimmedText) { + const error = new ValidationError( + 'Text cannot be empty or contain only whitespace' + ); + this.logger.error('Text validation failed: empty or whitespace only', { + evaluator: this.getEvaluatorType(), + error, + }); + throw error; + } + + // Check minimum length + if (trimmedText.length < VALIDATION_LIMITS.MIN_TEXT_LENGTH) { + const error = new ValidationError( + `Text is too short. Minimum length is ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters, received ${trimmedText.length} characters` + ); + this.logger.error('Text validation failed: too short', { + evaluator: this.getEvaluatorType(), + error, + minLength: VALIDATION_LIMITS.MIN_TEXT_LENGTH, + actualLength: trimmedText.length, + }); + throw error; + } + + // Check maximum length + if (text.length > VALIDATION_LIMITS.MAX_TEXT_LENGTH) { + const error = new ValidationError( + `Text is too long. 
Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${text.length.toLocaleString()} characters` + ); + this.logger.error('Text validation failed: too long', { + evaluator: this.getEvaluatorType(), + error, + maxLength: VALIDATION_LIMITS.MAX_TEXT_LENGTH, + actualLength: text.length, + }); + throw error; + } + } + + /** + * Validate grade is in supported range + * Default implementation - can be overridden by concrete evaluators + * + * @param grade - Grade level to validate + * @param validGrades - Set of valid grades for this evaluator + * @throws {Error} If grade is invalid + */ + protected validateGrade(grade: string, validGrades: Set): void { + this.logger.debug('Validating grade input', { + evaluator: this.getEvaluatorType(), + operation: 'validateGrade', + grade, + }); + + // Check if grade is in valid set + if (!validGrades.has(grade)) { + const validList = Array.from(validGrades).sort((a, b) => { + // Sort K first, then numerically + if (a === 'K') return -1; + if (b === 'K') return 1; + return parseInt(a) - parseInt(b); + }).join(', '); + + const error = new ValidationError( + `Invalid grade "${grade}". 
Supported grades for this evaluator: ${validList}` + ); + this.logger.error('Grade validation failed: invalid grade', { + evaluator: this.getEvaluatorType(), + error, + providedGrade: grade, + validGrades: validList, + }); + throw error; + } + } + + /** + * Send telemetry event to analytics service + * Common helper for all evaluators + */ + protected async sendTelemetry(params: { + status: 'success' | 'error'; + latencyMs: number; + textLength: number; + grade?: string; + provider: string; + retryAttempts: number; + errorCode?: string; + tokenUsage?: TokenUsage; + metadata?: TelemetryMetadata; + inputText?: string; + }): Promise { + if (!this.telemetryClient) { + return; + } + + await this.telemetryClient.send({ + timestamp: new Date().toISOString(), + sdk_version: getSDKVersion(), + evaluator_type: this.getEvaluatorType(), + grade: params.grade, + status: params.status, + error_code: params.errorCode, + latency_ms: params.latencyMs, + text_length_chars: params.textLength, + provider: params.provider, + retry_attempts: params.retryAttempts, + token_usage: params.tokenUsage, + metadata: params.metadata, + // Include input text only if recording is enabled + input_text: this.config.telemetry.recordInputs ? 
params.inputText : undefined, + }); + } +} diff --git a/sdks/typescript/src/evaluators/index.ts b/sdks/typescript/src/evaluators/index.ts new file mode 100644 index 0000000..2f6ff9d --- /dev/null +++ b/sdks/typescript/src/evaluators/index.ts @@ -0,0 +1,7 @@ +export { BaseEvaluator, type BaseEvaluatorConfig, type TelemetryOptions } from './base.js'; + +export { + VocabularyEvaluator, + evaluateVocabulary, + type VocabularyEvaluatorConfig, +} from './vocabulary.js'; diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts new file mode 100644 index 0000000..b9270bd --- /dev/null +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -0,0 +1,358 @@ +import type { LLMProvider } from '../providers/index.js'; +import { createProvider } from '../providers/index.js'; +import { + VocabularyComplexitySchema, + type VocabularyComplexity, + type BackgroundKnowledge, +} from '../schemas/vocabulary.js'; +import { calculateFleschKincaidGrade } from '../features/index.js'; +import { + getBackgroundKnowledgePrompt, + getSystemPrompt, + getUserPrompt, +} from '../prompts/vocabulary/index.js'; +import type { EvaluationResult } from '../schemas/index.js'; +import { BaseEvaluator, type BaseEvaluatorConfig } from './base.js'; +import type { StageDetail } from '../telemetry/index.js'; +import { ValidationError, wrapProviderError } from '../errors.js'; + +/** + * Valid grade levels (3-12) + */ +const VALID_GRADES = new Set(['3', '4', '5', '6', '7', '8', '9', '10', '11', '12']); + +/** + * Configuration for VocabularyEvaluator + */ +export interface VocabularyEvaluatorConfig extends BaseEvaluatorConfig { + /** Google API key for complexity evaluation (uses Gemini 2.5 Pro) */ + googleApiKey: string; + + /** OpenAI API key for background knowledge generation (uses GPT-4o) */ + openaiApiKey: string; +} + +/** + * Vocabulary Evaluator + * + * Evaluates vocabulary complexity of educational texts relative to grade level. 
+ * Uses a 2-stage process: + * 1. Generate background knowledge assumption for the student's grade level + * 2. Evaluate vocabulary complexity using that background knowledge + * + * Based on Qual Text Complexity rubric (SAP) with 4 levels: + * - Slightly complex + * - Moderately complex + * - Very complex + * - Exceedingly complex + * + * @example + * ```typescript + * const evaluator = new VocabularyEvaluator({ + * googleApiKey: process.env.GOOGLE_API_KEY, + * openaiApiKey: process.env.OPENAI_API_KEY + * }); + * + * const result = await evaluator.evaluate(text, "3"); + * console.log(result.score); // "moderately complex" + * console.log(result.reasoning); + * ``` + */ +export class VocabularyEvaluator extends BaseEvaluator { + private complexityProvider: LLMProvider; + private backgroundKnowledgeProvider: LLMProvider; + private evaluatorConfig: VocabularyEvaluatorConfig; + + constructor(config: VocabularyEvaluatorConfig) { + // Call base constructor for common setup (telemetry, etc.) + super(config); + + // Validate required API keys + if (!config.googleApiKey) { + throw new ValidationError('Google API key is required. Pass googleApiKey in config.'); + } + + if (!config.openaiApiKey) { + throw new ValidationError('OpenAI API key is required. 
Pass openaiApiKey in config.'); + } + + this.evaluatorConfig = config; + + // Create Google Gemini provider for complexity evaluation + this.complexityProvider = createProvider({ + type: 'google', + model: 'gemini-2.5-pro', + apiKey: config.googleApiKey, + maxRetries: this.config.maxRetries, + }); + + // Create OpenAI GPT-4o provider for background knowledge generation + this.backgroundKnowledgeProvider = createProvider({ + type: 'openai', + model: 'gpt-4o-2024-11-20', + apiKey: config.openaiApiKey, + maxRetries: this.config.maxRetries, + }); + } + + // Implement abstract methods from BaseEvaluator + protected getEvaluatorType(): string { + return 'vocabulary'; + } + + /** + * Evaluate vocabulary complexity for a given text and grade level + * + * @param text - The text to evaluate + * @param grade - The target grade level (K-12) + * @returns Evaluation result with complexity score and detailed analysis + * @throws {Error} If text is empty or grade is invalid + */ + async evaluate( + text: string, + grade: string + ): Promise> { + this.logger.info('Starting vocabulary evaluation', { + evaluator: 'vocabulary', + operation: 'evaluate', + grade, + textLength: text.length, + }); + + // Use inherited validation methods + this.validateText(text); + this.validateGrade(grade, VALID_GRADES); + + const startTime = Date.now(); + const stageDetails: StageDetail[] = []; + + try { + this.logger.debug('Stage 1: Generating background knowledge', { + evaluator: 'vocabulary', + operation: 'background_knowledge', + }); + // Stage 1: Generate background knowledge assumption + const bgResponse = await this.getBackgroundKnowledgeAssumption(text, grade); + + stageDetails.push({ + stage: 'background_knowledge', + provider: 'openai:gpt-4o-2024-11-20', + latency_ms: bgResponse.latencyMs, + // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts + // We set -1 to indicate "unknown" (we may have retried, but can't track it) + // To fix: Implement custom retry wrapper that 
tracks each attempt + retry_attempts: -1, + token_usage: { + input_tokens: bgResponse.usage.inputTokens, + output_tokens: bgResponse.usage.outputTokens, + }, + }); + + // Calculate Flesch-Kincaid grade level + const fkLevel = calculateFleschKincaidGrade(text); + + // Stage 2: Evaluate vocabulary complexity + const complexityResponse = await this.evaluateComplexity( + text, + grade, + bgResponse.knowledge.assumption, + fkLevel + ); + + stageDetails.push({ + stage: 'complexity_evaluation', + provider: 'google:gemini-2.5-pro', + latency_ms: complexityResponse.latencyMs, + // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts + // We set -1 to indicate "unknown" (we may have retried, but can't track it) + // To fix: Implement custom retry wrapper that tracks each attempt + retry_attempts: -1, + token_usage: { + input_tokens: complexityResponse.usage.inputTokens, + output_tokens: complexityResponse.usage.outputTokens, + }, + }); + + const latencyMs = Date.now() - startTime; + + // Aggregate token usage + const totalTokenUsage = { + input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0), + output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), + }; + + // If any stage has unknown retries (-1), total is unknown + const totalRetries = stageDetails.some(s => s.retry_attempts === -1) + ? 
-1 + : stageDetails.reduce((sum, s) => sum + s.retry_attempts, 0); + + const result = { + score: complexityResponse.data.complexity_score, + reasoning: complexityResponse.data.reasoning, + metadata: { + promptVersion: '1.0', + model: 'gemini-2.5-pro + gpt-4o-2024-11-20', + timestamp: new Date(), + processingTimeMs: latencyMs, + }, + _internal: complexityResponse.data, + }; + + // Send success telemetry (fire-and-forget) + this.sendTelemetry({ + status: 'success', + latencyMs, + textLength: text.length, + grade, + provider: 'google:gemini-2.5-pro+openai:gpt-4o', + retryAttempts: totalRetries, + tokenUsage: totalTokenUsage, + metadata: { + stage_details: stageDetails, + }, + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + this.logger.info('Vocabulary evaluation completed successfully', { + evaluator: 'vocabulary', + operation: 'evaluate', + grade, + score: result.score, + processingTimeMs: latencyMs, + }); + + return result; + } catch (error) { + const latencyMs = Date.now() - startTime; + + // Log the error + this.logger.error('Vocabulary evaluation failed', { + evaluator: 'vocabulary', + operation: 'evaluate', + grade, + error: error instanceof Error ? error : undefined, + processingTimeMs: latencyMs, + completedStages: stageDetails.length, + }); + + // Aggregate metrics from completed stages + const totalTokenUsage = stageDetails.length > 0 ? { + input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0), + output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), + } : undefined; + + // If any stage has unknown retries (-1), total is unknown + const totalRetries = stageDetails.length > 0 && stageDetails.some(s => s.retry_attempts === -1) + ? 
-1 + : stageDetails.reduce((sum, s) => sum + s.retry_attempts, 0); + + // Send failure telemetry (fire-and-forget) + this.sendTelemetry({ + status: 'error', + latencyMs, + textLength: text.length, + grade, + provider: 'google:gemini-2.5-pro+openai:gpt-4o', + retryAttempts: totalRetries, + tokenUsage: totalTokenUsage, + errorCode: error instanceof Error ? error.name : 'UnknownError', + metadata: stageDetails.length > 0 ? { stage_details: stageDetails } : undefined, + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + // Re-throw validation errors as-is + if (error instanceof ValidationError) { + throw error; + } + + // Wrap provider errors into appropriate error types + throw wrapProviderError(error, 'Vocabulary evaluation failed'); + } + } + + /** + * Stage 1: Generate background knowledge assumption + * + * Estimates what topics the student at the given grade level would be familiar with + * based on Common Core curriculum progression. + */ + private async getBackgroundKnowledgeAssumption( + text: string, + grade: string + ): Promise<{ knowledge: BackgroundKnowledge; usage: { inputTokens: number; outputTokens: number }; latencyMs: number }> { + const prompt = getBackgroundKnowledgePrompt(text, grade); + + const response = await this.backgroundKnowledgeProvider.generateText( + [{ role: 'user', content: prompt }], + 0 // temperature = 0 for consistency + ); + + return { + knowledge: { + assumption: response.text.trim(), + grade, + }, + usage: response.usage, + latencyMs: response.latencyMs, + }; + } + + /** + * Stage 2: Evaluate vocabulary complexity + * + * Uses the Qual Text Complexity rubric (SAP) and background knowledge to evaluate vocabulary complexity + */ + private async evaluateComplexity( + text: string, + grade: string, + backgroundKnowledge: string, + fkLevel: number + ): Promise<{ data: VocabularyComplexity; usage: { inputTokens: number; outputTokens: number }; latencyMs: number }> { + const systemPrompt = getSystemPrompt(grade); 
+ const userPrompt = getUserPrompt(text, grade, backgroundKnowledge, fkLevel); + + const response = await this.complexityProvider.generateStructured({ + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userPrompt }, + ], + schema: VocabularyComplexitySchema, + temperature: 0, + }); + + return { + data: response.data, + usage: response.usage, + latencyMs: response.latencyMs, + }; + } + +} + +/** + * Functional API for vocabulary evaluation + * + * @example + * ```typescript + * const result = await evaluateVocabulary( + * "The mitochondria is the powerhouse of the cell.", + * "3", + * { + * googleApiKey: process.env.GOOGLE_API_KEY, + * openaiApiKey: process.env.OPENAI_API_KEY + * } + * ); + * ``` + */ +export async function evaluateVocabulary( + text: string, + grade: string, + config: VocabularyEvaluatorConfig +): Promise> { + const evaluator = new VocabularyEvaluator(config); + return evaluator.evaluate(text, grade); +} diff --git a/sdks/typescript/src/features/index.ts b/sdks/typescript/src/features/index.ts new file mode 100644 index 0000000..e99c290 --- /dev/null +++ b/sdks/typescript/src/features/index.ts @@ -0,0 +1,4 @@ +export { + calculateFleschKincaidGrade, + calculateReadabilityMetrics, +} from './readability.js'; diff --git a/sdks/typescript/src/features/readability.ts b/sdks/typescript/src/features/readability.ts new file mode 100644 index 0000000..f6d8350 --- /dev/null +++ b/sdks/typescript/src/features/readability.ts @@ -0,0 +1,55 @@ +import nlp from 'compromise'; +import { syllable } from 'syllable'; + +/** + * Calculate Flesch-Kincaid Grade Level + * Equivalent to Python's textstat.flesch_kincaid_grade() + */ +export function calculateFleschKincaidGrade(text: string): number { + const doc = nlp(text); + + const sentences = doc.sentences().length; + const words = doc.terms().length; + + if (sentences === 0 || words === 0) { + return 0; + } + + // Count syllables for all words + const allWords = 
doc.terms().out('array'); + const totalSyllables = allWords.reduce((sum: number, word: string) => { + return sum + syllable(word); + }, 0); + + // Flesch-Kincaid formula + const avgWordsPerSentence = words / sentences; + const avgSyllablesPerWord = totalSyllables / words; + + const fkGrade = 0.39 * avgWordsPerSentence + 11.8 * avgSyllablesPerWord - 15.59; + + return Math.round(fkGrade * 100) / 100; // Round to 2 decimal places +} + +/** + * Additional readability metrics + */ +export function calculateReadabilityMetrics(text: string) { + const doc = nlp(text); + + const sentences = doc.sentences().length; + const words = doc.terms().length; + const characters = text.replace(/\s/g, '').length; + + const allWords = doc.terms().out('array'); + const totalSyllables = allWords.reduce((sum: number, word: string) => sum + syllable(word), 0); + + return { + sentenceCount: sentences, + wordCount: words, + characterCount: characters, + syllableCount: totalSyllables, + avgWordsPerSentence: sentences > 0 ? words / sentences : 0, + avgSyllablesPerWord: words > 0 ? 
totalSyllables / words : 0, + fleschKincaidGrade: calculateFleschKincaidGrade(text), + }; +} diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index cb0ff5c..22d069b 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -1 +1,61 @@ -export {}; +// Core types and schemas +export type { + EvaluationResult, + EvaluationMetadata, + BatchEvaluationResult, + BatchSummary, + EvaluationError, +} from './schemas/index.js'; + +export { ComplexityLevel, GradeLevel, } from './schemas/index.js'; + +// Error types +export { + EvaluatorError, + ValidationError, + APIError, + AuthenticationError, + RateLimitError, + NetworkError, + TimeoutError, + wrapProviderError, +} from './errors.js'; + +// Logger +export type { Logger, LogContext } from './logger.js'; +export { LogLevel, createLogger, formatError } from './logger.js'; + +// Provider exports +export type { + LLMProvider, + LLMRequest, + LLMResponse, + TextGenerationResponse, + Message, + ProviderConfig, +} from './providers/index.js'; + +export { VercelAIProvider, createProvider } from './providers/index.js'; + +// Vocabulary exports +export type { + VocabularyComplexity, + VocabularyComplexityLevel, + BackgroundKnowledge, +} from './schemas/vocabulary.js'; + +export { VocabularyComplexitySchema } from './schemas/vocabulary.js'; + +export { + VocabularyEvaluator, + evaluateVocabulary, + type VocabularyEvaluatorConfig, + type BaseEvaluatorConfig, + type TelemetryOptions, +} from './evaluators/index.js'; + +// Features +export { + calculateFleschKincaidGrade, + calculateReadabilityMetrics, +} from './features/index.js'; diff --git a/sdks/typescript/src/logger.ts b/sdks/typescript/src/logger.ts new file mode 100644 index 0000000..bdc5a1b --- /dev/null +++ b/sdks/typescript/src/logger.ts @@ -0,0 +1,159 @@ +/** + * Logging interface for the Evaluators SDK + * + * Provides structured logging with verbosity levels. + * Users can inject custom loggers or use the default console logger. 
+ */ + +/** + * Log levels in order of verbosity + */ +export enum LogLevel { + /** Debug messages - very verbose, for development */ + DEBUG = 0, + /** Informational messages - normal operations */ + INFO = 1, + /** Warning messages - potentially problematic situations */ + WARN = 2, + /** Error messages - errors that need attention */ + ERROR = 3, + /** Silent - no logging */ + SILENT = 4, +} + +/** + * Context object for structured logging + */ +export interface LogContext { + /** Evaluator type (vocabulary, sentence-structure, etc.) */ + evaluator?: string; + /** Current operation or stage */ + operation?: string; + /** Error object if applicable */ + error?: Error; + /** Additional metadata */ + [key: string]: unknown; +} + +/** + * Logger interface + * + * Implement this interface to provide custom logging behavior. + * + * @example + * ```typescript + * const customLogger: Logger = { + * debug: (msg, ctx) => myLogger.debug(msg, ctx), + * info: (msg, ctx) => myLogger.info(msg, ctx), + * warn: (msg, ctx) => myLogger.warn(msg, ctx), + * error: (msg, ctx) => myLogger.error(msg, ctx), + * }; + * + * const evaluator = new VocabularyEvaluator({ + * googleApiKey: '...', + * openaiApiKey: '...', + * logger: customLogger, + * logLevel: LogLevel.INFO, + * }); + * ``` + */ +export interface Logger { + /** + * Log debug message + * Used for detailed debugging information + */ + debug(message: string, context?: LogContext): void; + + /** + * Log informational message + * Used for normal operations + */ + info(message: string, context?: LogContext): void; + + /** + * Log warning message + * Used for potentially problematic situations + */ + warn(message: string, context?: LogContext): void; + + /** + * Log error message + * Used for errors that need attention + */ + error(message: string, context?: LogContext): void; +} + +/** + * Default console logger implementation + */ +class ConsoleLogger implements Logger { + constructor(private level: LogLevel = LogLevel.WARN) {} + 
+ debug(message: string, context?: LogContext): void { + if (this.level <= LogLevel.DEBUG) { + console.debug(`[DEBUG] ${message}`, context || ''); + } + } + + info(message: string, context?: LogContext): void { + if (this.level <= LogLevel.INFO) { + console.info(`[INFO] ${message}`, context || ''); + } + } + + warn(message: string, context?: LogContext): void { + if (this.level <= LogLevel.WARN) { + console.warn(`[WARN] ${message}`, context || ''); + } + } + + error(message: string, context?: LogContext): void { + if (this.level <= LogLevel.ERROR) { + console.error(`[ERROR] ${message}`, context || ''); + } + } +} + +/** + * Silent logger - logs nothing + */ +class SilentLogger implements Logger { + debug(): void {} + info(): void {} + warn(): void {} + error(): void {} +} + +/** + * Create a logger instance + * + * @param customLogger - Optional custom logger implementation + * @param level - Log level (default: WARN) + * @returns Logger instance + */ +export function createLogger(customLogger?: Logger, level: LogLevel = LogLevel.WARN): Logger { + // Use custom logger if provided + if (customLogger) { + return customLogger; + } + + // Use silent logger if level is SILENT + if (level === LogLevel.SILENT) { + return new SilentLogger(); + } + + // Use console logger with specified level + return new ConsoleLogger(level); +} + +/** + * Format error for logging + * + * @internal + */ +export function formatError(error: unknown): string { + if (error instanceof Error) { + return `${error.name}: ${error.message}`; + } + return String(error); +} diff --git a/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts new file mode 100644 index 0000000..9f108cc --- /dev/null +++ b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts @@ -0,0 +1,16 @@ +import { loadPrompt } from '../../utils/prompts'; + +/** + * Background knowledge prompt template + * Loaded from: 
prompts/vocabulary/background-knowledge.txt + */ +const BACKGROUND_KNOWLEDGE_TEMPLATE = loadPrompt('vocabulary/background-knowledge.txt'); + +/** + * Generate the background knowledge prompt for a given text and grade level + */ +export function getBackgroundKnowledgePrompt(text: string, grade: string): string { + return BACKGROUND_KNOWLEDGE_TEMPLATE + .replace('{grade}', grade) + .replace('{text}', text); +} diff --git a/sdks/typescript/src/prompts/vocabulary/index.ts b/sdks/typescript/src/prompts/vocabulary/index.ts new file mode 100644 index 0000000..47ed85a --- /dev/null +++ b/sdks/typescript/src/prompts/vocabulary/index.ts @@ -0,0 +1,3 @@ +export { getBackgroundKnowledgePrompt } from './background-knowledge.js'; +export { getSystemPrompt } from './system.js'; +export { getUserPrompt } from './user.js'; diff --git a/sdks/typescript/src/prompts/vocabulary/system.ts b/sdks/typescript/src/prompts/vocabulary/system.ts new file mode 100644 index 0000000..634dbad --- /dev/null +++ b/sdks/typescript/src/prompts/vocabulary/system.ts @@ -0,0 +1,28 @@ +import { loadPrompt } from '../../utils/prompts'; + +/** + * System prompt for vocabulary complexity evaluation (Grades 3-4) + * Loaded from: prompts/vocabulary/grades-3-4-system.txt + */ +const SYSTEM_PROMPT_GRADES_3_4 = loadPrompt('vocabulary/grades-3-4-system.txt'); + +/** + * System prompt for vocabulary complexity evaluation (Other grades: K-2, 5-12) + * Loaded from: prompts/vocabulary/other-grades-system.txt + */ +const SYSTEM_PROMPT_OTHER_GRADES = loadPrompt('vocabulary/other-grades-system.txt'); + +/** + * Get the appropriate system prompt based on grade level + * @param grade - The target grade level (K-12) + * @returns The system prompt for the grade level + */ +export function getSystemPrompt(grade: string): string { + // Grades 3-4 use the GRADES_3_4 prompt + if (grade === '3' || grade === '4') { + return SYSTEM_PROMPT_GRADES_3_4; + } + + // All other grades (K, 1, 2, 5-12) use OTHER_GRADES prompt + return 
SYSTEM_PROMPT_OTHER_GRADES; +} diff --git a/sdks/typescript/src/prompts/vocabulary/user.ts b/sdks/typescript/src/prompts/vocabulary/user.ts new file mode 100644 index 0000000..6c06e87 --- /dev/null +++ b/sdks/typescript/src/prompts/vocabulary/user.ts @@ -0,0 +1,39 @@ +import { loadPrompt } from '../../utils/prompts'; + +/** + * User prompt template for vocabulary complexity evaluation (Grades 3-4) + * Loaded from: prompts/vocabulary/grades-3-4-user.txt + */ +const USER_PROMPT_TEMPLATE_GRADES_3_4 = loadPrompt('vocabulary/grades-3-4-user.txt'); + +/** + * User prompt template for vocabulary complexity evaluation (Other grades: K-2, 5-12) + * Loaded from: prompts/vocabulary/other-grades-user.txt + */ +const USER_PROMPT_TEMPLATE_OTHER_GRADES = loadPrompt('vocabulary/other-grades-user.txt'); + +/** + * Generate the user prompt for vocabulary complexity evaluation + * @param text - The text to evaluate + * @param studentGradeLevel - The student's grade level + * @param studentBackgroundKnowledge - Background knowledge assumption + * @param fkLevel - Flesch-Kincaid grade level + * @returns The formatted user prompt + */ +export function getUserPrompt( + text: string, + studentGradeLevel: string, + studentBackgroundKnowledge: string, + fkLevel: number +): string { + // Select the appropriate template based on grade + const template = studentGradeLevel === '3' || studentGradeLevel === '4' + ? 
USER_PROMPT_TEMPLATE_GRADES_3_4 + : USER_PROMPT_TEMPLATE_OTHER_GRADES; + + return template + .replace('{student_grade_level}', studentGradeLevel) + .replace('{student_background_knowledge}', studentBackgroundKnowledge) + .replace('{fk_level}', fkLevel.toString()) + .replace('{text}', text); +} diff --git a/sdks/typescript/src/providers/ai-sdk-provider.ts b/sdks/typescript/src/providers/ai-sdk-provider.ts new file mode 100644 index 0000000..984a572 --- /dev/null +++ b/sdks/typescript/src/providers/ai-sdk-provider.ts @@ -0,0 +1,153 @@ +import { generateText, Output } from 'ai'; +import { createOpenAI } from '@ai-sdk/openai'; +import { createAnthropic } from '@ai-sdk/anthropic'; +import { createGoogleGenerativeAI } from '@ai-sdk/google'; +import type { + LLMProvider, + LLMRequest, + LLMResponse, + Message, + ProviderConfig, +} from './base.js'; + +/** + * Default models for each provider based on Python implementation + */ +const DEFAULT_MODELS = { + openai: 'gpt-4o', + anthropic: 'claude-sonnet-4-5-20250929', + google: 'gemini-2.5-pro', +} as const; + +/** + * Vercel AI SDK provider implementation + * Supports OpenAI, Anthropic, and Google Gemini + */ +export class VercelAIProvider implements LLMProvider { + constructor(private config: ProviderConfig) { + if (config.type === 'custom') { + throw new Error( + 'VercelAIProvider does not support custom type. Use config.customProvider directly.' + ); + } + } + + /** + * Generate structured output using Vercel AI SDK's generateText with output + */ + async generateStructured(request: LLMRequest): Promise> { + const model = this.getModel(request.model); + const startTime = Date.now(); + + try { + const baseParams = { + model, + messages: request.messages, + output: Output.object({ schema: request.schema }), + temperature: request.temperature ?? 0, + maxRetries: this.config.maxRetries ?? 0, + }; + + const params = request.maxTokens !== undefined + ? 
{ ...baseParams, maxTokens: request.maxTokens } + : baseParams; + + const { output, usage } = await generateText(params as Parameters[0]); + + return { + data: output, + model: request.model || this.getDefaultModel(), + usage: { + inputTokens: usage.inputTokens || 0, + outputTokens: usage.outputTokens || 0, + }, + latencyMs: Date.now() - startTime, + }; + } catch (error) { + throw new Error( + `Failed to generate structured output: ${error instanceof Error ? error.message : String(error)}` + ); + } + } + + /** + * Generate plain text using Vercel AI SDK's generateText + */ + async generateText(messages: Message[], temperature?: number): Promise { + const model = this.getModel(); + const startTime = Date.now(); + + try { + const params = { + model, + messages, + temperature: temperature ?? this.config.temperature ?? 0, + maxRetries: this.config.maxRetries ?? 0, + }; + + const { text, usage } = await generateText(params as Parameters[0]); + + return { + text, + usage: { + inputTokens: usage.inputTokens || 0, + outputTokens: usage.outputTokens || 0, + }, + latencyMs: Date.now() - startTime, + }; + } catch (error) { + throw new Error( + `Failed to generate text: ${error instanceof Error ? error.message : String(error)}` + ); + } + } + + /** + * Get the configured language model + */ + private getModel(requestModel?: string) { + const modelId = requestModel || this.config.model || this.getDefaultModel(); + const apiKey = this.config.apiKey; + + switch (this.config.type) { + case 'openai': { + const provider = createOpenAI(apiKey ? { apiKey } : {}); + return provider(modelId); + } + case 'anthropic': { + const provider = createAnthropic(apiKey ? { apiKey } : {}); + return provider(modelId); + } + case 'google': { + const provider = createGoogleGenerativeAI(apiKey ? 
{ apiKey } : {}); + return provider(modelId); + } + default: + throw new Error(`Unsupported provider type: ${this.config.type}`); + } + } + + /** + * Get default model for the configured provider + */ + private getDefaultModel(): string { + const providerType = this.config.type; + + if (providerType === 'custom') { + throw new Error('Cannot get default model for custom provider type'); + } + + return DEFAULT_MODELS[providerType]; + } +} + +/** + * Factory function to create a provider instance + */ +export function createProvider(config: ProviderConfig): LLMProvider { + if (config.type === 'custom' && config.customProvider) { + return config.customProvider; + } + + return new VercelAIProvider(config); +} diff --git a/sdks/typescript/src/providers/base.ts b/sdks/typescript/src/providers/base.ts new file mode 100644 index 0000000..5b6dee9 --- /dev/null +++ b/sdks/typescript/src/providers/base.ts @@ -0,0 +1,73 @@ +import type { z } from 'zod'; + +/** + * Message format for LLM conversations + */ +export interface Message { + role: 'system' | 'user' | 'assistant'; + content: string; +} + +/** + * Request configuration for structured LLM generation + */ +export interface LLMRequest { + messages: Message[]; + schema: z.ZodSchema; + temperature?: number; + maxTokens?: number; + model?: string; +} + +/** + * Response from LLM with usage metadata + */ +export interface LLMResponse { + data: T; + model: string; + usage: { + inputTokens: number; + outputTokens: number; + }; + latencyMs: number; +} + +/** + * Response from plain text generation + */ +export interface TextGenerationResponse { + text: string; + usage: { + inputTokens: number; + outputTokens: number; + }; + latencyMs: number; +} + +/** + * Base interface for LLM provider implementations + */ +export interface LLMProvider { + /** + * Generate structured output from LLM using Zod schema + */ + generateStructured(request: LLMRequest): Promise>; + + /** + * Generate plain text from LLM + */ + generateText(messages: 
Message[], temperature?: number): Promise; +} + +/** + * Configuration for LLM provider + */ +export interface ProviderConfig { + type: 'openai' | 'anthropic' | 'google' | 'custom'; + apiKey?: string; + model?: string; + temperature?: number; + baseURL?: string; + customProvider?: LLMProvider; + maxRetries?: number; +} diff --git a/sdks/typescript/src/providers/index.ts b/sdks/typescript/src/providers/index.ts new file mode 100644 index 0000000..f32e5e3 --- /dev/null +++ b/sdks/typescript/src/providers/index.ts @@ -0,0 +1,10 @@ +export type { + LLMProvider, + LLMRequest, + LLMResponse, + TextGenerationResponse, + Message, + ProviderConfig, +} from './base.js'; + +export { VercelAIProvider, createProvider } from './ai-sdk-provider.js'; diff --git a/sdks/typescript/src/schemas/index.ts b/sdks/typescript/src/schemas/index.ts new file mode 100644 index 0000000..ded6c72 --- /dev/null +++ b/sdks/typescript/src/schemas/index.ts @@ -0,0 +1,10 @@ +export { + ComplexityLevel, + GradeLevel, + type EvaluationResult, + type EvaluationMetadata, + type BatchEvaluationResult, + type BatchSummary, + type EvaluationError, +} from './outputs.js'; + diff --git a/sdks/typescript/src/schemas/outputs.ts b/sdks/typescript/src/schemas/outputs.ts new file mode 100644 index 0000000..9ab807e --- /dev/null +++ b/sdks/typescript/src/schemas/outputs.ts @@ -0,0 +1,75 @@ +import { z } from 'zod'; + +/** + * Complexity levels for sentence structure evaluation + */ +export const ComplexityLevel = z.enum([ + 'Slightly Complex', + 'Moderately Complex', + 'Very Complex', + 'Exceedingly Complex', +]); + +export type ComplexityLevel = z.infer; + +/** + * Grade levels for vocabulary evaluation + */ +export const GradeLevel = z.enum([ + 'Below Grade Level', + 'At Grade Level', + 'Above Grade Level', +]); + +export type GradeLevel = z.infer; + +/** + * Metadata attached to all evaluation results + */ +export interface EvaluationMetadata { + evaluatorVersion?: string; + promptVersion: string; + model: 
string; + timestamp: Date; + processingTimeMs: number; +} + +/** + * Base evaluation result structure + */ +export interface EvaluationResult { + score: TScore; + reasoning: string; + metadata: EvaluationMetadata; + _internal?: TInternal; +} + +/** + * Batch evaluation summary statistics + */ +export interface BatchSummary { + total: number; + successful: number; + failed: number; + averageProcessingTimeMs: number; +} + +/** + * Error type for failed evaluations + */ +export interface EvaluationError { + error: string; + input: { + text: string; + grade?: string; + }; + timestamp: Date; +} + +/** + * Batch evaluation result + */ +export interface BatchEvaluationResult { + results: Array; + summary: BatchSummary; +} diff --git a/sdks/typescript/src/schemas/vocabulary.ts b/sdks/typescript/src/schemas/vocabulary.ts new file mode 100644 index 0000000..f5b80d0 --- /dev/null +++ b/sdks/typescript/src/schemas/vocabulary.ts @@ -0,0 +1,39 @@ +import { z } from 'zod'; + +/** + * Vocabulary complexity levels matching Qual Text Complexity rubric (SAP) + */ +export const VocabularyComplexityLevel = z.enum([ + 'slightly complex', + 'moderately complex', + 'very complex', + 'exceedingly complex', +]); + +export type VocabularyComplexityLevel = z.infer; + +/** + * Vocabulary complexity evaluation output + * Ported from Python Output BaseModel + */ +export const VocabularyComplexitySchema = z.object({ + tier_2_words: z.string().describe('List of Tier 2 words (academic words)'), + tier_3_words: z.string().describe('List of Tier 3 words (domain-specific)'), + archaic_words: z.string().describe('List of Archaic words'), + other_complex_words: z.string().describe('List of Other Complex words'), + complexity_score: VocabularyComplexityLevel.describe( + 'The complexity of the text vocabulary' + ), + reasoning: z.string().describe('Detailed reasoning for the complexity rating'), +}); + +export type VocabularyComplexity = z.infer; + +/** + * Background knowledge assumption for a student at 
a given grade level + * This is generated in Stage 1 and used as input for Stage 2 + */ +export interface BackgroundKnowledge { + assumption: string; + grade: string; +} diff --git a/sdks/typescript/src/telemetry/client.ts b/sdks/typescript/src/telemetry/client.ts new file mode 100644 index 0000000..7dc65f2 --- /dev/null +++ b/sdks/typescript/src/telemetry/client.ts @@ -0,0 +1,61 @@ +import type { TelemetryConfig, TelemetryEvent } from './types.js'; + +/** + * Telemetry client for sending analytics events + * + * Fire-and-forget implementation that never blocks SDK operations. + * Errors are logged but don't fail evaluations. + */ +export class TelemetryClient { + private config: TelemetryConfig; + + constructor(config: TelemetryConfig) { + this.config = config; + } + + /** + * Send telemetry event to analytics service + * + * Fire-and-forget: Errors are logged but don't throw. + */ + async send(event: TelemetryEvent): Promise { + // Skip if telemetry disabled + if (!this.config.enabled) { + return; + } + + try { + const headers: Record = { + 'Content-Type': 'application/json', + 'X-Client-ID': this.config.clientId, + }; + + // Add API key if provided + if (this.config.apiKey) { + headers['X-API-Key'] = this.config.apiKey; + } + + const response = await fetch(this.config.endpoint, { + method: 'POST', + headers, + body: JSON.stringify(event), + // Don't block SDK operations on slow networks + signal: AbortSignal.timeout(5000), // 5 second timeout + }); + + if (!response.ok) { + console.error( + `[Telemetry] Failed to send event: ${response.status} ${response.statusText}` + ); + } + } catch (error) { + // Log error but never throw (fire-and-forget) + if (error instanceof Error) { + // Don't log timeout errors (expected on slow networks) + if (error.name !== 'TimeoutError' && error.name !== 'AbortError') { + console.error('[Telemetry] Error sending event:', error.message); + } + } + } + } +} diff --git a/sdks/typescript/src/telemetry/index.ts 
b/sdks/typescript/src/telemetry/index.ts new file mode 100644 index 0000000..ae1cbb7 --- /dev/null +++ b/sdks/typescript/src/telemetry/index.ts @@ -0,0 +1,10 @@ +export { TelemetryClient } from './client.js'; +export { generateClientId, getSDKVersion } from './utils.js'; +export type { + TelemetryConfig, + TelemetryEvent, + EvaluationStatus, + TokenUsage, + StageDetail, + TelemetryMetadata, +} from './types.js'; diff --git a/sdks/typescript/src/telemetry/types.ts b/sdks/typescript/src/telemetry/types.ts new file mode 100644 index 0000000..6f3abf8 --- /dev/null +++ b/sdks/typescript/src/telemetry/types.ts @@ -0,0 +1,105 @@ +// TODO: Generate these types from the telemetry service OpenAPI/JSON Schema +// instead of maintaining them manually. This will prevent drift between +// client and server schemas. + +/** + * Evaluation status + */ +export type EvaluationStatus = 'success' | 'error'; + +/** + * Token usage metrics from LLM providers + */ +export interface TokenUsage { + input_tokens: number; + output_tokens: number; +} + +/** + * Per-stage details for multi-stage evaluations + */ +export interface StageDetail { + /** Stage name (e.g., "background_knowledge", "complexity_evaluation") */ + stage: string; + + /** Provider used for this stage (e.g., "openai:gpt-4o") */ + provider: string; + + /** Total latency including all retries (ms) */ + latency_ms: number; + + /** + * Number of retries for this stage + * + * IMPORTANT: Currently set to -1 (unknown) because Vercel AI SDK doesn't expose + * actual retry attempts. We may have retried, but can't track it. + * + * Values: + * - -1 = unknown (current implementation) + * - 0+ = known retry count (requires custom retry wrapper) + * + * Note: Token usage and costs only reflect the final successful attempt. + * Failed retry attempts are not included due to SDK limitations. 
+ */ + retry_attempts: number; + + /** Token usage aggregated across all attempts */ + token_usage?: TokenUsage; + + /** + * Whether schema validation failed (indicates prompt needs clearer instructions) + * + * TODO: Not currently tracked. Vercel AI SDK abstracts validation away. + * To implement: Add custom retry wrapper that catches validation errors. + */ + schema_validation_failed?: boolean; +} + +/** + * Extensible metadata for telemetry events + */ +export interface TelemetryMetadata { + /** Detailed breakdown by stage (for multi-stage evaluations) */ + stage_details?: StageDetail[]; + + // Future fields can be added here: + // cache_hit?: boolean; + // prompt_tokens_breakdown?: {...}; + // etc. +} + +/** + * Telemetry event payload + */ +export interface TelemetryEvent { + timestamp: string; + sdk_version: string; + evaluator_type: string; + grade?: string; + status: EvaluationStatus; + error_code?: string; + latency_ms: number; + text_length_chars: number; + provider: string; // Format: "provider:model" or "provider1+provider2" for multi-provider + retry_attempts: number; // Total retries across all stages (-1 = unknown, see StageDetail docs) + token_usage?: TokenUsage; // Aggregated across all stages and attempts + metadata?: TelemetryMetadata; // Optional per-stage breakdown + input_text?: string; // Input text (only if recordInputs enabled) +} + +/** + * Configuration for telemetry client + */ +export interface TelemetryConfig { + /** Analytics service endpoint URL */ + endpoint: string; + + /** Learning Commons API key (optional, sent as X-API-Key header) */ + apiKey?: string; + + /** Client ID for anonymous tracking (sha256 of LLM API keys) */ + clientId: string; + + /** Enable telemetry (default: true) */ + enabled: boolean; +} diff --git a/sdks/typescript/src/telemetry/utils.ts b/sdks/typescript/src/telemetry/utils.ts new file mode 100644 index 0000000..4a912b1 --- /dev/null +++ b/sdks/typescript/src/telemetry/utils.ts @@ -0,0 +1,63 @@ +import { 
createHash } from 'crypto'; +import { readFileSync } from 'fs'; +import { join, dirname } from 'path'; +import { fileURLToPath } from 'url'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +/** + * Generate client ID for anonymous tracking + * + * Creates SHA256 hash of API keys to create consistent identifier + * across requests while maintaining anonymity. + * + * @param apiKeys - Array of API keys to hash + * @returns SHA256 hex string + */ +export function generateClientId(...apiKeys: (string | undefined)[]): string { + // Filter out undefined keys and sort for consistency + const keys = apiKeys.filter((k): k is string => k !== undefined).sort(); + + // If no keys provided, generate random ID for this session + if (keys.length === 0) { + return createHash('sha256') + .update(Math.random().toString()) + .digest('hex'); + } + + // Hash the concatenated keys with delimiter to prevent collisions + return createHash('sha256') + .update(keys.join('|')) + .digest('hex'); +} + +let cachedVersion: string | undefined; + +/** + * Get SDK version from package.json + */ +export function getSDKVersion(): string { + if (cachedVersion) { + return cachedVersion; + } + + const possiblePaths = [ + join(__dirname, '../../package.json'), // From src/ + join(__dirname, '../package.json'), // From dist/ + ]; + + for (const path of possiblePaths) { + try { + const pkg = JSON.parse(readFileSync(path, 'utf-8')) as { version?: string }; + cachedVersion = pkg.version || '0.0.0'; + return cachedVersion; + } catch { + continue; + } + } + + // Fallback if no package.json found + cachedVersion = '0.0.0'; + return cachedVersion; +} diff --git a/sdks/typescript/src/utils/prompts.ts b/sdks/typescript/src/utils/prompts.ts new file mode 100644 index 0000000..757f2e9 --- /dev/null +++ b/sdks/typescript/src/utils/prompts.ts @@ -0,0 +1,21 @@ +import { readFileSync } from 'fs'; +import { fileURLToPath } from 'url'; +import { dirname, join } from 'path'; 
+ +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +// Path to prompts directory +// When in src/utils/, go up one level. When bundled in dist/, stay in current dir. +const PROMPTS_DIR = __dirname.endsWith('utils') + ? join(dirname(__dirname), 'prompts') + : join(__dirname, 'prompts'); + +/** + * Load a prompt file from the prompts directory + * @param relativePath - Path relative to prompts directory (e.g., 'vocabulary/grades-3-4-system.txt') + * @returns The prompt file contents as a string + */ +export function loadPrompt(relativePath: string): string { + return readFileSync(join(PROMPTS_DIR, relativePath), 'utf-8'); +} diff --git a/sdks/typescript/tests/README.md b/sdks/typescript/tests/README.md new file mode 100644 index 0000000..e5499ba --- /dev/null +++ b/sdks/typescript/tests/README.md @@ -0,0 +1,221 @@ +# Test Suite + +This directory contains unit and integration tests for the Evaluators SDK. + +## Structure + +``` +tests/ +├── unit/ # Fast tests, no API calls +├── integration/ # Real API calls +└── utils/ # Shared test utilities +``` + +## Running Tests + +### Unit Tests +```bash +npm run test:unit # Fast, no API keys needed +``` + +### Integration Tests +Requires API keys in `.env`: +```bash +OPENAI_API_KEY=sk-... +GOOGLE_API_KEY=... +``` + +Run tests: +```bash +RUN_INTEGRATION_TESTS=true npm run test:integration +``` + +### All Tests +```bash +RUN_INTEGRATION_TESTS=true npm run test:all +``` + +### CI Tests +```bash +npm run test:ci # Tests built dist/ package +``` + +## Key Patterns + +### 1. Acceptable Values for LLM Non-Determinism + +LLMs are non-deterministic. 
Tests use **expected** values with **acceptable** adjacent values: + +```typescript +{ + id: 'V3', + grade: '3', + text: 'Sample text...', + expected: 'very complex', // Try to match this first + acceptable: ['moderately complex'], // Accept if no expected match +} +``` + +**Strategy:** +- Try up to 3 attempts to match expected value (short-circuit on match) +- If no expected match, check if any result is in acceptable range +- Pass test if either expected or acceptable match found + +### 2. Parallel Test Execution + +All tests run concurrently using `it.concurrent()`: + +```typescript +describeIntegration.concurrent('Test Suite', () => { + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}`, async () => { + // Test runs in parallel + }, TEST_TIMEOUT_MS); + }); +}); +``` + +**Benefits**: 3-4x faster test execution + +### 3. Buffered Logging + +Logs are buffered and printed atomically to prevent interleaving: + +```typescript +const logBuffer: string[] = []; +logBuffer.push('Test output...'); +// ... collect all logs +console.log(logBuffer.join('\n')); // Print once at end +``` + +## Writing New Integration Tests + +### Basic Template + +```typescript +import { describe, it, expect, beforeAll } from 'vitest'; +import { MyEvaluator } from '../../src/evaluators/my-evaluator.js'; +import { runEvaluatorTest, type BaseTestCase } from '../utils/index.js'; +import { config } from 'dotenv'; + +config(); + +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && !process.env.MY_API_KEY; +const describeIntegration = SKIP_INTEGRATION ? 
describe.skip : describe; +const TEST_TIMEOUT_MS = 2 * 60 * 1000; // 2 minutes + +const TEST_CASES: BaseTestCase[] = [ + { + id: 'TEST1', + grade: '3', // Optional, if evaluator needs it + text: 'Sample text...', + expected: 'expected result', + acceptable: ['acceptable alternative'], + }, +]; + +describeIntegration.concurrent('My Evaluator - Test Suite', () => { + let evaluator: MyEvaluator; + + beforeAll(() => { + if (SKIP_INTEGRATION) { + console.log('⏭️ Skipping integration tests'); + return; + } + + evaluator = new MyEvaluator({ + apiKey: process.env.MY_API_KEY!, + retry: false, // We handle retries in test logic + }); + }); + + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}: ${testCase.expected}`, async () => { + const logBuffer: string[] = []; + + logBuffer.push('\n' + '='.repeat(80)); + logBuffer.push(`Test Case ${testCase.id}`); + logBuffer.push('='.repeat(80)); + + const maxAttempts = 3; + const result = await runEvaluatorTest(testCase, { + evaluator, + extractResult: (r) => r.score, // Extract the field to compare + maxAttempts, + }); + + logBuffer.push(...result.logs); + console.log(logBuffer.join('\n')); + + expect(result.matched).toBe(true); + expect(result.matchedOnAttempt).toBeLessThanOrEqual(maxAttempts); + }, TEST_TIMEOUT_MS); + }); +}); +``` + +### Test Configuration + +```typescript +// Test timeout (2 minutes per test) +const TEST_TIMEOUT_MS = 2 * 60 * 1000; + +// Max retry attempts +const maxAttempts = 3; + +// Skip integration tests if no API keys +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && !process.env.API_KEY; +``` + +## Test Utilities + +### `runEvaluatorTest(testCase, config)` + +Generic test runner for all evaluators: + +```typescript +const result = await runEvaluatorTest(testCase, { + evaluator: myEvaluator, + extractResult: (r) => r.score, // How to extract result from evaluation + maxAttempts: 3, // Default: 3 +}); + +// Result structure +interface TestResult { + matched: boolean; // Did test 
pass? + matchedOnAttempt?: number; // Which attempt matched? + matchType?: 'expected' | 'acceptable'; // How did it match? + totalAttempts: number; + allResults: string[]; // All attempt results + logs: string[]; // Buffered log messages +} +``` + +## Test Strategy + +### Local Development +Tests run against `src/` with prompts copied from `../../evals/prompts/`: +```bash +npm run test:unit +npm run test:integration +``` + +### CI/CD +Tests run against built `dist/` package to validate published code: +```bash +npm run test:ci +``` + +## Troubleshooting + +**Tests skipped?** +- Check API keys: `echo $OPENAI_API_KEY` +- Set: `RUN_INTEGRATION_TESTS=true npm run test:integration` + +**Tests timeout?** +- Increase `TEST_TIMEOUT_MS = 3 * 60 * 1000` (3 minutes) + +**Tests flaky?** +- Add more acceptable values based on actual LLM output +- Increase `maxAttempts` from 3 to 5 +- Check if test case is ambiguous diff --git a/sdks/typescript/tests/integration/vocabulary.integration.test.ts b/sdks/typescript/tests/integration/vocabulary.integration.test.ts new file mode 100644 index 0000000..3bfd478 --- /dev/null +++ b/sdks/typescript/tests/integration/vocabulary.integration.test.ts @@ -0,0 +1,146 @@ +import { describe, it, expect, beforeAll } from 'vitest'; +import { VocabularyEvaluator } from '../../src/evaluators/vocabulary.js'; +import { config } from 'dotenv'; +import { + runEvaluatorTest, + type BaseTestCase, +} from '../utils/index.js'; + +// Load .env file for testing convenience +config(); + +/** + * Vocabulary Evaluator Integration Tests + * + * Test cases cover grades 3-9 with varying complexity levels. + * + * Each test uses a retry mechanism (up to 3 attempts) to account for LLM non-determinism, + * with short-circuiting on first expected match. If no expected match is found after all + * attempts, the test checks if any result falls within the acceptable value range. 
+ * + * To run these tests: + * ```bash + * RUN_INTEGRATION_TESTS=true npm run test:integration + * ``` + */ + +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && + (!process.env.OPENAI_API_KEY || !process.env.GOOGLE_API_KEY); + +const describeIntegration = SKIP_INTEGRATION ? describe.skip : describe; + +// Test timeout: 2 minutes per test case (allows for 3 attempts with API latency) +const TEST_TIMEOUT_MS = 2 * 60 * 1000; + +// Test cases from PR #6 +const TEST_CASES: BaseTestCase[] = [ + { + id: 'V3', + grade: '3', + text: 'Civil rights are rights that all people in a country have. The civil rights of a country apply to all the citizens within its borders. These rights are given by the laws of the country. Civil rights are sometimes thought to be the same as natural rights. In many countries civil rights include freedom of speech, freedom of the press, freedom of religion, and freedom of assembly. Civil rights also include the right to own property and the right to get fair and equal treatment from the government, from other citizens, and from private groups.', + expected: 'very complex', + acceptable: ['moderately complex', 'exceedingly complex'], + }, + { + id: 'V4', + grade: '4', + text: 'Bluetooth is a protocol for wireless communication over short distances. It was developed in the 1990s, to reduce the number of cables. Devices such as mobile phones, laptops, PCs, printers, digital cameras and video game consoles can connect to each other, and exchange information. This is done using radio waves. It can be done securely. Bluetooth is only used for relatively short distances, like a few metres. There are different standards. Data rates vary. Currently, they are at 1-3 MBit per second.', + expected: 'exceedingly complex', + acceptable: ['very complex'], + }, + { + id: 'V5', + grade: '5', + text: `The scientific method is a way to learn about the world around us. It helps us figure out how things work. 
Scientists use the scientific method to test their ideas. They start by making observations and asking questions. Then, they make a guess, or a hypothesis, about what might be the answer. They use their hypothesis to make predictions about what will happen in an experiment. Scientists then test their predictions by doing experiments. If the results of the experiment match their predictions, then their hypothesis is supported. If the results don't match, then they need to change their hypothesis. Scientists repeat this process many times to make sure their hypothesis is correct. The scientific method is important because it helps us learn new things. It helps us understand the world around us. Scientists use the scientific method to make new discoveries and solve problems.`, + expected: 'slightly complex', + acceptable: ['moderately complex'], + }, + { + id: 'V6', + grade: '6', + text: `Chicago in 1871 was a city ready to burn. The city boasted having 59,500 buildings, many of them—such as the Courthouse and the Tribune Building—large and ornately decorated. The trouble was that about two-thirds of all these structures were made entirely of wood. Many of the remaining buildings (even the ones proclaimed to be 'fireproof') looked solid, but were actually jerrybuilt affairs; the stone or brick exteriors hid wooden frames and floors, all topped with highly flammable tar or shingle roofs. It was also a common practice to disguise wood as another kind of building material. The fancy exterior decorations on just about every building were carved from wood, then painted to look like stone or marble.`, + expected: 'very complex', + acceptable: ['moderately complex', 'exceedingly complex'], + }, + { + id: 'V7', + grade: '7', + text: `The scientific method is a way of learning about the world around us. It's a process that helps us understand how things work and why they happen. 
It's not just for scientists; we all use the scientific method in our everyday lives, even if we don't realize it. The scientific method starts with an observation. We notice something interesting and want to know more about it. For example, you might notice that your plant is wilting. You might wonder why this is happening. Next, we form a hypothesis, which is a possible explanation for our observation. In our plant example, you might hypothesize that the plant is wilting because it needs more water. Then, we test our hypothesis by doing an experiment. We change something in our experiment to see if it affects the outcome. In our plant example, you could water the plant and see if it recovers. Based on the results of our experiment, we can either support or reject our hypothesis. If the plant recovers after being watered, then your hypothesis is supported. If the plant doesn't recover, then you need to come up with a new hypothesis. The scientific method is a powerful tool for learning and understanding the world around us. It's a process of asking questions, testing ideas, and drawing conclusions based on evidence. It's a way of thinking that helps us to be curious, to be critical, and to be open to new ideas.`, + expected: 'slightly complex', + acceptable: ['moderately complex'], + }, + { + id: 'V8', + grade: '8', + text: 'The American Revolution was a war for independence between the thirteen American colonies and Great Britain. The war started in 1775 and ended in 1783. The colonists wanted to be free from British rule. They wanted to make their own laws and govern themselves. The colonists were angry about new taxes that the British Parliament imposed on them. They felt that they were being taxed without having a say in how the money was spent. The colonists also felt that the British government was not treating them fairly. The war began with the Battles of Lexington and Concord in April 1775. 
The colonists, led by General George Washington, fought against the British army. The war was long and difficult, but the colonists eventually won. The colonists won the war because they had the support of the French. The French helped the colonists by providing them with soldiers, ships, and money. The colonists also had a strong leader in George Washington. He was a skilled military leader and he inspired the colonists to fight for their freedom. The American Revolution was a turning point in history. It showed that colonies could break free from their mother countries and become independent nations. The American Revolution also inspired other revolutions around the world.', + expected: 'slightly complex', + acceptable: ['moderately complex'], + }, + { + id: 'V9', + grade: '9', + text: `Mr. President: I would like to speak briefly and simply about a serious national condition. It is a national feeling of fear and frustration that could result in national suicide and the end of everything that we Americans hold dear. It is a condition that comes from the lack of effective leadership in either the Legislative Branch or the Executive Branch of our Government. That leadership is so lacking that serious and responsible proposals are being made that national advisory commissions be appointed to provide such critically needed leadership. I speak as briefly as possible because too much harm has already been done with irresponsible words of bitterness and selfish political opportunism. I speak as briefly as possible because the issue is too great to be obscured by eloquence. I speak simply and briefly in the hope that my words will be taken to heart. I speak as a Republican. I speak as a woman. I speak as a United States Senator. I speak as an American. The United States Senate has long enjoyed worldwide respect as the greatest deliberative body in the world. 
But recently that deliberative character has too often been debased to the level of a forum of hate and character assassination sheltered by the shield of congressional immunity. It is ironical that we Senators can in debate in the Senate directly or indirectly, by any form of words, impute to any American who is not a Senator any conduct or motive unworthy or unbecoming an American—and without that non-Senator American having any legal redress against us—yet if we say the same thing in the Senate about our colleagues we can be stopped on the grounds of being out of order. It is strange that we can verbally attack anyone else without restraint and with full protection and yet we hold ourselves above the same type of criticism here on the Senate Floor. Surely the United States Senate is big enough to take self-criticism and self-appraisal. Surely we should be able to take the same kind of character attacks that we "dish out" to outsiders. I think that it is high time for the United States Senate and its members to do some soul-searching—for us to weigh our consciences—on the manner in which we are performing our duty to the people of America—on the manner in which we are using or abusing our individual powers and privileges. I think that it is high time that we remembered that we have sworn to uphold and defend the Constitution. I think that it is high time that we remembered that the Constitution, as amended, speaks not only of the freedom of speech but also of trial by jury instead of trial by accusation. Whether it be a criminal prosecution in court or a character prosecution in the Senate, there is little practical distinction when the life of a person has been ruined. 
Those of us who shout the loudest about Americanism in making character assassinations are all too frequently those who, by our own words and acts, ignore some of the basic principles of Americanism: The right to criticize; The right to hold unpopular beliefs; The right to protest; The right of independent thought. The exercise of these rights should not cost one single American citizen his reputation or his right to a livelihood nor should he be in danger of losing his reputation or livelihood merely because he happens to know someone who holds unpopular beliefs. Who of us doesn't? Otherwise none of us could call our souls our own. Otherwise thought control would have set in. The American people are sick and tired of being afraid to speak their minds lest they be politically smeared as "Communists" or "Fascists" by their opponents. Freedom of speech is not what it used to be in America. It has been so abused by some that it is not exercised by others. The American people are sick and tired of seeing innocent people smeared and guilty people whitewashed. But there have been enough proved cases, such as the Amerasia case, the Hiss case, the Coplon case, the Gold case, to cause the nationwide distrust and strong suspicion that there may be something to the unproved, sensational accusations. I doubt if the Republican Party could—simply because I don't believe the American people will uphold any political party that puts political exploitation above national interest. Surely we Republicans aren't that desperate for victory. I don't want to see the Republican Party win that way. While it might be a fleeting victory for the Republican Party, it would be a more lasting defeat for the American people. Surely it would ultimately be suicide for the Republican Party and the two-party system that has protected our American liberties from the dictatorship of a one-party system. 
As members of the Minority Party, we do not have the primary authority to formulate the policy of our Government. But we do have the responsibility of rendering constructive criticism, of clarifying issues, of allaying fears by acting as responsible citizens. As a woman, I wonder how the mothers, wives, sisters, and daughters feel about the way in which members of their families have been politically mangled in the Senate debate—and I use the word "debate" advisedly. As a United States Senator, I am not proud of the way in which the Senate has been made a publicity platform for irresponsible sensationalism. I am not proud of the reckless abandon in which unproved charges have been hurled from the side of the aisle. I am not proud of the obviously staged, undignified countercharges that have been attempted in retaliation from the other side of the aisle. I don't like the way the Senate has been made a rendezvous for vilification, for selfish political gain at the sacrifice of individual reputations and national unity. I am not proud of the way we smear outsiders from the Floor of the Senate and hide behind the cloak of congressional immunity and still place ourselves beyond criticism on the Floor of the Senate. As an American, I am shocked at the way Republicans and Democrats alike are playing directly into the Communist design of "confuse, divide, and conquer." As an American, I don't want a Democratic Administration "whitewash" or "cover-up" any more than I want a Republican smear or witch hunt. As an American, I condemn a Republican "Fascist" just as much I condemn a Democratic "Communist." I condemn a Democrat "Fascist" just as much as I condemn a Republican "Communist." They are equally dangerous to you and me and to our country. As an American, I want to see our nation recapture the strength and unity it once had when we fought the enemy instead of ourselves. It is with these thoughts that I have drafted what I call a "Declaration of Conscience." 
I am gratified that Senator Tobey, Senator Aiken, Senator Morse, Senator Ives, Senator Thye, and Senator Hendrickson have concurred in that declaration and have authorized me to announce their concurrence.`, + expected: 'very complex', + acceptable: ['moderately complex', 'exceedingly complex'], + }, +]; + +describeIntegration.concurrent('Vocabulary Evaluator - Comprehensive Test Suite', () => { + let evaluator: VocabularyEvaluator; + + beforeAll(() => { + if (SKIP_INTEGRATION) { + console.log('⏭️ Skipping integration tests (no API keys or RUN_INTEGRATION_TESTS not set)'); + return; + } + + evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + retry: false, // We handle retries in the test logic + }); + + console.log('\n' + '='.repeat(80)); + console.log('VOCABULARY EVALUATOR - TEST SUITE (PARALLEL)'); + console.log('='.repeat(80)); + console.log(`Running ${TEST_CASES.length} test cases with up to 3 attempts each`); + console.log('Short-circuiting on first expected match'); + console.log('Checking acceptable values if no expected match'); + console.log('='.repeat(80)); + }); + + // Generate individual test for each case + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}: Grade ${testCase.grade} - ${testCase.expected}`, async () => { + // Buffer all logs to print atomically at the end (prevents interleaving in parallel tests) + const logBuffer: string[] = []; + + // Test header + logBuffer.push('\n' + '='.repeat(80)); + logBuffer.push(`Test Case ${testCase.id} | Grade: ${testCase.grade}`); + logBuffer.push('='.repeat(80)); + logBuffer.push(`Expected Complexity: ${testCase.expected}`); + logBuffer.push(`Text Preview: ${testCase.text.substring(0, 100)}...`); + logBuffer.push(''); + + // Run the evaluation (returns logs instead of printing) + const maxAttempts = 3; + const result = await runEvaluatorTest(testCase, { + evaluator, + extractResult: (r) => r.score, + maxAttempts, 
+ }); + + // Add evaluation logs to buffer (includes detailed summary) + logBuffer.push(...result.logs); + + // Print all logs atomically at the end - single console.log to prevent interleaving + console.log(logBuffer.join('\n')); + + // Assert that we got a match within maxAttempts (expected or acceptable) + expect(result.matched).toBe(true); + expect(result.matchedOnAttempt).toBeDefined(); + expect(result.matchedOnAttempt).toBeLessThanOrEqual(maxAttempts); + }, TEST_TIMEOUT_MS); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/validation.test.ts b/sdks/typescript/tests/unit/evaluators/validation.test.ts new file mode 100644 index 0000000..b273b8a --- /dev/null +++ b/sdks/typescript/tests/unit/evaluators/validation.test.ts @@ -0,0 +1,125 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { VocabularyEvaluator } from '../../../src/evaluators/vocabulary.js'; +import { VALIDATION_LIMITS } from '../../../src/evaluators/base.js'; +import type { LLMProvider } from '../../../src/providers/base.js'; + +/** + * Comprehensive validation tests for input validation + * + * Tests the base evaluator validation logic that all evaluators inherit. + * Uses VocabularyEvaluator as the test subject since it extends BaseEvaluator. + * + * All tests use mocked providers to avoid real API calls. 
+ */ + +// Mock providers +const createMockProvider = (): LLMProvider => ({ + generateStructured: vi.fn(), + generateText: vi.fn(), +}); + +// Mock the createProvider factory +vi.mock('../../../src/providers/index.js', () => ({ + createProvider: vi.fn(() => createMockProvider()), +})); + +// Mock telemetry to avoid real HTTP calls +vi.mock('../../../src/telemetry/client.js', () => { + return { + TelemetryClient: class MockTelemetryClient { + send = vi.fn().mockResolvedValue(undefined); + }, + }; +}); + +describe('Input Validation - Text Validation', () => { + let evaluator: VocabularyEvaluator; + + beforeEach(() => { + vi.clearAllMocks(); + + evaluator = new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: 'test-openai-key', + telemetry: false, + }); + }); + + describe('Empty text validation', () => { + it.each([ + ['empty string', ''], + ['spaces only', ' '], + ['tabs only', '\t\t\t'], + ['newlines only', '\n\n\n'], + ['mixed whitespace', ' \t\n '], + ])('should reject %s', async (_label, text) => { + await expect(evaluator.evaluate(text, '5')) + .rejects.toThrow('Text cannot be empty or contain only whitespace'); + }); + }); + + describe('Minimum length validation', () => { + it(`should reject text shorter than ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters`, async () => { + const shortText = 'Hello wo'; // 8 chars after trim + await expect(evaluator.evaluate(shortText, '5')) + .rejects.toThrow(`Text is too short. Minimum length is ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters, received 8 characters`); + }); + }); + + describe('Maximum length validation', () => { + it(`should reject text longer than ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters`, async () => { + const longText = 'a'.repeat(VALIDATION_LIMITS.MAX_TEXT_LENGTH + 1); + + await expect(evaluator.evaluate(longText, '5')) + .rejects.toThrow(new RegExp(`Text is too long\\. 
Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${(VALIDATION_LIMITS.MAX_TEXT_LENGTH + 1).toLocaleString()} characters`)); + }); + }); +}); + +describe('Input Validation - Grade Validation', () => { + let evaluator: VocabularyEvaluator; + + beforeEach(() => { + vi.clearAllMocks(); + + evaluator = new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: 'test-openai-key', + telemetry: false, + }); + }); + + describe('Valid grade range', () => { + it.each([ + ['K', 'K'], + ['1', '1'], + ['2', '2'], + ])('should reject grade %s (below minimum)', async (_label, grade) => { + const validText = 'This is a sample text for testing.'; + + await expect(evaluator.evaluate(validText, grade)) + .rejects.toThrow(`Invalid grade "${grade}". Supported grades for this evaluator: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12`); + }); + + it.each([ + ['13', '13'], + ['99', '99'], + ])('should reject grade %s (above maximum)', async (_label, grade) => { + const validText = 'This is a sample text for testing.'; + + await expect(evaluator.evaluate(validText, grade)) + .rejects.toThrow(`Invalid grade "${grade}". Supported grades for this evaluator: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12`); + }); + + it.each([ + ['invalid', 'invalid'], + ['grade5', 'grade5'], + ['empty string', ''], + ])('should reject grade %s (invalid format)', async (_label, grade) => { + const validText = 'This is a sample text for testing.'; + + await expect(evaluator.evaluate(validText, grade)) + .rejects.toThrow(`Invalid grade "${grade}". 
Supported grades for this evaluator: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12`); + }); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts new file mode 100644 index 0000000..6b75336 --- /dev/null +++ b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts @@ -0,0 +1,249 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { VocabularyEvaluator } from '../../../src/evaluators/vocabulary.js'; +import type { LLMProvider } from '../../../src/providers/base.js'; + +/** + * Comprehensive unit tests for VocabularyEvaluator + * + * These tests verify: + * - Constructor validation + * - Successful evaluation flow (both stages) + * - Error handling (LLM failures, validation errors) + * - Telemetry behavior (success/error cases) + * - Token usage aggregation + * - Edge cases + */ + +// Mock providers +const createMockProvider = (): LLMProvider => ({ + generateStructured: vi.fn(), + generateText: vi.fn(), +}); + +// Mock the createProvider factory +vi.mock('../../../src/providers/index.js', () => ({ + createProvider: vi.fn(() => createMockProvider()), +})); + +// Mock telemetry to avoid real HTTP calls +vi.mock('../../../src/telemetry/client.js', () => { + return { + TelemetryClient: class MockTelemetryClient { + send = vi.fn().mockResolvedValue(undefined); + }, + }; +}); + +describe('VocabularyEvaluator - Constructor Validation', () => { + it('should throw error when Google API key is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: '', + openaiApiKey: 'test-openai-key', + })).toThrow('Google API key is required. Pass googleApiKey in config.'); + }); + + it('should throw error when OpenAI API key is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: '', + })).toThrow('OpenAI API key is required. 
Pass openaiApiKey in config.'); + }); + +}); + +describe('VocabularyEvaluator - Evaluation Flow', () => { + let evaluator: VocabularyEvaluator; + let mockBackgroundProvider: LLMProvider; + let mockComplexityProvider: LLMProvider; + + beforeEach(() => { + vi.clearAllMocks(); + + // Create evaluator (providers will be mocked) + evaluator = new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: 'test-openai-key', + telemetry: false, // Disable telemetry for most tests + }); + + // Get references to the mocked providers + // @ts-expect-error Accessing private property for testing + mockBackgroundProvider = evaluator.backgroundKnowledgeProvider; + // @ts-expect-error Accessing private property for testing + mockComplexityProvider = evaluator.complexityProvider; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe('Successful Evaluation Flow', () => { + it('should successfully evaluate text through both stages', async () => { + const testText = 'The mitochondria is the powerhouse of the cell.'; + const testGrade = '5'; + + // Mock background knowledge response + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Students at grade 5 typically understand basic cell biology concepts.', + usage: { + inputTokens: 100, + outputTokens: 50, + }, + latencyMs: 500, + }); + + // Mock complexity evaluation response + vi.mocked(mockComplexityProvider.generateStructured).mockResolvedValue({ + data: { + complexity_score: 'moderately complex', + reasoning: 'The text uses grade-appropriate vocabulary.', + factors: ['Academic terminology', 'Clear structure'], + }, + model: 'gemini-2.5-pro', + usage: { + inputTokens: 200, + outputTokens: 100, + }, + latencyMs: 800, + }); + + // Execute evaluation + const result = await evaluator.evaluate(testText, testGrade); + + // Verify result structure + expect(result.score).toBe('moderately complex'); + expect(result.reasoning).toContain('grade-appropriate vocabulary'); + 
expect(result.metadata).toBeDefined(); + expect(result.metadata.model).toBe('gemini-2.5-pro + gpt-4o-2024-11-20'); + expect(result.metadata.processingTimeMs).toBeGreaterThan(0); + + // Verify both providers were called + expect(mockBackgroundProvider.generateText).toHaveBeenCalledTimes(1); + expect(mockComplexityProvider.generateStructured).toHaveBeenCalledTimes(1); + + // Verify background knowledge call + const bgCall = vi.mocked(mockBackgroundProvider.generateText).mock.calls[0]; + expect(bgCall[0][0].content).toContain(testText); + expect(bgCall[1]).toBe(0); // temperature = 0 + + // Verify complexity call includes background knowledge + const complexityCall = vi.mocked(mockComplexityProvider.generateStructured).mock.calls[0]; + expect(complexityCall[0].messages[1].content).toContain(testText); + expect(complexityCall[0].schema).toBeDefined(); + expect(complexityCall[0].temperature).toBe(0); + }); + +}); + + describe('Error Handling', () => { + it('should handle background knowledge API failure', async () => { + const testText = 'Test text here for API failure'; + const testGrade = '5'; + + // Mock background knowledge failure + vi.mocked(mockBackgroundProvider.generateText).mockRejectedValue( + new Error('API timeout') + ); + + // Should propagate the error + await expect(evaluator.evaluate(testText, testGrade)) + .rejects.toThrow('API timeout'); + + // Verify complexity provider was never called + expect(mockComplexityProvider.generateStructured).not.toHaveBeenCalled(); + }); + + it('should handle complexity evaluation API failure', async () => { + const testText = 'Test text here for complexity failure'; + const testGrade = '6'; + + // Mock successful background knowledge + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Background knowledge', + usage: { inputTokens: 100, outputTokens: 50 }, + latencyMs: 500, + }); + + // Mock complexity evaluation failure + vi.mocked(mockComplexityProvider.generateStructured).mockRejectedValue( + 
new Error('Schema validation failed') + ); + + // Should propagate the error + await expect(evaluator.evaluate(testText, testGrade)) + .rejects.toThrow('Schema validation failed'); + + // Verify background provider was called (stage 1 completed) + expect(mockBackgroundProvider.generateText).toHaveBeenCalledTimes(1); + }); + + }); + + describe('Response Structure', () => { + it('should return correct result structure', async () => { + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Background knowledge', + usage: { inputTokens: 100, outputTokens: 50 }, + latencyMs: 500, + }); + + vi.mocked(mockComplexityProvider.generateStructured).mockResolvedValue({ + data: { + complexity_score: 'moderately complex', + reasoning: 'Detailed reasoning here', + factors: ['Factor 1', 'Factor 2'], + }, + model: 'gemini-2.5-pro', + usage: { inputTokens: 200, outputTokens: 100 }, + latencyMs: 800, + }); + + const result = await evaluator.evaluate('Test text here', '5'); + + // Verify result structure + expect(result).toHaveProperty('score'); + expect(result).toHaveProperty('reasoning'); + expect(result).toHaveProperty('metadata'); + expect(result).toHaveProperty('_internal'); + + // Verify metadata structure + expect(result.metadata).toHaveProperty('promptVersion'); + expect(result.metadata).toHaveProperty('model'); + expect(result.metadata).toHaveProperty('timestamp'); + expect(result.metadata).toHaveProperty('processingTimeMs'); + + // Verify metadata values + expect(result.metadata.promptVersion).toBe('1.0'); + expect(result.metadata.model).toBe('gemini-2.5-pro + gpt-4o-2024-11-20'); + expect(result.metadata.timestamp).toBeInstanceOf(Date); + expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); // Mocked calls can be instant (0ms) + }); + + it('should include internal data', async () => { + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Background knowledge', + usage: { inputTokens: 100, outputTokens: 50 }, + 
latencyMs: 500, + }); + + const mockComplexityData = { + complexity_score: 'moderately complex', + reasoning: 'Detailed reasoning', + factors: ['Factor 1', 'Factor 2'], + analysis: 'Deep analysis', + }; + + vi.mocked(mockComplexityProvider.generateStructured).mockResolvedValue({ + data: mockComplexityData, + model: 'gemini-2.5-pro', + usage: { inputTokens: 200, outputTokens: 100 }, + latencyMs: 800, + }); + + const result = await evaluator.evaluate('Test text here', '5'); + + // Verify internal data is included + expect(result._internal).toEqual(mockComplexityData); + }); + }); +}); diff --git a/sdks/typescript/tests/unit/features/readability.test.ts b/sdks/typescript/tests/unit/features/readability.test.ts new file mode 100644 index 0000000..2c1eda6 --- /dev/null +++ b/sdks/typescript/tests/unit/features/readability.test.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from 'vitest'; +import { calculateFleschKincaidGrade } from '../../../src/features/readability.js'; + +describe('calculateFleschKincaidGrade', () => { + it('should calculate FK grade for simple text', () => { + const text = 'The cat sat on the mat. 
The dog ran away.'; + const grade = calculateFleschKincaidGrade(text); + + expect(grade).toBeLessThan(5); + expect(typeof grade).toBe('number'); + }); + + it('should handle empty text', () => { + const grade = calculateFleschKincaidGrade(''); + expect(grade).toBe(0); + }); + + it('should calculate higher grade for complex text', () => { + const simpleText = 'The cat sat.'; + const complexText = 'The mitochondria, known as the powerhouse of cellular respiration, facilitates biochemical processes.'; + + const simpleGrade = calculateFleschKincaidGrade(simpleText); + const complexGrade = calculateFleschKincaidGrade(complexText); + + expect(complexGrade).toBeGreaterThan(simpleGrade); + }); +}); diff --git a/sdks/typescript/tests/unit/telemetry/utils.test.ts b/sdks/typescript/tests/unit/telemetry/utils.test.ts new file mode 100644 index 0000000..24dd0e7 --- /dev/null +++ b/sdks/typescript/tests/unit/telemetry/utils.test.ts @@ -0,0 +1,88 @@ +import { describe, it, expect } from 'vitest'; +import { generateClientId, getSDKVersion } from '../../../src/telemetry/utils.js'; + +describe('Telemetry Utils', () => { + describe('generateClientId', () => { + it('should generate consistent hash for same keys', () => { + const id1 = generateClientId('key1', 'key2'); + const id2 = generateClientId('key1', 'key2'); + + expect(id1).toBe(id2); + }); + + it('should generate same hash regardless of key order', () => { + const id1 = generateClientId('key1', 'key2'); + const id2 = generateClientId('key2', 'key1'); + + expect(id1).toBe(id2); + }); + + it('should filter out undefined keys', () => { + const id1 = generateClientId('key1', undefined, 'key2'); + const id2 = generateClientId('key1', 'key2'); + + expect(id1).toBe(id2); + }); + + it('should generate different hashes for different keys', () => { + const id1 = generateClientId('key1', 'key2'); + const id2 = generateClientId('key1', 'key3'); + + expect(id1).not.toBe(id2); + }); + + it('should return 64-character hex string', () => { + 
const id = generateClientId('key1', 'key2'); + + expect(id).toMatch(/^[a-f0-9]{64}$/); + }); + + it('should handle single key', () => { + const id = generateClientId('single-key'); + + expect(id).toMatch(/^[a-f0-9]{64}$/); + }); + + it('should generate random ID when no keys provided', () => { + const id1 = generateClientId(); + const id2 = generateClientId(); + + // Random IDs should be different + expect(id1).not.toBe(id2); + expect(id1).toMatch(/^[a-f0-9]{64}$/); + expect(id2).toMatch(/^[a-f0-9]{64}$/); + }); + + it('should generate random ID when all keys are undefined', () => { + const id1 = generateClientId(undefined, undefined); + const id2 = generateClientId(undefined, undefined); + + // Random IDs should be different + expect(id1).not.toBe(id2); + }); + + it('should prevent collision with delimiter (theoretical)', () => { + // Without delimiter: ["ab", "c"] and ["a", "bc"] would both hash "abc" + // With delimiter: ["ab", "c"] → "ab|c" and ["a", "bc"] → "a|bc" + const id1 = generateClientId('ab', 'c'); + const id2 = generateClientId('a', 'bc'); + + expect(id1).not.toBe(id2); + }); + }); + + describe('getSDKVersion', () => { + it('should return a valid version string', () => { + const version = getSDKVersion(); + + expect(version).toMatch(/^\d+\.\d+\.\d+$/); + }); + + it('should return same version on repeated calls (cached)', () => { + const version1 = getSDKVersion(); + const version2 = getSDKVersion(); + + expect(version1).toBe(version2); + }); + }); +}); diff --git a/sdks/typescript/tests/utils/index.ts b/sdks/typescript/tests/utils/index.ts new file mode 100644 index 0000000..e01a630 --- /dev/null +++ b/sdks/typescript/tests/utils/index.ts @@ -0,0 +1,18 @@ +/** + * Test utilities for evaluator testing + * + * @example + * ```typescript + * import { runTestWithRetry, runEvaluatorTest } from '../utils'; + * ``` + */ + +export { + runTestWithRetry, + runEvaluatorTest, + type TestAttempt, + type TestResult, + type RetryTestOptions, + type BaseTestCase, + 
type EvaluatorTestConfig, +} from './test-helpers.js'; diff --git a/sdks/typescript/tests/utils/test-helpers.ts b/sdks/typescript/tests/utils/test-helpers.ts new file mode 100644 index 0000000..91a6be7 --- /dev/null +++ b/sdks/typescript/tests/utils/test-helpers.ts @@ -0,0 +1,254 @@ +/** + * Streamlined test utilities for evaluator testing + */ +
+export interface TestAttempt<T> { + attempt: number; + result: T; + matched: boolean; +} + +export interface TestResult<T> { + matched: boolean; + matchedOnAttempt?: number; + matchType?: 'expected' | 'acceptable'; // How the match occurred + totalAttempts: number; + attempts: TestAttempt<T>[]; + allResults: T[]; + logs: string[]; // Buffered log messages for atomic printing +} + +export interface RetryTestOptions<TInput, TOutput> { + /** Function that executes the test and returns the actual output */ + testFn: (input: TInput) => Promise<TOutput>; + + /** Input to pass to the test function */ + input: TInput; + + /** Expected output value */ + expected: TOutput; + + /** Maximum number of attempts (default: 3) */ + maxAttempts?: number; + + /** Custom comparison function (default: strict equality) */ + compareFn?: (actual: TOutput, expected: TOutput) => boolean; + + /** Optional callback after each attempt */ + onAttempt?: (attempt: number, result: TOutput, matched: boolean) => void; +} + +/** + * Default comparison function (case-insensitive string comparison) + */ +function defaultCompareFn<T>(actual: T, expected: T): boolean { + if (typeof actual === 'string' && typeof expected === 'string') { + return actual.toLowerCase() === expected.toLowerCase(); + } + return actual === expected; +} + +/** + * Runs a test function multiple times with retry logic and short-circuiting.
+ */ +export async function runTestWithRetry<TInput, TOutput>( + options: RetryTestOptions<TInput, TOutput> +): Promise<TestResult<TOutput>> { + const { + testFn, + input, + expected, + maxAttempts = 3, + compareFn = defaultCompareFn, + onAttempt, + } = options; + + const attempts: TestAttempt<TOutput>[] = []; + let matched = false; + let matchedOnAttempt: number | undefined; + + for (let attemptNum = 1; attemptNum <= maxAttempts; attemptNum++) { + const result = await testFn(input); + const isMatch = compareFn(result, expected); + + attempts.push({ + attempt: attemptNum, + result, + matched: isMatch, + }); + + if (onAttempt) { + onAttempt(attemptNum, result, isMatch); + } + + // Short-circuit on match + if (isMatch) { + matched = true; + matchedOnAttempt = attemptNum; + break; + } + } + + return { + matched, + matchedOnAttempt, + totalAttempts: attempts.length, + attempts, + allResults: attempts.map(a => a.result), + logs: [], // No logs for this simple retry function + }; +} + +/** + * Generic test case structure + * All evaluator-specific test cases extend this + */ +export interface BaseTestCase { + id: string; + text: string; + grade?: string; // Optional: some evaluators need it, some don't + expected: string; // Expected output value (checked on each attempt) + acceptable?: string[]; // Acceptable adjacent values (checked if no expected match after all retries) +} + +/** + * Configuration for running evaluator tests + */ +export interface EvaluatorTestConfig<TEvaluator> { + /** The evaluator instance to test */ + evaluator: TEvaluator; + + /** Function to extract the result to compare from evaluation output */ + extractResult: (evalResult: any) => string; + + /** Maximum retry attempts (default: 3) */ + maxAttempts?: number; +} + +/** + * Generic evaluator test runner + * Works for any evaluator with retry logic + * + * @example + * ```typescript + * // Vocabulary evaluator + * const result = await runEvaluatorTest( + * { + * id: 'V1', + * text: 'Sample text...', + * grade: '3', + * expected: 'very complex' + * }, + * { + * 
evaluator: vocabularyEvaluator, + * extractResult: (r) => r.score + * } + * ); + * + * // Grade level evaluator + * const result = await runEvaluatorTest( + * { + * id: 'GLA1', + * text: 'Sample text...', + * expected: '6-8' + * }, + * { + * evaluator: gradeLevelEvaluator, + * extractResult: (r) => r.score.grade + * } + * ); + * ``` + */ +export async function runEvaluatorTest<TEvaluator extends { evaluate: (text: string, grade?: string) => Promise<any> }>( + testCase: BaseTestCase, + config: EvaluatorTestConfig<TEvaluator> +): Promise<TestResult<string>> { + const { evaluator, extractResult, maxAttempts = 3 } = config; + const compareFn = defaultCompareFn; + + // Buffer logs to print atomically at the end (prevents interleaving in parallel tests) + const logBuffer: string[] = []; + + // Log test criteria upfront + logBuffer.push(`\n Expected: "${testCase.expected}"`); + if (testCase.acceptable && testCase.acceptable.length > 0) { + logBuffer.push(` Acceptable: [${testCase.acceptable.map(v => `"${v}"`).join(', ')}]`); + } + logBuffer.push(''); + + const attempts: TestAttempt<string>[] = []; + let matched = false; + let matchedOnAttempt: number | undefined; + let matchType: 'expected' | 'acceptable' | undefined; + + // Phase 1: Try to match expected value (short-circuit on match) + for (let attemptNum = 1; attemptNum <= maxAttempts; attemptNum++) { + const result = testCase.grade + ? await evaluator.evaluate(testCase.text, testCase.grade) + : await evaluator.evaluate(testCase.text); + + const actualValue = extractResult(result); + const isExpectedMatch = compareFn(actualValue, testCase.expected); + + attempts.push({ + attempt: attemptNum, + result: actualValue, + matched: isExpectedMatch, + }); + + logBuffer.push(` Attempt ${attemptNum}: "${actualValue}" ${isExpectedMatch ? 
'✓ EXPECTED MATCH' : '✗'}`); + + // Short-circuit on expected match + if (isExpectedMatch) { + matched = true; + matchedOnAttempt = attemptNum; + matchType = 'expected'; + break; + } + } + + // Phase 2: If no expected match, check if any result is in acceptable range + // Only check acceptable values if they are defined and non-empty + if (!matched && testCase.acceptable?.length) { + logBuffer.push('\n No expected match. Checking acceptable values...'); + + for (let i = 0; i < attempts.length; i++) { + const attemptResult = attempts[i].result; + const isAcceptable = testCase.acceptable.some(acceptable => + compareFn(attemptResult, acceptable) + ); + + if (isAcceptable) { + matched = true; + matchedOnAttempt = i + 1; + matchType = 'acceptable'; + logBuffer.push(` ✓ ACCEPTABLE MATCH: Attempt ${matchedOnAttempt} result "${attemptResult}" is in acceptable range`); + break; + } + } + + if (!matched) { + logBuffer.push(` ✗ NO MATCH: None of the attempts matched expected or acceptable values`); + } + } + + // Summary logging + logBuffer.push('\n Summary:'); + logBuffer.push(` All Results: [${attempts.map(a => `"${a.result}"`).join(', ')}]`); + if (matched) { + logBuffer.push(` Status: ✓ PASS (matched ${matchType} on attempt ${matchedOnAttempt})`); + } else { + logBuffer.push(` Status: ✗ FAIL (no match after ${attempts.length} attempts)`); + } + + // Return logs for atomic printing by the caller + return { + matched, + matchedOnAttempt, + matchType, + totalAttempts: attempts.length, + attempts, + allResults: attempts.map(a => a.result), + logs: logBuffer, + }; +} \ No newline at end of file From 3d01f047e7fd9b9602309098c04413afd87d3f53 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Thu, 19 Feb 2026 19:22:39 -0800 Subject: [PATCH 02/10] rebase and update --- sdks/typescript/src/evaluators/vocabulary.ts | 2 -- .../vocabulary/background-knowledge.ts | 8 +------ .../src/prompts/vocabulary/system.ts | 15 ++----------- .../typescript/src/prompts/vocabulary/user.ts | 
15 ++----------- sdks/typescript/src/utils/prompts.ts | 21 ------------------- .../vocabulary.integration.test.ts | 1 - 6 files changed, 5 insertions(+), 57 deletions(-) delete mode 100644 sdks/typescript/src/utils/prompts.ts diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts index b9270bd..3fd0f5a 100644 --- a/sdks/typescript/src/evaluators/vocabulary.ts +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -61,7 +61,6 @@ export interface VocabularyEvaluatorConfig extends BaseEvaluatorConfig { export class VocabularyEvaluator extends BaseEvaluator { private complexityProvider: LLMProvider; private backgroundKnowledgeProvider: LLMProvider; - private evaluatorConfig: VocabularyEvaluatorConfig; constructor(config: VocabularyEvaluatorConfig) { // Call base constructor for common setup (telemetry, etc.) @@ -76,7 +75,6 @@ export class VocabularyEvaluator extends BaseEvaluator { throw new ValidationError('OpenAI API key is required. Pass openaiApiKey in config.'); } - this.evaluatorConfig = config; // Create Google Gemini provider for complexity evaluation this.complexityProvider = createProvider({ diff --git a/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts index 9f108cc..555f949 100644 --- a/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts +++ b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts @@ -1,10 +1,4 @@ -import { loadPrompt } from '../../utils/prompts'; - -/** - * Background knowledge prompt template - * Loaded from: prompts/vocabulary/background-knowledge.txt - */ -const BACKGROUND_KNOWLEDGE_TEMPLATE = loadPrompt('vocabulary/background-knowledge.txt'); +import BACKGROUND_KNOWLEDGE_TEMPLATE from '../../../../../evals/prompts/vocabulary/background-knowledge.txt'; /** * Generate the background knowledge prompt for a given text and grade level diff --git a/sdks/typescript/src/prompts/vocabulary/system.ts 
b/sdks/typescript/src/prompts/vocabulary/system.ts index 634dbad..f1a0970 100644 --- a/sdks/typescript/src/prompts/vocabulary/system.ts +++ b/sdks/typescript/src/prompts/vocabulary/system.ts @@ -1,16 +1,5 @@ -import { loadPrompt } from '../../utils/prompts'; - -/** - * System prompt for vocabulary complexity evaluation (Grades 3-4) - * Loaded from: prompts/vocabulary/grades-3-4-system.txt - */ -const SYSTEM_PROMPT_GRADES_3_4 = loadPrompt('vocabulary/grades-3-4-system.txt'); - -/** - * System prompt for vocabulary complexity evaluation (Other grades: K-2, 5-12) - * Loaded from: prompts/vocabulary/other-grades-system.txt - */ -const SYSTEM_PROMPT_OTHER_GRADES = loadPrompt('vocabulary/other-grades-system.txt'); +import SYSTEM_PROMPT_GRADES_3_4 from '../../../../../evals/prompts/vocabulary/grades-3-4-system.txt'; +import SYSTEM_PROMPT_OTHER_GRADES from '../../../../../evals/prompts/vocabulary/other-grades-system.txt'; /** * Get the appropriate system prompt based on grade level diff --git a/sdks/typescript/src/prompts/vocabulary/user.ts b/sdks/typescript/src/prompts/vocabulary/user.ts index 6c06e87..55dcfee 100644 --- a/sdks/typescript/src/prompts/vocabulary/user.ts +++ b/sdks/typescript/src/prompts/vocabulary/user.ts @@ -1,16 +1,5 @@ -import { loadPrompt } from '../../utils/prompts'; - -/** - * User prompt template for vocabulary complexity evaluation (Grades 3-4) - * Loaded from: prompts/vocabulary/grades-3-4-user.txt - */ -const USER_PROMPT_TEMPLATE_GRADES_3_4 = loadPrompt('vocabulary/grades-3-4-user.txt'); - -/** - * User prompt template for vocabulary complexity evaluation (Other grades: K-2, 5-12) - * Loaded from: prompts/vocabulary/other-grades-user.txt - */ -const USER_PROMPT_TEMPLATE_OTHER_GRADES = loadPrompt('vocabulary/other-grades-user.txt'); +import USER_PROMPT_TEMPLATE_GRADES_3_4 from '../../../../../evals/prompts/vocabulary/grades-3-4-user.txt'; +import USER_PROMPT_TEMPLATE_OTHER_GRADES from 
'../../../../../evals/prompts/vocabulary/other-grades-user.txt'; /** * Generate the user prompt for vocabulary complexity evaluation diff --git a/sdks/typescript/src/utils/prompts.ts b/sdks/typescript/src/utils/prompts.ts deleted file mode 100644 index 757f2e9..0000000 --- a/sdks/typescript/src/utils/prompts.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { readFileSync } from 'fs'; -import { fileURLToPath } from 'url'; -import { dirname, join } from 'path'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -// Path to prompts directory -// When in src/utils/, go up one level. When bundled in dist/, stay in current dir. -const PROMPTS_DIR = __dirname.endsWith('utils') - ? join(dirname(__dirname), 'prompts') - : join(__dirname, 'prompts'); - -/** - * Load a prompt file from the prompts directory - * @param relativePath - Path relative to prompts directory (e.g., 'vocabulary/grades-3-4-system.txt') - * @returns The prompt file contents as a string - */ -export function loadPrompt(relativePath: string): string { - return readFileSync(join(PROMPTS_DIR, relativePath), 'utf-8'); -} diff --git a/sdks/typescript/tests/integration/vocabulary.integration.test.ts b/sdks/typescript/tests/integration/vocabulary.integration.test.ts index 3bfd478..889165e 100644 --- a/sdks/typescript/tests/integration/vocabulary.integration.test.ts +++ b/sdks/typescript/tests/integration/vocabulary.integration.test.ts @@ -97,7 +97,6 @@ describeIntegration.concurrent('Vocabulary Evaluator - Comprehensive Test Suite' evaluator = new VocabularyEvaluator({ googleApiKey: process.env.GOOGLE_API_KEY!, openaiApiKey: process.env.OPENAI_API_KEY!, - retry: false, // We handle retries in the test logic }); console.log('\n' + '='.repeat(80)); From fb820ab331b6b7e8ed9ff16abfb7d27d0d0cb569 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Thu, 19 Feb 2026 20:04:59 -0800 Subject: [PATCH 03/10] feedback --- sdks/typescript/src/evaluators/base.ts | 10 ++-- 
sdks/typescript/src/evaluators/vocabulary.ts | 5 +- sdks/typescript/src/features/index.ts | 1 + sdks/typescript/src/features/readability.ts | 50 ++++++++----------- sdks/typescript/src/index.ts | 3 +- .../vocabulary/background-knowledge.ts | 4 +- .../src/prompts/vocabulary/system.ts | 4 +- .../typescript/src/prompts/vocabulary/user.ts | 8 +-- sdks/typescript/src/telemetry/utils.ts | 4 +- 9 files changed, 42 insertions(+), 47 deletions(-) diff --git a/sdks/typescript/src/evaluators/base.ts b/sdks/typescript/src/evaluators/base.ts index 65ea428..cbca438 100644 --- a/sdks/typescript/src/evaluators/base.ts +++ b/sdks/typescript/src/evaluators/base.ts @@ -162,7 +162,7 @@ export abstract class BaseEvaluator { * Validate text meets requirements * Default implementation - can be overridden by concrete evaluators * - * @throws {Error} If text is invalid + * @throws {ValidationError} If text is invalid */ protected validateText(text: string): void { this.logger.debug('Validating text input', { @@ -199,15 +199,15 @@ export abstract class BaseEvaluator { } // Check maximum length - if (text.length > VALIDATION_LIMITS.MAX_TEXT_LENGTH) { + if (trimmedText.length > VALIDATION_LIMITS.MAX_TEXT_LENGTH) { const error = new ValidationError( - `Text is too long. Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${text.length.toLocaleString()} characters` + `Text is too long. 
Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${trimmedText.length.toLocaleString()} characters` ); this.logger.error('Text validation failed: too long', { evaluator: this.getEvaluatorType(), error, maxLength: VALIDATION_LIMITS.MAX_TEXT_LENGTH, - actualLength: text.length, + actualLength: trimmedText.length, }); throw error; } @@ -219,7 +219,7 @@ export abstract class BaseEvaluator { * * @param grade - Grade level to validate * @param validGrades - Set of valid grades for this evaluator - * @throws {Error} If grade is invalid + * @throws {ValidationError} If grade is invalid */ protected validateGrade(grade: string, validGrades: Set): void { this.logger.debug('Validating grade input', { diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts index 3fd0f5a..4e44801 100644 --- a/sdks/typescript/src/evaluators/vocabulary.ts +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -75,7 +75,6 @@ export class VocabularyEvaluator extends BaseEvaluator { throw new ValidationError('OpenAI API key is required. 
Pass openaiApiKey in config.'); } - // Create Google Gemini provider for complexity evaluation this.complexityProvider = createProvider({ type: 'google', @@ -102,9 +101,9 @@ export class VocabularyEvaluator extends BaseEvaluator { * Evaluate vocabulary complexity for a given text and grade level * * @param text - The text to evaluate - * @param grade - The target grade level (K-12) + * @param grade - The target grade level (3-12) * @returns Evaluation result with complexity score and detailed analysis - * @throws {Error} If text is empty or grade is invalid + * @throws {ValidationError} If text is empty or grade is invalid */ async evaluate( text: string, diff --git a/sdks/typescript/src/features/index.ts b/sdks/typescript/src/features/index.ts index e99c290..354830e 100644 --- a/sdks/typescript/src/features/index.ts +++ b/sdks/typescript/src/features/index.ts @@ -1,4 +1,5 @@ export { calculateFleschKincaidGrade, calculateReadabilityMetrics, + type ReadabilityMetrics, } from './readability.js'; diff --git a/sdks/typescript/src/features/readability.ts b/sdks/typescript/src/features/readability.ts index f6d8350..a744cb5 100644 --- a/sdks/typescript/src/features/readability.ts +++ b/sdks/typescript/src/features/readability.ts @@ -6,50 +6,44 @@ import { syllable } from 'syllable'; * Equivalent to Python's textstat.flesch_kincaid_grade() */ export function calculateFleschKincaidGrade(text: string): number { - const doc = nlp(text); - - const sentences = doc.sentences().length; - const words = doc.terms().length; - - if (sentences === 0 || words === 0) { - return 0; - } - - // Count syllables for all words - const allWords = doc.terms().out('array'); - const totalSyllables = allWords.reduce((sum: number, word: string) => { - return sum + syllable(word); - }, 0); - - // Flesch-Kincaid formula - const avgWordsPerSentence = words / sentences; - const avgSyllablesPerWord = totalSyllables / words; - - const fkGrade = 0.39 * avgWordsPerSentence + 11.8 * avgSyllablesPerWord - 
15.59; - - return Math.round(fkGrade * 100) / 100; // Round to 2 decimal places + return calculateReadabilityMetrics(text).fleschKincaidGrade; } /** * Additional readability metrics */ -export function calculateReadabilityMetrics(text: string) { +export interface ReadabilityMetrics { + sentenceCount: number; + wordCount: number; + characterCount: number; + syllableCount: number; + avgWordsPerSentence: number; + avgSyllablesPerWord: number; + fleschKincaidGrade: number; +} + +export function calculateReadabilityMetrics(text: string): ReadabilityMetrics { const doc = nlp(text); const sentences = doc.sentences().length; - const words = doc.terms().length; + const terms = doc.terms(); + const words = terms.length; const characters = text.replace(/\s/g, '').length; - const allWords = doc.terms().out('array'); + const allWords = terms.out('array'); const totalSyllables = allWords.reduce((sum: number, word: string) => sum + syllable(word), 0); + const avgWordsPerSentence = sentences > 0 ? words / sentences : 0; + const avgSyllablesPerWord = words > 0 ? totalSyllables / words : 0; + const fkGrade = 0.39 * avgWordsPerSentence + 11.8 * avgSyllablesPerWord - 15.59; + return { sentenceCount: sentences, wordCount: words, characterCount: characters, syllableCount: totalSyllables, - avgWordsPerSentence: sentences > 0 ? words / sentences : 0, - avgSyllablesPerWord: words > 0 ? 
totalSyllables / words : 0, - fleschKincaidGrade: calculateFleschKincaidGrade(text), + avgWordsPerSentence, + avgSyllablesPerWord, + fleschKincaidGrade: Math.round(Math.max(0, fkGrade) * 100) / 100, }; } diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index 22d069b..5c72f20 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -7,7 +7,7 @@ export type { EvaluationError, } from './schemas/index.js'; -export { ComplexityLevel, GradeLevel, } from './schemas/index.js'; +export { ComplexityLevel, GradeLevel } from './schemas/index.js'; // Error types export { @@ -58,4 +58,5 @@ export { export { calculateFleschKincaidGrade, calculateReadabilityMetrics, + type ReadabilityMetrics, } from './features/index.js'; diff --git a/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts index 555f949..52309f1 100644 --- a/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts +++ b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts @@ -5,6 +5,6 @@ import BACKGROUND_KNOWLEDGE_TEMPLATE from '../../../../../evals/prompts/vocabula */ export function getBackgroundKnowledgePrompt(text: string, grade: string): string { return BACKGROUND_KNOWLEDGE_TEMPLATE - .replace('{grade}', grade) - .replace('{text}', text); + .replaceAll('{grade}', grade) + .replaceAll('{text}', text); } diff --git a/sdks/typescript/src/prompts/vocabulary/system.ts b/sdks/typescript/src/prompts/vocabulary/system.ts index f1a0970..81dde16 100644 --- a/sdks/typescript/src/prompts/vocabulary/system.ts +++ b/sdks/typescript/src/prompts/vocabulary/system.ts @@ -3,7 +3,7 @@ import SYSTEM_PROMPT_OTHER_GRADES from '../../../../../evals/prompts/vocabulary/ /** * Get the appropriate system prompt based on grade level - * @param grade - The target grade level (K-12) + * @param grade - The target grade level (3-12) * @returns The system prompt for the grade level */ export function 
getSystemPrompt(grade: string): string { @@ -12,6 +12,6 @@ export function getSystemPrompt(grade: string): string { return SYSTEM_PROMPT_GRADES_3_4; } - // All other grades (K, 1, 2, 5-12) use OTHER_GRADES prompt + // All other grades (5-12) use OTHER_GRADES prompt return SYSTEM_PROMPT_OTHER_GRADES; } diff --git a/sdks/typescript/src/prompts/vocabulary/user.ts b/sdks/typescript/src/prompts/vocabulary/user.ts index 55dcfee..75e56b0 100644 --- a/sdks/typescript/src/prompts/vocabulary/user.ts +++ b/sdks/typescript/src/prompts/vocabulary/user.ts @@ -21,8 +21,8 @@ export function getUserPrompt( : USER_PROMPT_TEMPLATE_OTHER_GRADES; return template - .replace('{student_grade_level}', studentGradeLevel) - .replace('{student_background_knowledge}', studentBackgroundKnowledge) - .replace('{fk_level}', fkLevel.toString()) - .replace('{text}', text); + .replaceAll('{student_grade_level}', studentGradeLevel) + .replaceAll('{student_background_knowledge}', studentBackgroundKnowledge) + .replaceAll('{fk_level}', fkLevel.toString()) + .replaceAll('{text}', text); } diff --git a/sdks/typescript/src/telemetry/utils.ts b/sdks/typescript/src/telemetry/utils.ts index 4a912b1..81fc945 100644 --- a/sdks/typescript/src/telemetry/utils.ts +++ b/sdks/typescript/src/telemetry/utils.ts @@ -1,4 +1,4 @@ -import { createHash } from 'crypto'; +import { createHash, randomBytes } from 'crypto'; import { readFileSync } from 'fs'; import { join, dirname } from 'path'; import { fileURLToPath } from 'url'; @@ -22,7 +22,7 @@ export function generateClientId(...apiKeys: (string | undefined)[]): string { // If no keys provided, generate random ID for this session if (keys.length === 0) { return createHash('sha256') - .update(Math.random().toString()) + .update(randomBytes(16)) .digest('hex'); } From e88a60e4800202b8ceb4eab6191bf325e48026ab Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Thu, 19 Feb 2026 20:45:53 -0800 Subject: [PATCH 04/10] fix complexity model for grades 5-12 --- 
sdks/typescript/src/evaluators/vocabulary.ts | 35 ++++++++++++++----- .../tests/unit/evaluators/vocabulary.test.ts | 7 ++-- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts index 4e44801..28b2dc5 100644 --- a/sdks/typescript/src/evaluators/vocabulary.ts +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -59,7 +59,8 @@ export interface VocabularyEvaluatorConfig extends BaseEvaluatorConfig { * ``` */ export class VocabularyEvaluator extends BaseEvaluator { - private complexityProvider: LLMProvider; + private grades34ComplexityProvider: LLMProvider; + private otherGradesComplexityProvider: LLMProvider; private backgroundKnowledgeProvider: LLMProvider; constructor(config: VocabularyEvaluatorConfig) { @@ -75,14 +76,22 @@ export class VocabularyEvaluator extends BaseEvaluator { throw new ValidationError('OpenAI API key is required. Pass openaiApiKey in config.'); } - // Create Google Gemini provider for complexity evaluation - this.complexityProvider = createProvider({ + // Create Google Gemini provider for complexity evaluation (grades 3-4) + this.grades34ComplexityProvider = createProvider({ type: 'google', model: 'gemini-2.5-pro', apiKey: config.googleApiKey, maxRetries: this.config.maxRetries, }); + // Create OpenAI GPT-4.1 provider for complexity evaluation (grades 5-12) + this.otherGradesComplexityProvider = createProvider({ + type: 'openai', + model: 'gpt-4.1-2025-04-14', + apiKey: config.openaiApiKey, + maxRetries: this.config.maxRetries, + }); + // Create OpenAI GPT-4o provider for background knowledge generation this.backgroundKnowledgeProvider = createProvider({ type: 'openai', @@ -122,6 +131,9 @@ export class VocabularyEvaluator extends BaseEvaluator { const startTime = Date.now(); const stageDetails: StageDetail[] = []; + const complexityProviderName = (grade === '3' || grade === '4') + ? 
'google:gemini-2.5-pro' + : 'openai:gpt-4.1-2025-04-14'; try { this.logger.debug('Stage 1: Generating background knowledge', { @@ -158,7 +170,7 @@ export class VocabularyEvaluator extends BaseEvaluator { stageDetails.push({ stage: 'complexity_evaluation', - provider: 'google:gemini-2.5-pro', + provider: complexityProviderName, latency_ms: complexityResponse.latencyMs, // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts // We set -1 to indicate "unknown" (we may have retried, but can't track it) @@ -188,7 +200,7 @@ export class VocabularyEvaluator extends BaseEvaluator { reasoning: complexityResponse.data.reasoning, metadata: { promptVersion: '1.0', - model: 'gemini-2.5-pro + gpt-4o-2024-11-20', + model: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, timestamp: new Date(), processingTimeMs: latencyMs, }, @@ -201,7 +213,7 @@ export class VocabularyEvaluator extends BaseEvaluator { latencyMs, textLength: text.length, grade, - provider: 'google:gemini-2.5-pro+openai:gpt-4o', + provider: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, retryAttempts: totalRetries, tokenUsage: totalTokenUsage, metadata: { @@ -251,7 +263,7 @@ export class VocabularyEvaluator extends BaseEvaluator { latencyMs, textLength: text.length, grade, - provider: 'google:gemini-2.5-pro+openai:gpt-4o', + provider: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, retryAttempts: totalRetries, tokenUsage: totalTokenUsage, errorCode: error instanceof Error ? error.name : 'UnknownError', @@ -301,7 +313,8 @@ export class VocabularyEvaluator extends BaseEvaluator { /** * Stage 2: Evaluate vocabulary complexity * - * Uses the Qual Text Complexity rubric (SAP) and background knowledge to evaluate vocabulary complexity + * Uses the Qual Text Complexity rubric (SAP) and background knowledge to evaluate vocabulary complexity. + * Grades 3-4 use Gemini 2.5 Pro; grades 5-12 use GPT-4.1. 
*/ private async evaluateComplexity( text: string, @@ -312,7 +325,11 @@ export class VocabularyEvaluator extends BaseEvaluator { const systemPrompt = getSystemPrompt(grade); const userPrompt = getUserPrompt(text, grade, backgroundKnowledge, fkLevel); - const response = await this.complexityProvider.generateStructured({ + const provider = (grade === '3' || grade === '4') + ? this.grades34ComplexityProvider + : this.otherGradesComplexityProvider; + + const response = await provider.generateStructured({ messages: [ { role: 'system', content: systemPrompt }, { role: 'user', content: userPrompt }, diff --git a/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts index 6b75336..9cf2a1c 100644 --- a/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts +++ b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts @@ -70,7 +70,8 @@ describe('VocabularyEvaluator - Evaluation Flow', () => { // @ts-expect-error Accessing private property for testing mockBackgroundProvider = evaluator.backgroundKnowledgeProvider; // @ts-expect-error Accessing private property for testing - mockComplexityProvider = evaluator.complexityProvider; + // Tests use grade 5+, which routes to otherGradesComplexityProvider (GPT-4.1) + mockComplexityProvider = evaluator.otherGradesComplexityProvider; }); afterEach(() => { @@ -114,7 +115,7 @@ describe('VocabularyEvaluator - Evaluation Flow', () => { expect(result.score).toBe('moderately complex'); expect(result.reasoning).toContain('grade-appropriate vocabulary'); expect(result.metadata).toBeDefined(); - expect(result.metadata.model).toBe('gemini-2.5-pro + gpt-4o-2024-11-20'); + expect(result.metadata.model).toBe('openai:gpt-4o-2024-11-20 + openai:gpt-4.1-2025-04-14'); expect(result.metadata.processingTimeMs).toBeGreaterThan(0); // Verify both providers were called @@ -214,7 +215,7 @@ describe('VocabularyEvaluator - Evaluation Flow', () => { // Verify metadata values 
expect(result.metadata.promptVersion).toBe('1.0'); - expect(result.metadata.model).toBe('gemini-2.5-pro + gpt-4o-2024-11-20'); + expect(result.metadata.model).toBe('openai:gpt-4o-2024-11-20 + openai:gpt-4.1-2025-04-14'); expect(result.metadata.timestamp).toBeInstanceOf(Date); expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); // Mocked calls can be instant (0ms) }); From b60aaded4dea4cd6c895785df49130073fb5d89f Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 15:32:31 -0800 Subject: [PATCH 05/10] fix env loading --- .../tests/integration/vocabulary.integration.test.ts | 4 ---- sdks/typescript/vitest.config.ts | 6 ++++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sdks/typescript/tests/integration/vocabulary.integration.test.ts b/sdks/typescript/tests/integration/vocabulary.integration.test.ts index 889165e..49b4662 100644 --- a/sdks/typescript/tests/integration/vocabulary.integration.test.ts +++ b/sdks/typescript/tests/integration/vocabulary.integration.test.ts @@ -1,14 +1,10 @@ import { describe, it, expect, beforeAll } from 'vitest'; import { VocabularyEvaluator } from '../../src/evaluators/vocabulary.js'; -import { config } from 'dotenv'; import { runEvaluatorTest, type BaseTestCase, } from '../utils/index.js'; -// Load .env file for testing convenience -config(); - /** * Vocabulary Evaluator Integration Tests * diff --git a/sdks/typescript/vitest.config.ts b/sdks/typescript/vitest.config.ts index 9eb9a49..06f43e3 100644 --- a/sdks/typescript/vitest.config.ts +++ b/sdks/typescript/vitest.config.ts @@ -1,4 +1,5 @@ import { defineConfig } from 'vitest/config'; +import { loadEnv } from 'vite'; import { readFileSync } from 'fs'; import { resolve, dirname } from 'path'; import type { Plugin } from 'vite'; @@ -18,12 +19,13 @@ function txtPlugin(): Plugin { }; } -export default defineConfig({ +export default defineConfig(({ mode }) => ({ plugins: [txtPlugin()], test: { globals: true, environment: 'node', 
passWithNoTests: true, + env: loadEnv(mode, process.cwd(), ''), coverage: { provider: 'v8', reporter: ['text', 'json', 'html'], @@ -36,4 +38,4 @@ export default defineConfig({ ], }, }, -}); +})); From 4951f093f4aeeed7de4ba8c475d903c7c5cb8321 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 15:58:55 -0800 Subject: [PATCH 06/10] disabled text collection by default --- sdks/typescript/docs/telemetry.md | 6 +++--- sdks/typescript/src/evaluators/base.ts | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sdks/typescript/docs/telemetry.md b/sdks/typescript/docs/telemetry.md index f9f810b..0f6e6e3 100644 --- a/sdks/typescript/docs/telemetry.md +++ b/sdks/typescript/docs/telemetry.md @@ -75,7 +75,7 @@ We **never** collect your API keys (only a hashed identifier). | `provider` | LLM provider(s) used (e.g., "openai:gpt-4o", "google:gemini-2.5-pro+openai:gpt-4o") | | `retry_attempts` | Number of retries (-1 means unknown, see note below) | | `token_usage` | Total tokens consumed (input, output, total) | -| `input_text` | The text being evaluated (omitted if `recordInputs: false`) | +| `input_text` | The text being evaluated (only included if `recordInputs: true`) | | `metadata.stage_details` | Per-stage breakdown for multi-stage evaluators (optional) | **Note on `retry_attempts`:** Currently set to `-1` (unknown) as actual retry counts are not yet tracked. This field is included for backward compatibility as we plan to add this as a future enhancement. 
@@ -114,7 +114,7 @@ const evaluator = new VocabularyEvaluator({ }); ``` -### Disable Input Text Collection +### Enable Input Text Collection ```typescript const evaluator = new VocabularyEvaluator({ @@ -122,7 +122,7 @@ const evaluator = new VocabularyEvaluator({ openaiApiKey: process.env.OPENAI_API_KEY!, telemetry: { enabled: true, - recordInputs: false, // Only metrics, no text content + recordInputs: true, // Also send input text with telemetry }, }); ``` diff --git a/sdks/typescript/src/evaluators/base.ts b/sdks/typescript/src/evaluators/base.ts index cbca438..bf5fd53 100644 --- a/sdks/typescript/src/evaluators/base.ts +++ b/sdks/typescript/src/evaluators/base.ts @@ -25,7 +25,7 @@ export interface TelemetryOptions { /** Enable telemetry (default: true) */ enabled?: boolean; - /** Record input text in telemetry (default: true) */ + /** Record input text in telemetry (default: false) */ recordInputs?: boolean; } @@ -55,7 +55,7 @@ export interface BaseEvaluatorConfig { * Telemetry configuration (default: all enabled) * * Can be: - * - `true`: Enable with defaults (recordInputs: true) + * - `true`: Enable with defaults (recordInputs: false) * - `false`: Disable completely * - `TelemetryOptions`: Granular control */ @@ -141,14 +141,14 @@ export abstract class BaseEvaluator { if (telemetry === true || telemetry === undefined) { return { enabled: true, - recordInputs: true, + recordInputs: false, }; } // Handle granular config object return { enabled: telemetry.enabled ?? true, - recordInputs: telemetry.recordInputs ?? true, + recordInputs: telemetry.recordInputs ?? 
false, }; } From d933db9493c372bb871d0504f48a4d4e9bce6293 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 21:40:11 -0800 Subject: [PATCH 07/10] re: feedback --- evals/prompts/vocabulary/grades-3-4-user.txt | 2 - .../prompts/vocabulary/other-grades-user.txt | 2 - sdks/typescript/README.md | 12 +- sdks/typescript/docs/telemetry.md | 8 +- sdks/typescript/package.json | 10 +- sdks/typescript/src/evaluators/base.ts | 49 +------ sdks/typescript/src/evaluators/vocabulary.ts | 13 +- sdks/typescript/src/index.ts | 12 +- .../src/providers/ai-sdk-provider.ts | 123 ++++++++-------- sdks/typescript/src/telemetry/client.ts | 6 +- sdks/typescript/src/telemetry/types.ts | 6 +- sdks/typescript/src/telemetry/utils.ts | 74 +++++++--- sdks/typescript/tests/README.md | 2 +- .../tests/unit/evaluators/vocabulary.test.ts | 2 +- .../tests/unit/telemetry/utils.test.ts | 137 +++++++++++------- 15 files changed, 243 insertions(+), 215 deletions(-) diff --git a/evals/prompts/vocabulary/grades-3-4-user.txt b/evals/prompts/vocabulary/grades-3-4-user.txt index 1759511..7da9831 100644 --- a/evals/prompts/vocabulary/grades-3-4-user.txt +++ b/evals/prompts/vocabulary/grades-3-4-user.txt @@ -10,5 +10,3 @@ Below is the text you need to evaluate. Let's think step by step in order to pre - Text to evaluate: [BEGIN TEXT] {text} [END TEXT] - -{format_instructions} diff --git a/evals/prompts/vocabulary/other-grades-user.txt b/evals/prompts/vocabulary/other-grades-user.txt index 0d4b534..95cc176 100644 --- a/evals/prompts/vocabulary/other-grades-user.txt +++ b/evals/prompts/vocabulary/other-grades-user.txt @@ -135,5 +135,3 @@ As you read the text, you can assume the student has the following background kn [END TEXT] In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2"). 
- -{format_instructions} diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 9be34c3..d1de0cd 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -5,7 +5,15 @@ TypeScript SDK for Learning Commons educational text complexity evaluators. ## Installation ```bash -npm install ai @learning-commons/evaluators +npm install @learning-commons/evaluators ai +``` + +The SDK uses the [Vercel AI SDK](https://sdk.vercel.ai) (`ai`) as its LLM interface. You also need to install the provider adapter(s) for the LLM(s) you use: + +```bash +npm install @ai-sdk/openai # for OpenAI +npm install @ai-sdk/google # for Google Gemini +npm install @ai-sdk/anthropic # for Anthropic ``` ## Quick Start @@ -155,7 +163,7 @@ interface BaseEvaluatorConfig { telemetry?: boolean | TelemetryOptions; // Telemetry config (default: true) logger?: Logger; // Custom logger (optional) logLevel?: LogLevel; // Console log level (default: WARN) - apiKey?: string; // Learning Commons API key for authenticated telemetry (optional) + partnerKey?: string; // Learning Commons partner key for authenticated telemetry (optional) } ``` diff --git a/sdks/typescript/docs/telemetry.md b/sdks/typescript/docs/telemetry.md index 0f6e6e3..f9caea1 100644 --- a/sdks/typescript/docs/telemetry.md +++ b/sdks/typescript/docs/telemetry.md @@ -11,10 +11,13 @@ Telemetry is **anonymous by default**. If you'd like to partner with us to impro **By default, telemetry is enabled** and sends: - Performance metrics (latency, token usage) - Metadata (evaluator type, grade, SDK version) -- **Input text** (the text you're evaluating) + +**Input text is NOT collected by default.** You can opt in via `recordInputs: true` — see [Enable Input Text Collection](#enable-input-text-collection) below. We **never** collect your API keys (only a hashed identifier). +If you prefer not to send any telemetry, you can disable it entirely — see [Disable Telemetry Completely](#disable-telemetry-completely) below. 
+ ## Example Telemetry Event ```json @@ -32,7 +35,6 @@ We **never** collect your API keys (only a hashed identifier). "input_tokens": 650, "output_tokens": 350 }, - "input_text": "The mitochondria is the powerhouse of the cell...", "metadata": { "stage_details": [ { @@ -100,7 +102,7 @@ To help us support your specific use case, provide an API key: const evaluator = new VocabularyEvaluator({ googleApiKey: process.env.GOOGLE_API_KEY!, openaiApiKey: process.env.OPENAI_API_KEY!, - apiKey: process.env.LEARNING_COMMONS_API_KEY!, // Contact us for a key + partnerKey: process.env.LEARNING_COMMONS_PARTNER_KEY!, // Contact us for a key }); ``` diff --git a/sdks/typescript/package.json b/sdks/typescript/package.json index f3e4bf6..b269046 100644 --- a/sdks/typescript/package.json +++ b/sdks/typescript/package.json @@ -48,7 +48,15 @@ }, "homepage": "https://github.com/learning-commons-org/evaluators#readme", "peerDependencies": { - "ai": ">=4.0.0" + "ai": ">=6.0.0", + "@ai-sdk/openai": ">=3.0.0", + "@ai-sdk/google": ">=3.0.0", + "@ai-sdk/anthropic": ">=3.0.0" + }, + "peerDependenciesMeta": { + "@ai-sdk/openai": { "optional": true }, + "@ai-sdk/google": { "optional": true }, + "@ai-sdk/anthropic": { "optional": true } }, "dependencies": { "compromise": "^14.13.0", diff --git a/sdks/typescript/src/evaluators/base.ts b/sdks/typescript/src/evaluators/base.ts index bf5fd53..6cfde46 100644 --- a/sdks/typescript/src/evaluators/base.ts +++ b/sdks/typescript/src/evaluators/base.ts @@ -39,8 +39,8 @@ export interface BaseEvaluatorConfig { /** OpenAI API key (for evaluators using GPT) */ openaiApiKey?: string; - /** Learning Commons API key for authenticated telemetry (optional) */ - apiKey?: string; + /** Learning Commons partner key for authenticated telemetry (optional) */ + partnerKey?: string; /** * Maximum number of retries for failed API calls (default: 2) @@ -110,15 +110,10 @@ export abstract class BaseEvaluator { // Initialize telemetry if enabled if 
(this.config.telemetry.enabled) { - // Use all provider keys for client ID generation - const providerKeys = [config.googleApiKey, config.openaiApiKey].filter( - (key): key is string => key !== undefined - ); - this.telemetryClient = new TelemetryClient({ endpoint: 'https://api.learningcommons.org/v1/telemetry', - apiKey: config.apiKey, - clientId: generateClientId(...providerKeys), + partnerKey: config.partnerKey, + clientId: generateClientId(), enabled: true, }); } @@ -174,42 +169,21 @@ export abstract class BaseEvaluator { // Check if text is empty or only whitespace const trimmedText = text.trim(); if (!trimmedText) { - const error = new ValidationError( - 'Text cannot be empty or contain only whitespace' - ); - this.logger.error('Text validation failed: empty or whitespace only', { - evaluator: this.getEvaluatorType(), - error, - }); - throw error; + throw new ValidationError('Text cannot be empty or contain only whitespace'); } // Check minimum length if (trimmedText.length < VALIDATION_LIMITS.MIN_TEXT_LENGTH) { - const error = new ValidationError( + throw new ValidationError( `Text is too short. Minimum length is ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters, received ${trimmedText.length} characters` ); - this.logger.error('Text validation failed: too short', { - evaluator: this.getEvaluatorType(), - error, - minLength: VALIDATION_LIMITS.MIN_TEXT_LENGTH, - actualLength: trimmedText.length, - }); - throw error; } // Check maximum length if (trimmedText.length > VALIDATION_LIMITS.MAX_TEXT_LENGTH) { - const error = new ValidationError( + throw new ValidationError( `Text is too long. 
Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${trimmedText.length.toLocaleString()} characters` ); - this.logger.error('Text validation failed: too long', { - evaluator: this.getEvaluatorType(), - error, - maxLength: VALIDATION_LIMITS.MAX_TEXT_LENGTH, - actualLength: trimmedText.length, - }); - throw error; } } @@ -237,16 +211,9 @@ export abstract class BaseEvaluator { return parseInt(a) - parseInt(b); }).join(', '); - const error = new ValidationError( + throw new ValidationError( `Invalid grade "${grade}". Supported grades for this evaluator: ${validList}` ); - this.logger.error('Grade validation failed: invalid grade', { - evaluator: this.getEvaluatorType(), - error, - providedGrade: grade, - validGrades: validList, - }); - throw error; } } diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts index 28b2dc5..68d3467 100644 --- a/sdks/typescript/src/evaluators/vocabulary.ts +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -112,7 +112,8 @@ export class VocabularyEvaluator extends BaseEvaluator { * @param text - The text to evaluate * @param grade - The target grade level (3-12) * @returns Evaluation result with complexity score and detailed analysis - * @throws {ValidationError} If text is empty or grade is invalid + * @throws {ValidationError} If text is empty, too short/long, or grade is invalid + * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError) */ async evaluate( text: string, @@ -125,10 +126,6 @@ export class VocabularyEvaluator extends BaseEvaluator { textLength: text.length, }); - // Use inherited validation methods - this.validateText(text); - this.validateGrade(grade, VALID_GRADES); - const startTime = Date.now(); const stageDetails: StageDetail[] = []; const complexityProviderName = (grade === '3' || grade === '4') @@ -136,6 +133,10 @@ export class VocabularyEvaluator extends 
BaseEvaluator { : 'openai:gpt-4.1-2025-04-14'; try { + // Validate inputs — inside try so validation errors are telemetered. + // If partners consistently pass invalid grades/text, telemetry will surface documentation gaps. + this.validateText(text); + this.validateGrade(grade, VALID_GRADES); this.logger.debug('Stage 1: Generating background knowledge', { evaluator: 'vocabulary', operation: 'background_knowledge', @@ -199,7 +200,7 @@ export class VocabularyEvaluator extends BaseEvaluator { score: complexityResponse.data.complexity_score, reasoning: complexityResponse.data.reasoning, metadata: { - promptVersion: '1.0', + promptVersion: '1.2.0', model: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, timestamp: new Date(), processingTimeMs: latencyMs, diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index 5c72f20..e5bc977 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -7,8 +7,6 @@ export type { EvaluationError, } from './schemas/index.js'; -export { ComplexityLevel, GradeLevel } from './schemas/index.js'; - // Error types export { EvaluatorError, @@ -18,14 +16,13 @@ export { RateLimitError, NetworkError, TimeoutError, - wrapProviderError, } from './errors.js'; // Logger export type { Logger, LogContext } from './logger.js'; -export { LogLevel, createLogger, formatError } from './logger.js'; +export { LogLevel } from './logger.js'; -// Provider exports +// Provider types (for implementing custom providers) export type { LLMProvider, LLMRequest, @@ -35,17 +32,12 @@ export type { ProviderConfig, } from './providers/index.js'; -export { VercelAIProvider, createProvider } from './providers/index.js'; - // Vocabulary exports export type { VocabularyComplexity, VocabularyComplexityLevel, - BackgroundKnowledge, } from './schemas/vocabulary.js'; -export { VocabularyComplexitySchema } from './schemas/vocabulary.js'; - export { VocabularyEvaluator, evaluateVocabulary, diff --git 
a/sdks/typescript/src/providers/ai-sdk-provider.ts b/sdks/typescript/src/providers/ai-sdk-provider.ts index 984a572..e482f35 100644 --- a/sdks/typescript/src/providers/ai-sdk-provider.ts +++ b/sdks/typescript/src/providers/ai-sdk-provider.ts @@ -1,7 +1,4 @@ -import { generateText, Output } from 'ai'; -import { createOpenAI } from '@ai-sdk/openai'; -import { createAnthropic } from '@ai-sdk/anthropic'; -import { createGoogleGenerativeAI } from '@ai-sdk/google'; +import { generateText as aiGenerateText, Output } from 'ai'; import type { LLMProvider, LLMRequest, @@ -36,91 +33,85 @@ export class VercelAIProvider implements LLMProvider { * Generate structured output using Vercel AI SDK's generateText with output */ async generateStructured(request: LLMRequest): Promise> { - const model = this.getModel(request.model); + const model = await this.getModel(request.model); const startTime = Date.now(); - try { - const baseParams = { - model, - messages: request.messages, - output: Output.object({ schema: request.schema }), - temperature: request.temperature ?? 0, - maxRetries: this.config.maxRetries ?? 0, - }; - - const params = request.maxTokens !== undefined - ? { ...baseParams, maxTokens: request.maxTokens } - : baseParams; - - const { output, usage } = await generateText(params as Parameters[0]); - - return { - data: output, - model: request.model || this.getDefaultModel(), - usage: { - inputTokens: usage.inputTokens || 0, - outputTokens: usage.outputTokens || 0, - }, - latencyMs: Date.now() - startTime, - }; - } catch (error) { - throw new Error( - `Failed to generate structured output: ${error instanceof Error ? error.message : String(error)}` - ); - } + const { output, usage } = await aiGenerateText({ + model, + messages: request.messages, + output: Output.object({ schema: request.schema }), + temperature: request.temperature ?? 0, + maxRetries: this.config.maxRetries ?? 0, + ...(request.maxTokens !== undefined ? 
{ maxTokens: request.maxTokens } : {}), + }); + + return { + data: output as T, + model: request.model || this.getDefaultModel(), + usage: { + inputTokens: usage.inputTokens || 0, + outputTokens: usage.outputTokens || 0, + }, + latencyMs: Date.now() - startTime, + }; } /** * Generate plain text using Vercel AI SDK's generateText */ async generateText(messages: Message[], temperature?: number): Promise { - const model = this.getModel(); + const model = await this.getModel(); const startTime = Date.now(); - try { - const params = { - model, - messages, - temperature: temperature ?? this.config.temperature ?? 0, - maxRetries: this.config.maxRetries ?? 0, - }; - - const { text, usage } = await generateText(params as Parameters[0]); - - return { - text, - usage: { - inputTokens: usage.inputTokens || 0, - outputTokens: usage.outputTokens || 0, - }, - latencyMs: Date.now() - startTime, - }; - } catch (error) { - throw new Error( - `Failed to generate text: ${error instanceof Error ? error.message : String(error)}` - ); - } + const { text, usage } = await aiGenerateText({ + model, + messages, + temperature: temperature ?? this.config.temperature ?? 0, + maxRetries: this.config.maxRetries ?? 0, + }); + + return { + text, + usage: { + inputTokens: usage.inputTokens || 0, + outputTokens: usage.outputTokens || 0, + }, + latencyMs: Date.now() - startTime, + }; } /** - * Get the configured language model + * Get the configured language model. + * Uses dynamic imports so consumers only need to install the provider packages they use. */ - private getModel(requestModel?: string) { + private async getModel(requestModel?: string) { const modelId = requestModel || this.config.model || this.getDefaultModel(); const apiKey = this.config.apiKey; switch (this.config.type) { case 'openai': { - const provider = createOpenAI(apiKey ? 
{ apiKey } : {}); - return provider(modelId); + const { createOpenAI } = await import('@ai-sdk/openai').catch(() => { + throw new Error( + 'To use the OpenAI provider, install its adapter: npm install @ai-sdk/openai' + ); + }); + return createOpenAI(apiKey ? { apiKey } : {})(modelId); } case 'anthropic': { - const provider = createAnthropic(apiKey ? { apiKey } : {}); - return provider(modelId); + const { createAnthropic } = await import('@ai-sdk/anthropic').catch(() => { + throw new Error( + 'To use the Anthropic provider, install its adapter: npm install @ai-sdk/anthropic' + ); + }); + return createAnthropic(apiKey ? { apiKey } : {})(modelId); } case 'google': { - const provider = createGoogleGenerativeAI(apiKey ? { apiKey } : {}); - return provider(modelId); + const { createGoogleGenerativeAI } = await import('@ai-sdk/google').catch(() => { + throw new Error( + 'To use the Google provider, install its adapter: npm install @ai-sdk/google' + ); + }); + return createGoogleGenerativeAI(apiKey ? 
{ apiKey } : {})(modelId); } default: throw new Error(`Unsupported provider type: ${this.config.type}`); diff --git a/sdks/typescript/src/telemetry/client.ts b/sdks/typescript/src/telemetry/client.ts index 7dc65f2..4a1743b 100644 --- a/sdks/typescript/src/telemetry/client.ts +++ b/sdks/typescript/src/telemetry/client.ts @@ -30,9 +30,9 @@ export class TelemetryClient { 'X-Client-ID': this.config.clientId, }; - // Add API key if provided - if (this.config.apiKey) { - headers['X-API-Key'] = this.config.apiKey; + // Add partner key if provided + if (this.config.partnerKey) { + headers['X-API-Key'] = this.config.partnerKey; } const response = await fetch(this.config.endpoint, { diff --git a/sdks/typescript/src/telemetry/types.ts b/sdks/typescript/src/telemetry/types.ts index 6f3abf8..2b78164 100644 --- a/sdks/typescript/src/telemetry/types.ts +++ b/sdks/typescript/src/telemetry/types.ts @@ -94,10 +94,10 @@ export interface TelemetryConfig { /** Analytics service endpoint URL */ endpoint: string; - /** Learning Commons API key (optional, sent as X-API-Key header) */ - apiKey?: string; + /** Learning Commons partner key (optional, sent as X-API-Key header) */ + partnerKey?: string; - /** Client ID for anonymous tracking (sha256 of LLM API keys) */ + /** Client ID for anonymous tracking (persistent UUID from ~/.config/learning-commons/config.json) */ clientId: string; /** Enable telemetry (default: true) */ diff --git a/sdks/typescript/src/telemetry/utils.ts b/sdks/typescript/src/telemetry/utils.ts index 81fc945..eaef3d9 100644 --- a/sdks/typescript/src/telemetry/utils.ts +++ b/sdks/typescript/src/telemetry/utils.ts @@ -1,35 +1,65 @@ -import { createHash, randomBytes } from 'crypto'; -import { readFileSync } from 'fs'; -import { join, dirname } from 'path'; -import { fileURLToPath } from 'url'; +import { randomUUID } from 'node:crypto'; +import { readFileSync, writeFileSync, mkdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { homedir } from 
'node:os'; +import { fileURLToPath } from 'node:url'; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); +/** Cached client ID — populated on first call, reused for process lifetime */ +let cachedClientId: string | undefined; + /** - * Generate client ID for anonymous tracking + * Get or create a persistent client ID for anonymous tracking. * - * Creates SHA256 hash of API keys to create consistent identifier - * across requests while maintaining anonymity. + * On first run, generates a UUID and tries to save it to: + * - Windows: %APPDATA%\learning-commons\config.json + * - macOS/Linux: ~/.config/learning-commons/config.json * - * @param apiKeys - Array of API keys to hash - * @returns SHA256 hex string + * On subsequent runs, reads the saved UUID from disk. + * Falls back to an in-memory UUID (per-process) if the filesystem + * is unavailable (e.g., serverless, read-only containers). */ -export function generateClientId(...apiKeys: (string | undefined)[]): string { - // Filter out undefined keys and sort for consistency - const keys = apiKeys.filter((k): k is string => k !== undefined).sort(); - - // If no keys provided, generate random ID for this session - if (keys.length === 0) { - return createHash('sha256') - .update(randomBytes(16)) - .digest('hex'); +export function generateClientId(): string { + if (cachedClientId) { + return cachedClientId; + } + + const configFile = getConfigFilePath(); + + // Try to read existing client ID from disk + try { + const data = JSON.parse(readFileSync(configFile, 'utf-8')) as { + telemetry?: { clientId?: string }; + }; + if (data?.telemetry?.clientId) { + cachedClientId = data.telemetry.clientId; + return cachedClientId; + } + } catch { + // File doesn't exist yet — fall through to generate } - // Hash the concatenated keys with delimiter to prevent collisions - return createHash('sha256') - .update(keys.join('|')) - .digest('hex'); + // Generate new UUID and try to persist it + const 
clientId = randomUUID(); + try { + mkdirSync(dirname(configFile), { recursive: true }); + writeFileSync(configFile, JSON.stringify({ telemetry: { clientId } }, null, 2)); + } catch { + // Filesystem unavailable — use in-memory UUID for this process + } + + cachedClientId = clientId; + return cachedClientId; +} + +function getConfigFilePath(): string { + const configDir = + process.platform === 'win32' + ? join(process.env.APPDATA ?? homedir(), 'learning-commons') + : join(homedir(), '.config', 'learning-commons'); + return join(configDir, 'config.json'); } let cachedVersion: string | undefined; diff --git a/sdks/typescript/tests/README.md b/sdks/typescript/tests/README.md index e5499ba..0f06c3e 100644 --- a/sdks/typescript/tests/README.md +++ b/sdks/typescript/tests/README.md @@ -124,7 +124,7 @@ describeIntegration.concurrent('My Evaluator - Test Suite', () => { } evaluator = new MyEvaluator({ - apiKey: process.env.MY_API_KEY!, + partnerKey: process.env.MY_PARTNER_KEY!, retry: false, // We handle retries in test logic }); }); diff --git a/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts index 9cf2a1c..2ce906a 100644 --- a/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts +++ b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts @@ -214,7 +214,7 @@ describe('VocabularyEvaluator - Evaluation Flow', () => { expect(result.metadata).toHaveProperty('processingTimeMs'); // Verify metadata values - expect(result.metadata.promptVersion).toBe('1.0'); + expect(result.metadata.promptVersion).toBe('1.2.0'); expect(result.metadata.model).toBe('openai:gpt-4o-2024-11-20 + openai:gpt-4.1-2025-04-14'); expect(result.metadata.timestamp).toBeInstanceOf(Date); expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); // Mocked calls can be instant (0ms) diff --git a/sdks/typescript/tests/unit/telemetry/utils.test.ts b/sdks/typescript/tests/unit/telemetry/utils.test.ts index 24dd0e7..77e50d2 100644 
--- a/sdks/typescript/tests/unit/telemetry/utils.test.ts +++ b/sdks/typescript/tests/unit/telemetry/utils.test.ts @@ -1,73 +1,106 @@ -import { describe, it, expect } from 'vitest'; -import { generateClientId, getSDKVersion } from '../../../src/telemetry/utils.js'; +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { getSDKVersion } from '../../../src/telemetry/utils.js'; + +// UUID v4 pattern +const UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i; describe('Telemetry Utils', () => { describe('generateClientId', () => { - it('should generate consistent hash for same keys', () => { - const id1 = generateClientId('key1', 'key2'); - const id2 = generateClientId('key1', 'key2'); - - expect(id1).toBe(id2); + // Reset module cache between tests so cachedClientId doesn't leak across tests + beforeEach(() => { + vi.resetModules(); }); - it('should generate same hash regardless of key order', () => { - const id1 = generateClientId('key1', 'key2'); - const id2 = generateClientId('key2', 'key1'); - - expect(id1).toBe(id2); + it('should generate a new UUID, create the config directory, and persist it when no config file exists', async () => { + const writeFileSync = vi.fn(); + const mkdirSync = vi.fn(); + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => { throw new Error('ENOENT'); }), + writeFileSync, + mkdirSync, + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + const id = generateClientId(); + + expect(id).toMatch(UUID_REGEX); + expect(mkdirSync).toHaveBeenCalledWith(expect.any(String), { recursive: true }); + expect(writeFileSync).toHaveBeenCalledOnce(); + const written = JSON.parse(writeFileSync.mock.calls[0][1] as string) as { + telemetry: { clientId: string }; + }; + expect(written.telemetry.clientId).toBe(id); }); - it('should filter out undefined keys', () => { - const id1 = 
generateClientId('key1', undefined, 'key2'); - const id2 = generateClientId('key1', 'key2'); + it('should not re-read from disk on repeated calls', async () => { + const readFileSync = vi.fn(() => { throw new Error('ENOENT'); }); + vi.doMock('node:fs', () => ({ + readFileSync, + writeFileSync: vi.fn(), + mkdirSync: vi.fn(), + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); - expect(id1).toBe(id2); - }); + const { generateClientId } = await import('../../../src/telemetry/utils.js'); - it('should generate different hashes for different keys', () => { - const id1 = generateClientId('key1', 'key2'); - const id2 = generateClientId('key1', 'key3'); + generateClientId(); + generateClientId(); - expect(id1).not.toBe(id2); + expect(readFileSync).toHaveBeenCalledOnce(); }); - it('should return 64-character hex string', () => { - const id = generateClientId('key1', 'key2'); - - expect(id).toMatch(/^[a-f0-9]{64}$/); + it('should read and return an existing client ID from config file without writing to disk', async () => { + const existingId = 'a1b2c3d4-e5f6-4789-ab01-cd23ef456789'; + const writeFileSync = vi.fn(); + const mkdirSync = vi.fn(); + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => JSON.stringify({ telemetry: { clientId: existingId } })), + writeFileSync, + mkdirSync, + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + + expect(generateClientId()).toBe(existingId); + expect(mkdirSync).not.toHaveBeenCalled(); + expect(writeFileSync).not.toHaveBeenCalled(); }); - it('should handle single key', () => { - const id = generateClientId('single-key'); - - expect(id).toMatch(/^[a-f0-9]{64}$/); + it('should generate and persist a new UUID if config file exists but clientId is missing', async () => { + const writeFileSync = vi.fn(); + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => JSON.stringify({ telemetry: {} })), + 
writeFileSync, + mkdirSync: vi.fn(), + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + const id = generateClientId(); + + expect(id).toMatch(UUID_REGEX); + expect(writeFileSync).toHaveBeenCalledOnce(); + const written = JSON.parse(writeFileSync.mock.calls[0][1] as string) as { + telemetry: { clientId: string }; + }; + expect(written.telemetry.clientId).toBe(id); }); - it('should generate random ID when no keys provided', () => { - const id1 = generateClientId(); - const id2 = generateClientId(); - - // Random IDs should be different - expect(id1).not.toBe(id2); - expect(id1).toMatch(/^[a-f0-9]{64}$/); - expect(id2).toMatch(/^[a-f0-9]{64}$/); - }); - - it('should generate random ID when all keys are undefined', () => { - const id1 = generateClientId(undefined, undefined); - const id2 = generateClientId(undefined, undefined); - - // Random IDs should be different - expect(id1).not.toBe(id2); - }); + it('should return a valid UUID without throwing when filesystem is read-only', async () => { + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => { throw new Error('ENOENT'); }), + writeFileSync: vi.fn(() => { throw new Error('EROFS'); }), + mkdirSync: vi.fn(() => { throw new Error('EROFS'); }), + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); - it('should prevent collision with delimiter (theoretical)', () => { - // Without delimiter: ["ab", "c"] and ["a", "bc"] would both hash "abc" - // With delimiter: ["ab", "c"] → "ab|c" and ["a", "bc"] → "a|bc" - const id1 = generateClientId('ab', 'c'); - const id2 = generateClientId('a', 'bc'); + const { generateClientId } = await import('../../../src/telemetry/utils.js'); - expect(id1).not.toBe(id2); + let id: string | undefined; + expect(() => { id = generateClientId(); }).not.toThrow(); + expect(id).toMatch(UUID_REGEX); }); }); From c48e673ea6f9234b0e3e32aadaa9480d3a6c2e67 Mon Sep 
17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 21:51:44 -0800 Subject: [PATCH 08/10] logger for telemetry client --- sdks/typescript/src/evaluators/base.ts | 1 + sdks/typescript/src/telemetry/client.ts | 7 +++++-- sdks/typescript/src/telemetry/types.ts | 3 +++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sdks/typescript/src/evaluators/base.ts b/sdks/typescript/src/evaluators/base.ts index 6cfde46..c2a12da 100644 --- a/sdks/typescript/src/evaluators/base.ts +++ b/sdks/typescript/src/evaluators/base.ts @@ -115,6 +115,7 @@ export abstract class BaseEvaluator { partnerKey: config.partnerKey, clientId: generateClientId(), enabled: true, + logger: this.logger, }); } } diff --git a/sdks/typescript/src/telemetry/client.ts b/sdks/typescript/src/telemetry/client.ts index 4a1743b..d4db550 100644 --- a/sdks/typescript/src/telemetry/client.ts +++ b/sdks/typescript/src/telemetry/client.ts @@ -1,4 +1,5 @@ import type { TelemetryConfig, TelemetryEvent } from './types.js'; +import type { Logger } from '../logger.js'; /** * Telemetry client for sending analytics events @@ -8,9 +9,11 @@ import type { TelemetryConfig, TelemetryEvent } from './types.js'; */ export class TelemetryClient { private config: TelemetryConfig; + private logger: Logger; constructor(config: TelemetryConfig) { this.config = config; + this.logger = config.logger; } /** @@ -44,7 +47,7 @@ export class TelemetryClient { }); if (!response.ok) { - console.error( + this.logger.warn( `[Telemetry] Failed to send event: ${response.status} ${response.statusText}` ); } @@ -53,7 +56,7 @@ export class TelemetryClient { if (error instanceof Error) { // Don't log timeout errors (expected on slow networks) if (error.name !== 'TimeoutError' && error.name !== 'AbortError') { - console.error('[Telemetry] Error sending event:', error.message); + this.logger.warn(`[Telemetry] Error sending event: ${error.message}`); } } } diff --git a/sdks/typescript/src/telemetry/types.ts 
b/sdks/typescript/src/telemetry/types.ts index 2b78164..dc0c186 100644 --- a/sdks/typescript/src/telemetry/types.ts +++ b/sdks/typescript/src/telemetry/types.ts @@ -102,4 +102,7 @@ export interface TelemetryConfig { /** Enable telemetry (default: true) */ enabled: boolean; + + /** Logger instance (respects the SDK's configured log level and custom logger) */ + logger: import('../logger.js').Logger; } From 7750105af90da484253fdfb8800cb7b196e1082b Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 21:57:57 -0800 Subject: [PATCH 09/10] remove retry_attempts --- sdks/typescript/docs/telemetry.md | 8 +------- sdks/typescript/src/evaluators/base.ts | 2 -- sdks/typescript/src/evaluators/vocabulary.ts | 20 -------------------- sdks/typescript/src/telemetry/types.ts | 16 ---------------- 4 files changed, 1 insertion(+), 45 deletions(-) diff --git a/sdks/typescript/docs/telemetry.md b/sdks/typescript/docs/telemetry.md index f9caea1..991479f 100644 --- a/sdks/typescript/docs/telemetry.md +++ b/sdks/typescript/docs/telemetry.md @@ -14,7 +14,7 @@ Telemetry is **anonymous by default**. If you'd like to partner with us to impro **Input text is NOT collected by default.** You can opt in via `recordInputs: true` — see [Enable Input Text Collection](#enable-input-text-collection) below. -We **never** collect your API keys (only a hashed identifier). +We **never** collect your API keys (only an anonymous identifier). If you prefer not to send any telemetry, you can disable it entirely — see [Disable Telemetry Completely](#disable-telemetry-completely) below. 
@@ -30,7 +30,6 @@ If you prefer not to send any telemetry, you can disable it entirely — see [Di "latency_ms": 3500, "text_length_chars": 456, "provider": "google:gemini-2.5-pro+openai:gpt-4o", - "retry_attempts": -1, "token_usage": { "input_tokens": 650, "output_tokens": 350 @@ -41,7 +40,6 @@ If you prefer not to send any telemetry, you can disable it entirely — see [Di "stage": "background_knowledge", "provider": "openai:gpt-4o-2024-11-20", "latency_ms": 1200, - "retry_attempts": -1, "token_usage": { "input_tokens": 250, "output_tokens": 150 @@ -51,7 +49,6 @@ If you prefer not to send any telemetry, you can disable it entirely — see [Di "stage": "complexity_evaluation", "provider": "google:gemini-2.5-pro", "latency_ms": 2300, - "retry_attempts": -1, "token_usage": { "input_tokens": 400, "output_tokens": 200 @@ -75,13 +72,10 @@ If you prefer not to send any telemetry, you can disable it entirely — see [Di | `latency_ms` | Total evaluation time in milliseconds | | `text_length_chars` | Length of input text in characters | | `provider` | LLM provider(s) used (e.g., "openai:gpt-4o", "google:gemini-2.5-pro+openai:gpt-4o") | -| `retry_attempts` | Number of retries (-1 means unknown, see note below) | | `token_usage` | Total tokens consumed (input, output, total) | | `input_text` | The text being evaluated (only included if `recordInputs: true`) | | `metadata.stage_details` | Per-stage breakdown for multi-stage evaluators (optional) | -**Note on `retry_attempts`:** Currently set to `-1` (unknown) as actual retry counts are not yet tracked. This field is included for backward compatibility as we plan to add this as a future enhancement. 
- ## Configuration ### Default (Anonymous) diff --git a/sdks/typescript/src/evaluators/base.ts b/sdks/typescript/src/evaluators/base.ts index c2a12da..bef2ec4 100644 --- a/sdks/typescript/src/evaluators/base.ts +++ b/sdks/typescript/src/evaluators/base.ts @@ -228,7 +228,6 @@ export abstract class BaseEvaluator { textLength: number; grade?: string; provider: string; - retryAttempts: number; errorCode?: string; tokenUsage?: TokenUsage; metadata?: TelemetryMetadata; @@ -248,7 +247,6 @@ export abstract class BaseEvaluator { latency_ms: params.latencyMs, text_length_chars: params.textLength, provider: params.provider, - retry_attempts: params.retryAttempts, token_usage: params.tokenUsage, metadata: params.metadata, // Include input text only if recording is enabled diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts index 68d3467..ea281e9 100644 --- a/sdks/typescript/src/evaluators/vocabulary.ts +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -148,10 +148,6 @@ export class VocabularyEvaluator extends BaseEvaluator { stage: 'background_knowledge', provider: 'openai:gpt-4o-2024-11-20', latency_ms: bgResponse.latencyMs, - // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts - // We set -1 to indicate "unknown" (we may have retried, but can't track it) - // To fix: Implement custom retry wrapper that tracks each attempt - retry_attempts: -1, token_usage: { input_tokens: bgResponse.usage.inputTokens, output_tokens: bgResponse.usage.outputTokens, @@ -173,10 +169,6 @@ export class VocabularyEvaluator extends BaseEvaluator { stage: 'complexity_evaluation', provider: complexityProviderName, latency_ms: complexityResponse.latencyMs, - // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts - // We set -1 to indicate "unknown" (we may have retried, but can't track it) - // To fix: Implement custom retry wrapper that tracks each attempt - retry_attempts: -1, token_usage: { 
input_tokens: complexityResponse.usage.inputTokens, output_tokens: complexityResponse.usage.outputTokens, @@ -191,11 +183,6 @@ export class VocabularyEvaluator extends BaseEvaluator { output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), }; - // If any stage has unknown retries (-1), total is unknown - const totalRetries = stageDetails.some(s => s.retry_attempts === -1) - ? -1 - : stageDetails.reduce((sum, s) => sum + s.retry_attempts, 0); - const result = { score: complexityResponse.data.complexity_score, reasoning: complexityResponse.data.reasoning, @@ -215,7 +202,6 @@ export class VocabularyEvaluator extends BaseEvaluator { textLength: text.length, grade, provider: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, - retryAttempts: totalRetries, tokenUsage: totalTokenUsage, metadata: { stage_details: stageDetails, @@ -253,11 +239,6 @@ export class VocabularyEvaluator extends BaseEvaluator { output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), } : undefined; - // If any stage has unknown retries (-1), total is unknown - const totalRetries = stageDetails.length > 0 && stageDetails.some(s => s.retry_attempts === -1) - ? -1 - : stageDetails.reduce((sum, s) => sum + s.retry_attempts, 0); - // Send failure telemetry (fire-and-forget) this.sendTelemetry({ status: 'error', @@ -265,7 +246,6 @@ export class VocabularyEvaluator extends BaseEvaluator { textLength: text.length, grade, provider: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, - retryAttempts: totalRetries, tokenUsage: totalTokenUsage, errorCode: error instanceof Error ? error.name : 'UnknownError', metadata: stageDetails.length > 0 ? 
{ stage_details: stageDetails } : undefined, diff --git a/sdks/typescript/src/telemetry/types.ts b/sdks/typescript/src/telemetry/types.ts index dc0c186..31b2920 100644 --- a/sdks/typescript/src/telemetry/types.ts +++ b/sdks/typescript/src/telemetry/types.ts @@ -28,21 +28,6 @@ export interface StageDetail { /** Total latency including all retries (ms) */ latency_ms: number; - /** - * Number of retries for this stage - * - * IMPORTANT: Currently set to -1 (unknown) because Vercel AI SDK doesn't expose - * actual retry attempts. We may have retried, but can't track it. - * - * Values: - * - -1 = unknown (current implementation) - * - 0+ = known retry count (requires custom retry wrapper) - * - * Note: Token usage and costs only reflect the final successful attempt. - * Failed retry attempts are not included due to SDK limitations. - */ - retry_attempts: number; - /** Token usage aggregated across all attempts */ token_usage?: TokenUsage; @@ -81,7 +66,6 @@ export interface TelemetryEvent { latency_ms: number; text_length_chars: number; provider: string; // Format: "provider:model" or "provider1+provider2" for multi-provider - retry_attempts: number; // Total retries across all stages (-1 = unknown, see StageDetail docs) token_usage?: TokenUsage; // Aggregated across all stages and attempts metadata?: TelemetryMetadata; // Optional per-stage breakdown input_text?: string; // Input text (only if recordInputs enabled) From 7f5b5da6c5f6dcfcd4cbcae64e4d0857371acd47 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 22:09:56 -0800 Subject: [PATCH 10/10] change to configurationerror --- sdks/typescript/README.md | 7 +++++- sdks/typescript/src/errors.ts | 22 +++++++++++++++++++ sdks/typescript/src/evaluators/vocabulary.ts | 6 ++--- sdks/typescript/src/index.ts | 1 + .../tests/unit/evaluators/validation.test.ts | 17 ++++++++++++++ 5 files changed, 49 insertions(+), 4 deletions(-) diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 
d1de0cd..f4209df 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -80,6 +80,7 @@ The SDK provides specific error types to help you handle different scenarios: ```typescript import { + ConfigurationError, ValidationError, APIError, AuthenticationError, @@ -89,9 +90,13 @@ import { } from '@learning-commons/evaluators'; try { + const evaluator = new VocabularyEvaluator({ googleApiKey, openaiApiKey }); const result = await evaluator.evaluate(text, grade); } catch (error) { - if (error instanceof ValidationError) { + if (error instanceof ConfigurationError) { + // Missing or invalid API keys — fix your config + console.error('Configuration error:', error.message); + } else if (error instanceof ValidationError) { // Invalid input (text too short, invalid grade, etc.) console.error('Invalid input:', error.message); } else if (error instanceof AuthenticationError) { diff --git a/sdks/typescript/src/errors.ts b/sdks/typescript/src/errors.ts index 2be798c..f31828a 100644 --- a/sdks/typescript/src/errors.ts +++ b/sdks/typescript/src/errors.ts @@ -23,6 +23,28 @@ export class EvaluatorError extends Error { } } +/** + * Configuration error - thrown when the evaluator is misconfigured + * These are developer errors (e.g. 
missing API keys) that should NOT be retried + * + * @example + * ```typescript + * try { + * const evaluator = new VocabularyEvaluator({ googleApiKey: '' }); + * } catch (error) { + * if (error instanceof ConfigurationError) { + * console.error('Check your evaluator config:', error.message); + * } + * } + * ``` + */ +export class ConfigurationError extends EvaluatorError { + constructor(message: string) { + super(message, 'CONFIGURATION_ERROR'); + this.name = 'ConfigurationError'; + } +} + /** * Validation error - thrown when input validation fails * These are client-side errors that should NOT be retried diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts index ea281e9..912e8a2 100644 --- a/sdks/typescript/src/evaluators/vocabulary.ts +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -14,7 +14,7 @@ import { import type { EvaluationResult } from '../schemas/index.js'; import { BaseEvaluator, type BaseEvaluatorConfig } from './base.js'; import type { StageDetail } from '../telemetry/index.js'; -import { ValidationError, wrapProviderError } from '../errors.js'; +import { ConfigurationError, ValidationError, wrapProviderError } from '../errors.js'; /** * Valid grade levels (3-12) @@ -69,11 +69,11 @@ export class VocabularyEvaluator extends BaseEvaluator { // Validate required API keys if (!config.googleApiKey) { - throw new ValidationError('Google API key is required. Pass googleApiKey in config.'); + throw new ConfigurationError('Google API key is required. Pass googleApiKey in config.'); } if (!config.openaiApiKey) { - throw new ValidationError('OpenAI API key is required. Pass openaiApiKey in config.'); + throw new ConfigurationError('OpenAI API key is required. 
Pass openaiApiKey in config.'); } // Create Google Gemini provider for complexity evaluation (grades 3-4) diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index e5bc977..16ac0e8 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -10,6 +10,7 @@ export type { // Error types export { EvaluatorError, + ConfigurationError, ValidationError, APIError, AuthenticationError, diff --git a/sdks/typescript/tests/unit/evaluators/validation.test.ts b/sdks/typescript/tests/unit/evaluators/validation.test.ts index b273b8a..74c095a 100644 --- a/sdks/typescript/tests/unit/evaluators/validation.test.ts +++ b/sdks/typescript/tests/unit/evaluators/validation.test.ts @@ -1,6 +1,7 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; import { VocabularyEvaluator } from '../../../src/evaluators/vocabulary.js'; import { VALIDATION_LIMITS } from '../../../src/evaluators/base.js'; +import { ConfigurationError } from '../../../src/errors.js'; import type { LLMProvider } from '../../../src/providers/base.js'; /** @@ -32,6 +33,22 @@ vi.mock('../../../src/telemetry/client.js', () => { }; }); +describe('Configuration Validation', () => { + it('should throw ConfigurationError when googleApiKey is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: '', + openaiApiKey: 'test-openai-key', + })).toThrow(ConfigurationError); + }); + + it('should throw ConfigurationError when openaiApiKey is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: '', + })).toThrow(ConfigurationError); + }); +}); + describe('Input Validation - Text Validation', () => { let evaluator: VocabularyEvaluator;