diff --git a/evals/prompts/vocabulary/grades-3-4-user.txt b/evals/prompts/vocabulary/grades-3-4-user.txt index 1759511..7da9831 100644 --- a/evals/prompts/vocabulary/grades-3-4-user.txt +++ b/evals/prompts/vocabulary/grades-3-4-user.txt @@ -10,5 +10,3 @@ Below is the text you need to evaluate. Let's think step by step in order to pre - Text to evaluate: [BEGIN TEXT] {text} [END TEXT] - -{format_instructions} diff --git a/evals/prompts/vocabulary/other-grades-user.txt b/evals/prompts/vocabulary/other-grades-user.txt index 0d4b534..95cc176 100644 --- a/evals/prompts/vocabulary/other-grades-user.txt +++ b/evals/prompts/vocabulary/other-grades-user.txt @@ -135,5 +135,3 @@ As you read the text, you can assume the student has the following background kn [END TEXT] In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2"). - -{format_instructions} diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 3ea9d1c..f4209df 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -1 +1,179 @@ # @learning-commons/evaluators + +TypeScript SDK for Learning Commons educational text complexity evaluators. + +## Installation + +```bash +npm install @learning-commons/evaluators ai +``` + +The SDK uses the [Vercel AI SDK](https://sdk.vercel.ai) (`ai`) as its LLM interface. 
You also need to install the provider adapter(s) for the LLM(s) you use: + +```bash +npm install @ai-sdk/openai # for OpenAI +npm install @ai-sdk/google # for Google Gemini +npm install @ai-sdk/anthropic # for Anthropic +``` + +## Quick Start + +```typescript +import { VocabularyEvaluator } from '@learning-commons/evaluators'; + +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY, + openaiApiKey: process.env.OPENAI_API_KEY +}); + +const result = await evaluator.evaluate("Your text here", "5"); +console.log(result.score); // "moderately complex" +``` + +--- + +## Evaluators + +### 1. Vocabulary Evaluator + +Evaluates vocabulary complexity using the Qual Text Complexity rubric (SAP). + +**Supported Grades:** 3-12 + +**Uses:** Google Gemini 2.5 Pro (grades 3-4) or OpenAI GPT-4.1 (grades 5-12) for complexity evaluation, plus OpenAI GPT-4o for background knowledge + +**Constructor:** +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: string; // Required - Google API key + openaiApiKey: string; // Required - OpenAI API key + maxRetries?: number; // Optional - Max retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Optional (default: true) + logger?: Logger; // Optional - Custom logger + logLevel?: LogLevel; // Optional - SILENT | ERROR | WARN | INFO | DEBUG (default: WARN) +}); +``` + +**API:** +```typescript +await evaluator.evaluate(text: string, grade: string) +``` + +**Returns:** +```typescript +{ + score: 'slightly complex' | 'moderately complex' | 'very complex' | 'exceedingly complex'; + reasoning: string; + metadata: { + promptVersion: string; + model: string; + timestamp: Date; + processingTimeMs: number; + }; + _internal: VocabularyComplexity; // Detailed analysis +} +``` + +## Error Handling + +The SDK provides specific error types to help you handle different scenarios: + +```typescript +import { + ConfigurationError, + ValidationError, + APIError, + AuthenticationError, + RateLimitError, + NetworkError, + TimeoutError, +} from '@learning-commons/evaluators'; + +try { + const 
evaluator = new VocabularyEvaluator({ googleApiKey, openaiApiKey }); + const result = await evaluator.evaluate(text, grade); +} catch (error) { + if (error instanceof ConfigurationError) { + // Missing or invalid API keys — fix your config + console.error('Configuration error:', error.message); + } else if (error instanceof ValidationError) { + // Invalid input (text too short, invalid grade, etc.) + console.error('Invalid input:', error.message); + } else if (error instanceof AuthenticationError) { + // Invalid API keys + console.error('Check your API keys:', error.message); + } else if (error instanceof RateLimitError) { + // Rate limit exceeded - wait and retry + console.error('Rate limited. Retry after:', error.retryAfter); + } else if (error instanceof NetworkError) { + // Network connectivity issues + console.error('Network error:', error.message); + } else if (error instanceof APIError) { + // Other API errors + console.error('API error:', error.message, 'Status:', error.statusCode); + } +} +``` + +--- + +## Logging + +Control logging verbosity with `logLevel`: + +```typescript +import { VocabularyEvaluator, LogLevel } from '@learning-commons/evaluators'; + +const evaluator = new VocabularyEvaluator({ + googleApiKey: '...', + openaiApiKey: '...', + logLevel: LogLevel.INFO, // SILENT | ERROR | WARN | INFO | DEBUG +}); +``` + +Or provide a custom logger: + +```typescript +import type { Logger } from '@learning-commons/evaluators'; + +const customLogger: Logger = { + debug: (msg, ctx) => myLogger.debug(msg, ctx), + info: (msg, ctx) => myLogger.info(msg, ctx), + warn: (msg, ctx) => myLogger.warn(msg, ctx), + error: (msg, ctx) => myLogger.error(msg, ctx), +}; + +const evaluator = new VocabularyEvaluator({ + googleApiKey: '...', + openaiApiKey: '...', + logger: customLogger, +}); +``` + +--- + +## Telemetry & Privacy + +See [docs/telemetry.md](./docs/telemetry.md) for telemetry configuration and privacy information. 
+ +--- + +## Configuration Options + +All evaluators support these common options: + +```typescript +interface BaseEvaluatorConfig { + maxRetries?: number; // Max API retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Telemetry config (default: true) + logger?: Logger; // Custom logger (optional) + logLevel?: LogLevel; // Console log level (default: WARN) + partnerKey?: string; // Learning Commons partner key for authenticated telemetry (optional) +} +``` + +--- + +## License + +MIT diff --git a/sdks/typescript/docs/telemetry.md b/sdks/typescript/docs/telemetry.md new file mode 100644 index 0000000..991479f --- /dev/null +++ b/sdks/typescript/docs/telemetry.md @@ -0,0 +1,124 @@ +# Telemetry + +## Why We Collect Telemetry + +We use telemetry data to improve evaluator quality, identify edge cases, and optimize performance. This helps us build better tools for our developer partners. + +Telemetry is **anonymous by default**. If you'd like to partner with us to improve your specific use case, you can optionally provide an API key (see Configuration section below). This allows us to connect with you and collaborate more deeply. + +## What We Collect + +**By default, telemetry is enabled** and sends: +- Performance metrics (latency, token usage) +- Metadata (evaluator type, grade, SDK version) + +**Input text is NOT collected by default.** You can opt in via `recordInputs: true` — see [Enable Input Text Collection](#enable-input-text-collection) below. + +We **never** collect your API keys (only an anonymous identifier). + +If you prefer not to send any telemetry, you can disable it entirely — see [Disable Telemetry Completely](#disable-telemetry-completely) below. 
 + +## Example Telemetry Event + +```json +{ + "timestamp": "2026-02-05T19:30:00.000Z", + "sdk_version": "0.1.0", + "evaluator_type": "vocabulary", + "grade": "3", + "status": "success", + "latency_ms": 3500, + "text_length_chars": 456, + "provider": "openai:gpt-4o-2024-11-20 + google:gemini-2.5-pro", + "token_usage": { + "input_tokens": 650, + "output_tokens": 350 + }, + "metadata": { + "stage_details": [ + { + "stage": "background_knowledge", + "provider": "openai:gpt-4o-2024-11-20", + "latency_ms": 1200, + "token_usage": { + "input_tokens": 250, + "output_tokens": 150 + } + }, + { + "stage": "complexity_evaluation", + "provider": "google:gemini-2.5-pro", + "latency_ms": 2300, + "token_usage": { + "input_tokens": 400, + "output_tokens": 200 + } + } + ] + } +} +``` + +## Field Reference + +| Field | Description | +|-------|-------------| +| `timestamp` | ISO 8601 timestamp when evaluation started | +| `sdk_version` | Version of the SDK (e.g., "0.1.0") | +| `evaluator_type` | Which evaluator ran (e.g., "vocabulary", "sentence-structure") | +| `grade` | Grade level evaluated (e.g., "5", "K") | +| `status` | Evaluation outcome: "success" or "error" | +| `error_code` | Error type if status is "error" (e.g., "Error", "TypeError") | +| `latency_ms` | Total evaluation time in milliseconds | +| `text_length_chars` | Length of input text in characters | +| `provider` | LLM provider(s) used (e.g., "openai:gpt-4o", "openai:gpt-4o-2024-11-20 + google:gemini-2.5-pro") | +| `token_usage` | Total tokens consumed (input, output, total) | +| `input_text` | The text being evaluated (only included if `recordInputs: true`) | +| `metadata.stage_details` | Per-stage breakdown for multi-stage evaluators (optional) | + +## Configuration + +### Default (Anonymous) + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + // telemetry: true (default - anonymous) +}); +``` + +### Partner with Us (Authenticated) + +To help 
us support your specific use case, provide an API key: + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + partnerKey: process.env.LEARNING_COMMONS_PARTNER_KEY!, // Contact us for a key +}); +``` + +### Disable Telemetry Completely + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + telemetry: false, // No data sent +}); +``` + +### Enable Input Text Collection + +```typescript +const evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + telemetry: { + enabled: true, + recordInputs: true, // Also send input text with telemetry + }, +}); +``` diff --git a/sdks/typescript/package.json b/sdks/typescript/package.json index f3e4bf6..b269046 100644 --- a/sdks/typescript/package.json +++ b/sdks/typescript/package.json @@ -48,7 +48,15 @@ }, "homepage": "https://github.com/learning-commons-org/evaluators#readme", "peerDependencies": { - "ai": ">=4.0.0" + "ai": ">=6.0.0", + "@ai-sdk/openai": ">=3.0.0", + "@ai-sdk/google": ">=3.0.0", + "@ai-sdk/anthropic": ">=3.0.0" + }, + "peerDependenciesMeta": { + "@ai-sdk/openai": { "optional": true }, + "@ai-sdk/google": { "optional": true }, + "@ai-sdk/anthropic": { "optional": true } }, "dependencies": { "compromise": "^14.13.0", diff --git a/sdks/typescript/src/errors.ts b/sdks/typescript/src/errors.ts new file mode 100644 index 0000000..f31828a --- /dev/null +++ b/sdks/typescript/src/errors.ts @@ -0,0 +1,263 @@ +/** + * Custom error types for the Evaluators SDK + * + * This module provides a hierarchy of error types to help users + * distinguish between different error scenarios and implement + * appropriate error handling strategies. 
+ */ + +/** + * Base error class for all evaluator errors + */ +export class EvaluatorError extends Error { + constructor( + message: string, + public readonly code?: string + ) { + super(message); + this.name = 'EvaluatorError'; + // Maintains proper stack trace for where error was thrown (only available on V8) + if (Error.captureStackTrace) { + Error.captureStackTrace(this, this.constructor); + } + } +} + +/** + * Configuration error - thrown when the evaluator is misconfigured + * These are developer errors (e.g. missing API keys) that should NOT be retried + * + * @example + * ```typescript + * try { + * const evaluator = new VocabularyEvaluator({ googleApiKey: '' }); + * } catch (error) { + * if (error instanceof ConfigurationError) { + * console.error('Check your evaluator config:', error.message); + * } + * } + * ``` + */ +export class ConfigurationError extends EvaluatorError { + constructor(message: string) { + super(message, 'CONFIGURATION_ERROR'); + this.name = 'ConfigurationError'; + } +} + +/** + * Validation error - thrown when input validation fails + * These are client-side errors that should NOT be retried + * + * @example + * ```typescript + * try { + * await evaluator.evaluate('', '5'); + * } catch (error) { + * if (error instanceof ValidationError) { + * // Show user-friendly error message + * console.error('Invalid input:', error.message); + * } + * } + * ``` + */ +export class ValidationError extends EvaluatorError { + constructor(message: string) { + super(message, 'VALIDATION_ERROR'); + this.name = 'ValidationError'; + } +} + +/** + * Base API error - thrown when LLM API calls fail + * Contains additional context about the API error + */ +export class APIError extends EvaluatorError { + constructor( + message: string, + public readonly statusCode?: number, + public readonly retryable: boolean = false, + code?: string + ) { + super(message, code); + this.name = 'APIError'; + } +} + +/** + * Authentication error - thrown when API keys are 
invalid or missing + * HTTP 401 or 403 responses + * Should NOT be retried + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof AuthenticationError) { + * // Prompt user to check API keys + * console.error('Invalid API keys. Please check your credentials.'); + * } + * } + * ``` + */ +export class AuthenticationError extends APIError { + constructor(message: string, statusCode?: number) { + super(message, statusCode, false, 'AUTHENTICATION_ERROR'); + this.name = 'AuthenticationError'; + } +} + +/** + * Rate limit error - thrown when API rate limits are exceeded + * HTTP 429 responses + * Should be retried with exponential backoff + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof RateLimitError) { + * // Wait and retry + * await sleep(error.retryAfter || 5000); + * // retry... + * } + * } + * ``` + */ +export class RateLimitError extends APIError { + constructor( + message: string, + public readonly retryAfter?: number // milliseconds + ) { + super(message, 429, true, 'RATE_LIMIT_ERROR'); + this.name = 'RateLimitError'; + } +} + +/** + * Network error - thrown when network requests fail + * Connection timeouts, DNS failures, etc. 
+ * May be retryable depending on the scenario + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof NetworkError) { + * // Check network connection and retry + * console.error('Network error:', error.message); + * } + * } + * ``` + */ +export class NetworkError extends APIError { + constructor(message: string, retryable: boolean = true) { + super(message, undefined, retryable, 'NETWORK_ERROR'); + this.name = 'NetworkError'; + } +} + +/** + * Timeout error - thrown when requests exceed timeout limits + * Should be retried with caution + * + * @example + * ```typescript + * try { + * await evaluator.evaluate(text, grade); + * } catch (error) { + * if (error instanceof TimeoutError) { + * // Retry with longer timeout or smaller text + * console.error('Request timed out'); + * } + * } + * ``` + */ +export class TimeoutError extends APIError { + constructor(message: string = 'Request timed out') { + super(message, 408, true, 'TIMEOUT_ERROR'); + this.name = 'TimeoutError'; + } +} + +/** + * Parse structured output from LLM provider error + */ +function parseProviderError(error: unknown): { message: string; statusCode?: number; code?: string } { + // Handle Error objects + if (error instanceof Error) { + const message = error.message; + + // Try to extract status code from error message + // Common patterns: "429", "401", "Error 429:", "Status: 429" + const statusMatch = message.match(/\b(4\d{2}|5\d{2})\b/); + const statusCode = statusMatch ? parseInt(statusMatch[1]) : undefined; + + return { + message, + statusCode, + code: error.name !== 'Error' ? 
error.name : undefined, + }; + } + + // Handle unknown error types + return { + message: String(error), + }; +} + +/** + * Wrap a provider error into the appropriate error type + * + * @internal + */ +export function wrapProviderError(error: unknown, defaultMessage: string = 'API request failed'): APIError { + const { message, statusCode, code } = parseProviderError(error); + + // Detect authentication errors (401, 403) + if (statusCode === 401 || statusCode === 403) { + return new AuthenticationError( + message.includes('API key') ? message : 'Invalid API key', + statusCode + ); + } + + // Detect rate limit errors (429) + if (statusCode === 429) { + // Try to extract retry-after if present + const retryAfterMatch = message.match(/retry[- ]after[:\s]+(\d+)/i); + const retryAfter = retryAfterMatch ? parseInt(retryAfterMatch[1]) * 1000 : undefined; + + return new RateLimitError( + message.includes('rate limit') ? message : 'Rate limit exceeded', + retryAfter + ); + } + + // Detect network errors + if ( + message.includes('ECONNREFUSED') || + message.includes('ENOTFOUND') || + message.includes('ETIMEDOUT') || + message.includes('network') || + message.includes('Network') + ) { + return new NetworkError(message); + } + + // Detect timeout errors + if (message.includes('timeout') || message.includes('timed out')) { + return new TimeoutError(message); + } + + // Generic API error for everything else + return new APIError( + message || defaultMessage, + statusCode, + statusCode ? 
statusCode >= 500 : false, // 5xx errors are retryable + code + ); +} diff --git a/sdks/typescript/src/evaluators/base.ts b/sdks/typescript/src/evaluators/base.ts new file mode 100644 index 0000000..bef2ec4 --- /dev/null +++ b/sdks/typescript/src/evaluators/base.ts @@ -0,0 +1,256 @@ +import { + TelemetryClient, + generateClientId, + getSDKVersion, + type TelemetryMetadata, + type TokenUsage, +} from '../telemetry/index.js'; +import { ValidationError } from '../errors.js'; +import { createLogger, LogLevel, type Logger } from '../logger.js'; + +/** + * Validation constants for input text + */ +export const VALIDATION_LIMITS = { + /** Minimum text length in characters */ + MIN_TEXT_LENGTH: 10, + /** Maximum text length in characters (100K chars ≈ 25K tokens) */ + MAX_TEXT_LENGTH: 100_000, +} as const; + +/** + * Granular telemetry configuration options + */ +export interface TelemetryOptions { + /** Enable telemetry (default: true) */ + enabled?: boolean; + + /** Record input text in telemetry (default: false) */ + recordInputs?: boolean; +} + +/** + * Base configuration for all evaluators + */ +export interface BaseEvaluatorConfig { + /** Google API key (for evaluators using Gemini) */ + googleApiKey?: string; + + /** OpenAI API key (for evaluators using GPT) */ + openaiApiKey?: string; + + /** Learning Commons partner key for authenticated telemetry (optional) */ + partnerKey?: string; + + /** + * Maximum number of retries for failed API calls (default: 2) + * Set to 0 to disable retries. 
+ * + * Note: With maxRetries=2, a failed call will be attempted up to 3 times total + * (1 initial attempt + 2 retries) + */ + maxRetries?: number; + + /** + * Telemetry configuration (default: all enabled) + * + * Can be: + * - `true`: Enable with defaults (recordInputs: false) + * - `false`: Disable completely + * - `TelemetryOptions`: Granular control + */ + telemetry?: boolean | TelemetryOptions; + + /** + * Custom logger implementation (optional) + * If not provided, uses console logger with specified logLevel + */ + logger?: Logger; + + /** + * Log level for default console logger (default: WARN) + * Only used if custom logger is not provided + * + * - DEBUG: Very verbose, shows all operations + * - INFO: Normal operations + * - WARN: Warnings only (default) + * - ERROR: Errors only + * - SILENT: No logging + */ + logLevel?: LogLevel; +} + +/** + * Abstract base class for all evaluators + * + * Provides common functionality: + * - Telemetry setup and event sending + * - Text validation + * - Grade validation (with overridable default) + * - Metadata creation + */ +export abstract class BaseEvaluator { + protected telemetryClient?: TelemetryClient; + protected logger: Logger; + protected config: Required> & { + telemetry: Required; + }; + + constructor(config: BaseEvaluatorConfig) { + // Initialize logger + this.logger = createLogger(config.logger, config.logLevel ?? LogLevel.WARN); + // Normalize telemetry config + const telemetryConfig = this.normalizeTelemetryConfig(config.telemetry); + + // Set defaults for common config + this.config = { + maxRetries: config.maxRetries ?? 
2, + telemetry: telemetryConfig, + }; + + // Initialize telemetry if enabled + if (this.config.telemetry.enabled) { + this.telemetryClient = new TelemetryClient({ + endpoint: 'https://api.learningcommons.org/v1/telemetry', + partnerKey: config.partnerKey, + clientId: generateClientId(), + enabled: true, + logger: this.logger, + }); + } + } + + /** + * Normalize telemetry config to standard format + */ + private normalizeTelemetryConfig( + telemetry: boolean | TelemetryOptions | undefined + ): Required { + // Handle boolean shortcuts + if (telemetry === false) { + return { + enabled: false, + recordInputs: false, + }; + } + + if (telemetry === true || telemetry === undefined) { + return { + enabled: true, + recordInputs: false, + }; + } + + // Handle granular config object + return { + enabled: telemetry.enabled ?? true, + recordInputs: telemetry.recordInputs ?? false, + }; + } + + /** + * Get the evaluator type identifier (e.g., "vocabulary", "sentence-structure") + * Must be implemented by concrete evaluators + */ + protected abstract getEvaluatorType(): string; + + /** + * Validate text meets requirements + * Default implementation - can be overridden by concrete evaluators + * + * @throws {ValidationError} If text is invalid + */ + protected validateText(text: string): void { + this.logger.debug('Validating text input', { + evaluator: this.getEvaluatorType(), + operation: 'validateText', + textLength: text.length, + }); + + // Check if text is empty or only whitespace + const trimmedText = text.trim(); + if (!trimmedText) { + throw new ValidationError('Text cannot be empty or contain only whitespace'); + } + + // Check minimum length + if (trimmedText.length < VALIDATION_LIMITS.MIN_TEXT_LENGTH) { + throw new ValidationError( + `Text is too short. 
Minimum length is ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters, received ${trimmedText.length} characters` + ); + } + + // Check maximum length + if (trimmedText.length > VALIDATION_LIMITS.MAX_TEXT_LENGTH) { + throw new ValidationError( + `Text is too long. Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${trimmedText.length.toLocaleString()} characters` + ); + } + } + + /** + * Validate grade is in supported range + * Default implementation - can be overridden by concrete evaluators + * + * @param grade - Grade level to validate + * @param validGrades - Set of valid grades for this evaluator + * @throws {ValidationError} If grade is invalid + */ + protected validateGrade(grade: string, validGrades: Set): void { + this.logger.debug('Validating grade input', { + evaluator: this.getEvaluatorType(), + operation: 'validateGrade', + grade, + }); + + // Check if grade is in valid set + if (!validGrades.has(grade)) { + const validList = Array.from(validGrades).sort((a, b) => { + // Sort K first, then numerically + if (a === 'K') return -1; + if (b === 'K') return 1; + return parseInt(a) - parseInt(b); + }).join(', '); + + throw new ValidationError( + `Invalid grade "${grade}". 
Supported grades for this evaluator: ${validList}` + ); + } + } + + /** + * Send telemetry event to analytics service + * Common helper for all evaluators + */ + protected async sendTelemetry(params: { + status: 'success' | 'error'; + latencyMs: number; + textLength: number; + grade?: string; + provider: string; + errorCode?: string; + tokenUsage?: TokenUsage; + metadata?: TelemetryMetadata; + inputText?: string; + }): Promise { + if (!this.telemetryClient) { + return; + } + + await this.telemetryClient.send({ + timestamp: new Date().toISOString(), + sdk_version: getSDKVersion(), + evaluator_type: this.getEvaluatorType(), + grade: params.grade, + status: params.status, + error_code: params.errorCode, + latency_ms: params.latencyMs, + text_length_chars: params.textLength, + provider: params.provider, + token_usage: params.tokenUsage, + metadata: params.metadata, + // Include input text only if recording is enabled + input_text: this.config.telemetry.recordInputs ? params.inputText : undefined, + }); + } +} diff --git a/sdks/typescript/src/evaluators/index.ts b/sdks/typescript/src/evaluators/index.ts new file mode 100644 index 0000000..2f6ff9d --- /dev/null +++ b/sdks/typescript/src/evaluators/index.ts @@ -0,0 +1,7 @@ +export { BaseEvaluator, type BaseEvaluatorConfig, type TelemetryOptions } from './base.js'; + +export { + VocabularyEvaluator, + evaluateVocabulary, + type VocabularyEvaluatorConfig, +} from './vocabulary.js'; diff --git a/sdks/typescript/src/evaluators/vocabulary.ts b/sdks/typescript/src/evaluators/vocabulary.ts new file mode 100644 index 0000000..912e8a2 --- /dev/null +++ b/sdks/typescript/src/evaluators/vocabulary.ts @@ -0,0 +1,353 @@ +import type { LLMProvider } from '../providers/index.js'; +import { createProvider } from '../providers/index.js'; +import { + VocabularyComplexitySchema, + type VocabularyComplexity, + type BackgroundKnowledge, +} from '../schemas/vocabulary.js'; +import { calculateFleschKincaidGrade } from '../features/index.js'; 
+import { + getBackgroundKnowledgePrompt, + getSystemPrompt, + getUserPrompt, +} from '../prompts/vocabulary/index.js'; +import type { EvaluationResult } from '../schemas/index.js'; +import { BaseEvaluator, type BaseEvaluatorConfig } from './base.js'; +import type { StageDetail } from '../telemetry/index.js'; +import { ConfigurationError, ValidationError, wrapProviderError } from '../errors.js'; + +/** + * Valid grade levels (3-12) + */ +const VALID_GRADES = new Set(['3', '4', '5', '6', '7', '8', '9', '10', '11', '12']); + +/** + * Configuration for VocabularyEvaluator + */ +export interface VocabularyEvaluatorConfig extends BaseEvaluatorConfig { + /** Google API key for complexity evaluation (uses Gemini 2.5 Pro) */ + googleApiKey: string; + + /** OpenAI API key for background knowledge generation (uses GPT-4o) */ + openaiApiKey: string; +} + +/** + * Vocabulary Evaluator + * + * Evaluates vocabulary complexity of educational texts relative to grade level. + * Uses a 2-stage process: + * 1. Generate background knowledge assumption for the student's grade level + * 2. Evaluate vocabulary complexity using that background knowledge + * + * Based on Qual Text Complexity rubric (SAP) with 4 levels: + * - Slightly complex + * - Moderately complex + * - Very complex + * - Exceedingly complex + * + * @example + * ```typescript + * const evaluator = new VocabularyEvaluator({ + * googleApiKey: process.env.GOOGLE_API_KEY, + * openaiApiKey: process.env.OPENAI_API_KEY + * }); + * + * const result = await evaluator.evaluate(text, "3"); + * console.log(result.score); // "moderately complex" + * console.log(result.reasoning); + * ``` + */ +export class VocabularyEvaluator extends BaseEvaluator { + private grades34ComplexityProvider: LLMProvider; + private otherGradesComplexityProvider: LLMProvider; + private backgroundKnowledgeProvider: LLMProvider; + + constructor(config: VocabularyEvaluatorConfig) { + // Call base constructor for common setup (telemetry, etc.) 
+ super(config); + + // Validate required API keys + if (!config.googleApiKey) { + throw new ConfigurationError('Google API key is required. Pass googleApiKey in config.'); + } + + if (!config.openaiApiKey) { + throw new ConfigurationError('OpenAI API key is required. Pass openaiApiKey in config.'); + } + + // Create Google Gemini provider for complexity evaluation (grades 3-4) + this.grades34ComplexityProvider = createProvider({ + type: 'google', + model: 'gemini-2.5-pro', + apiKey: config.googleApiKey, + maxRetries: this.config.maxRetries, + }); + + // Create OpenAI GPT-4.1 provider for complexity evaluation (grades 5-12) + this.otherGradesComplexityProvider = createProvider({ + type: 'openai', + model: 'gpt-4.1-2025-04-14', + apiKey: config.openaiApiKey, + maxRetries: this.config.maxRetries, + }); + + // Create OpenAI GPT-4o provider for background knowledge generation + this.backgroundKnowledgeProvider = createProvider({ + type: 'openai', + model: 'gpt-4o-2024-11-20', + apiKey: config.openaiApiKey, + maxRetries: this.config.maxRetries, + }); + } + + // Implement abstract methods from BaseEvaluator + protected getEvaluatorType(): string { + return 'vocabulary'; + } + + /** + * Evaluate vocabulary complexity for a given text and grade level + * + * @param text - The text to evaluate + * @param grade - The target grade level (3-12) + * @returns Evaluation result with complexity score and detailed analysis + * @throws {ValidationError} If text is empty, too short/long, or grade is invalid + * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError) + */ + async evaluate( + text: string, + grade: string + ): Promise> { + this.logger.info('Starting vocabulary evaluation', { + evaluator: 'vocabulary', + operation: 'evaluate', + grade, + textLength: text.length, + }); + + const startTime = Date.now(); + const stageDetails: StageDetail[] = []; + const complexityProviderName = (grade === '3' || grade === '4') + ? 
'google:gemini-2.5-pro' + : 'openai:gpt-4.1-2025-04-14'; + + try { + // Validate inputs — inside try so validation errors are telemetered. + // If partners consistently pass invalid grades/text, telemetry will surface documentation gaps. + this.validateText(text); + this.validateGrade(grade, VALID_GRADES); + this.logger.debug('Stage 1: Generating background knowledge', { + evaluator: 'vocabulary', + operation: 'background_knowledge', + }); + // Stage 1: Generate background knowledge assumption + const bgResponse = await this.getBackgroundKnowledgeAssumption(text, grade); + + stageDetails.push({ + stage: 'background_knowledge', + provider: 'openai:gpt-4o-2024-11-20', + latency_ms: bgResponse.latencyMs, + token_usage: { + input_tokens: bgResponse.usage.inputTokens, + output_tokens: bgResponse.usage.outputTokens, + }, + }); + + // Calculate Flesch-Kincaid grade level + const fkLevel = calculateFleschKincaidGrade(text); + + // Stage 2: Evaluate vocabulary complexity + const complexityResponse = await this.evaluateComplexity( + text, + grade, + bgResponse.knowledge.assumption, + fkLevel + ); + + stageDetails.push({ + stage: 'complexity_evaluation', + provider: complexityProviderName, + latency_ms: complexityResponse.latencyMs, + token_usage: { + input_tokens: complexityResponse.usage.inputTokens, + output_tokens: complexityResponse.usage.outputTokens, + }, + }); + + const latencyMs = Date.now() - startTime; + + // Aggregate token usage + const totalTokenUsage = { + input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0), + output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), + }; + + const result = { + score: complexityResponse.data.complexity_score, + reasoning: complexityResponse.data.reasoning, + metadata: { + promptVersion: '1.2.0', + model: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, + timestamp: new Date(), + processingTimeMs: latencyMs, + }, + _internal: 
complexityResponse.data, + }; + + // Send success telemetry (fire-and-forget) + this.sendTelemetry({ + status: 'success', + latencyMs, + textLength: text.length, + grade, + provider: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, + tokenUsage: totalTokenUsage, + metadata: { + stage_details: stageDetails, + }, + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + this.logger.info('Vocabulary evaluation completed successfully', { + evaluator: 'vocabulary', + operation: 'evaluate', + grade, + score: result.score, + processingTimeMs: latencyMs, + }); + + return result; + } catch (error) { + const latencyMs = Date.now() - startTime; + + // Log the error + this.logger.error('Vocabulary evaluation failed', { + evaluator: 'vocabulary', + operation: 'evaluate', + grade, + error: error instanceof Error ? error : undefined, + processingTimeMs: latencyMs, + completedStages: stageDetails.length, + }); + + // Aggregate metrics from completed stages + const totalTokenUsage = stageDetails.length > 0 ? { + input_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.input_tokens || 0), 0), + output_tokens: stageDetails.reduce((sum, s) => sum + (s.token_usage?.output_tokens || 0), 0), + } : undefined; + + // Send failure telemetry (fire-and-forget) + this.sendTelemetry({ + status: 'error', + latencyMs, + textLength: text.length, + grade, + provider: `openai:gpt-4o-2024-11-20 + ${complexityProviderName}`, + tokenUsage: totalTokenUsage, + errorCode: error instanceof Error ? error.name : 'UnknownError', + metadata: stageDetails.length > 0 ? 
{ stage_details: stageDetails } : undefined, + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + // Re-throw validation errors as-is + if (error instanceof ValidationError) { + throw error; + } + + // Wrap provider errors into appropriate error types + throw wrapProviderError(error, 'Vocabulary evaluation failed'); + } + } + + /** + * Stage 1: Generate background knowledge assumption + * + * Estimates what topics the student at the given grade level would be familiar with + * based on Common Core curriculum progression. + */ + private async getBackgroundKnowledgeAssumption( + text: string, + grade: string + ): Promise<{ knowledge: BackgroundKnowledge; usage: { inputTokens: number; outputTokens: number }; latencyMs: number }> { + const prompt = getBackgroundKnowledgePrompt(text, grade); + + const response = await this.backgroundKnowledgeProvider.generateText( + [{ role: 'user', content: prompt }], + 0 // temperature = 0 for consistency + ); + + return { + knowledge: { + assumption: response.text.trim(), + grade, + }, + usage: response.usage, + latencyMs: response.latencyMs, + }; + } + + /** + * Stage 2: Evaluate vocabulary complexity + * + * Uses the Qual Text Complexity rubric (SAP) and background knowledge to evaluate vocabulary complexity. + * Grades 3-4 use Gemini 2.5 Pro; grades 5-12 use GPT-4.1. + */ + private async evaluateComplexity( + text: string, + grade: string, + backgroundKnowledge: string, + fkLevel: number + ): Promise<{ data: VocabularyComplexity; usage: { inputTokens: number; outputTokens: number }; latencyMs: number }> { + const systemPrompt = getSystemPrompt(grade); + const userPrompt = getUserPrompt(text, grade, backgroundKnowledge, fkLevel); + + const provider = (grade === '3' || grade === '4') + ? 
this.grades34ComplexityProvider + : this.otherGradesComplexityProvider; + + const response = await provider.generateStructured({ + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userPrompt }, + ], + schema: VocabularyComplexitySchema, + temperature: 0, + }); + + return { + data: response.data, + usage: response.usage, + latencyMs: response.latencyMs, + }; + } + +} + +/** + * Functional API for vocabulary evaluation + * + * @example + * ```typescript + * const result = await evaluateVocabulary( + * "The mitochondria is the powerhouse of the cell.", + * "3", + * { + * googleApiKey: process.env.GOOGLE_API_KEY, + * openaiApiKey: process.env.OPENAI_API_KEY + * } + * ); + * ``` + */ +export async function evaluateVocabulary( + text: string, + grade: string, + config: VocabularyEvaluatorConfig +): Promise> { + const evaluator = new VocabularyEvaluator(config); + return evaluator.evaluate(text, grade); +} diff --git a/sdks/typescript/src/features/index.ts b/sdks/typescript/src/features/index.ts new file mode 100644 index 0000000..354830e --- /dev/null +++ b/sdks/typescript/src/features/index.ts @@ -0,0 +1,5 @@ +export { + calculateFleschKincaidGrade, + calculateReadabilityMetrics, + type ReadabilityMetrics, +} from './readability.js'; diff --git a/sdks/typescript/src/features/readability.ts b/sdks/typescript/src/features/readability.ts new file mode 100644 index 0000000..a744cb5 --- /dev/null +++ b/sdks/typescript/src/features/readability.ts @@ -0,0 +1,49 @@ +import nlp from 'compromise'; +import { syllable } from 'syllable'; + +/** + * Calculate Flesch-Kincaid Grade Level + * Equivalent to Python's textstat.flesch_kincaid_grade() + */ +export function calculateFleschKincaidGrade(text: string): number { + return calculateReadabilityMetrics(text).fleschKincaidGrade; +} + +/** + * Additional readability metrics + */ +export interface ReadabilityMetrics { + sentenceCount: number; + wordCount: number; + characterCount: number; + 
syllableCount: number; + avgWordsPerSentence: number; + avgSyllablesPerWord: number; + fleschKincaidGrade: number; +} + +export function calculateReadabilityMetrics(text: string): ReadabilityMetrics { + const doc = nlp(text); + + const sentences = doc.sentences().length; + const terms = doc.terms(); + const words = terms.length; + const characters = text.replace(/\s/g, '').length; + + const allWords = terms.out('array'); + const totalSyllables = allWords.reduce((sum: number, word: string) => sum + syllable(word), 0); + + const avgWordsPerSentence = sentences > 0 ? words / sentences : 0; + const avgSyllablesPerWord = words > 0 ? totalSyllables / words : 0; + const fkGrade = 0.39 * avgWordsPerSentence + 11.8 * avgSyllablesPerWord - 15.59; + + return { + sentenceCount: sentences, + wordCount: words, + characterCount: characters, + syllableCount: totalSyllables, + avgWordsPerSentence, + avgSyllablesPerWord, + fleschKincaidGrade: Math.round(Math.max(0, fkGrade) * 100) / 100, + }; +} diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index cb0ff5c..16ac0e8 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -1 +1,55 @@ -export {}; +// Core types and schemas +export type { + EvaluationResult, + EvaluationMetadata, + BatchEvaluationResult, + BatchSummary, + EvaluationError, +} from './schemas/index.js'; + +// Error types +export { + EvaluatorError, + ConfigurationError, + ValidationError, + APIError, + AuthenticationError, + RateLimitError, + NetworkError, + TimeoutError, +} from './errors.js'; + +// Logger +export type { Logger, LogContext } from './logger.js'; +export { LogLevel } from './logger.js'; + +// Provider types (for implementing custom providers) +export type { + LLMProvider, + LLMRequest, + LLMResponse, + TextGenerationResponse, + Message, + ProviderConfig, +} from './providers/index.js'; + +// Vocabulary exports +export type { + VocabularyComplexity, + VocabularyComplexityLevel, +} from 
'./schemas/vocabulary.js'; + +export { + VocabularyEvaluator, + evaluateVocabulary, + type VocabularyEvaluatorConfig, + type BaseEvaluatorConfig, + type TelemetryOptions, +} from './evaluators/index.js'; + +// Features +export { + calculateFleschKincaidGrade, + calculateReadabilityMetrics, + type ReadabilityMetrics, +} from './features/index.js'; diff --git a/sdks/typescript/src/logger.ts b/sdks/typescript/src/logger.ts new file mode 100644 index 0000000..bdc5a1b --- /dev/null +++ b/sdks/typescript/src/logger.ts @@ -0,0 +1,159 @@ +/** + * Logging interface for the Evaluators SDK + * + * Provides structured logging with verbosity levels. + * Users can inject custom loggers or use the default console logger. + */ + +/** + * Log levels in order of verbosity + */ +export enum LogLevel { + /** Debug messages - very verbose, for development */ + DEBUG = 0, + /** Informational messages - normal operations */ + INFO = 1, + /** Warning messages - potentially problematic situations */ + WARN = 2, + /** Error messages - errors that need attention */ + ERROR = 3, + /** Silent - no logging */ + SILENT = 4, +} + +/** + * Context object for structured logging + */ +export interface LogContext { + /** Evaluator type (vocabulary, sentence-structure, etc.) */ + evaluator?: string; + /** Current operation or stage */ + operation?: string; + /** Error object if applicable */ + error?: Error; + /** Additional metadata */ + [key: string]: unknown; +} + +/** + * Logger interface + * + * Implement this interface to provide custom logging behavior. 
+ * + * @example + * ```typescript + * const customLogger: Logger = { + * debug: (msg, ctx) => myLogger.debug(msg, ctx), + * info: (msg, ctx) => myLogger.info(msg, ctx), + * warn: (msg, ctx) => myLogger.warn(msg, ctx), + * error: (msg, ctx) => myLogger.error(msg, ctx), + * }; + * + * const evaluator = new VocabularyEvaluator({ + * googleApiKey: '...', + * openaiApiKey: '...', + * logger: customLogger, + * logLevel: LogLevel.INFO, + * }); + * ``` + */ +export interface Logger { + /** + * Log debug message + * Used for detailed debugging information + */ + debug(message: string, context?: LogContext): void; + + /** + * Log informational message + * Used for normal operations + */ + info(message: string, context?: LogContext): void; + + /** + * Log warning message + * Used for potentially problematic situations + */ + warn(message: string, context?: LogContext): void; + + /** + * Log error message + * Used for errors that need attention + */ + error(message: string, context?: LogContext): void; +} + +/** + * Default console logger implementation + */ +class ConsoleLogger implements Logger { + constructor(private level: LogLevel = LogLevel.WARN) {} + + debug(message: string, context?: LogContext): void { + if (this.level <= LogLevel.DEBUG) { + console.debug(`[DEBUG] ${message}`, context || ''); + } + } + + info(message: string, context?: LogContext): void { + if (this.level <= LogLevel.INFO) { + console.info(`[INFO] ${message}`, context || ''); + } + } + + warn(message: string, context?: LogContext): void { + if (this.level <= LogLevel.WARN) { + console.warn(`[WARN] ${message}`, context || ''); + } + } + + error(message: string, context?: LogContext): void { + if (this.level <= LogLevel.ERROR) { + console.error(`[ERROR] ${message}`, context || ''); + } + } +} + +/** + * Silent logger - logs nothing + */ +class SilentLogger implements Logger { + debug(): void {} + info(): void {} + warn(): void {} + error(): void {} +} + +/** + * Create a logger instance + * + * @param 
customLogger - Optional custom logger implementation + * @param level - Log level (default: WARN) + * @returns Logger instance + */ +export function createLogger(customLogger?: Logger, level: LogLevel = LogLevel.WARN): Logger { + // Use custom logger if provided + if (customLogger) { + return customLogger; + } + + // Use silent logger if level is SILENT + if (level === LogLevel.SILENT) { + return new SilentLogger(); + } + + // Use console logger with specified level + return new ConsoleLogger(level); +} + +/** + * Format error for logging + * + * @internal + */ +export function formatError(error: unknown): string { + if (error instanceof Error) { + return `${error.name}: ${error.message}`; + } + return String(error); +} diff --git a/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts new file mode 100644 index 0000000..52309f1 --- /dev/null +++ b/sdks/typescript/src/prompts/vocabulary/background-knowledge.ts @@ -0,0 +1,10 @@ +import BACKGROUND_KNOWLEDGE_TEMPLATE from '../../../../../evals/prompts/vocabulary/background-knowledge.txt'; + +/** + * Generate the background knowledge prompt for a given text and grade level + */ +export function getBackgroundKnowledgePrompt(text: string, grade: string): string { + return BACKGROUND_KNOWLEDGE_TEMPLATE + .replaceAll('{grade}', grade) + .replaceAll('{text}', text); +} diff --git a/sdks/typescript/src/prompts/vocabulary/index.ts b/sdks/typescript/src/prompts/vocabulary/index.ts new file mode 100644 index 0000000..47ed85a --- /dev/null +++ b/sdks/typescript/src/prompts/vocabulary/index.ts @@ -0,0 +1,3 @@ +export { getBackgroundKnowledgePrompt } from './background-knowledge.js'; +export { getSystemPrompt } from './system.js'; +export { getUserPrompt } from './user.js'; diff --git a/sdks/typescript/src/prompts/vocabulary/system.ts b/sdks/typescript/src/prompts/vocabulary/system.ts new file mode 100644 index 0000000..81dde16 --- /dev/null +++ 
b/sdks/typescript/src/prompts/vocabulary/system.ts @@ -0,0 +1,17 @@ +import SYSTEM_PROMPT_GRADES_3_4 from '../../../../../evals/prompts/vocabulary/grades-3-4-system.txt'; +import SYSTEM_PROMPT_OTHER_GRADES from '../../../../../evals/prompts/vocabulary/other-grades-system.txt'; + +/** + * Get the appropriate system prompt based on grade level + * @param grade - The target grade level (3-12) + * @returns The system prompt for the grade level + */ +export function getSystemPrompt(grade: string): string { + // Grades 3-4 use the GRADES_3_4 prompt + if (grade === '3' || grade === '4') { + return SYSTEM_PROMPT_GRADES_3_4; + } + + // All other grades (5-12) use OTHER_GRADES prompt + return SYSTEM_PROMPT_OTHER_GRADES; +} diff --git a/sdks/typescript/src/prompts/vocabulary/user.ts b/sdks/typescript/src/prompts/vocabulary/user.ts new file mode 100644 index 0000000..75e56b0 --- /dev/null +++ b/sdks/typescript/src/prompts/vocabulary/user.ts @@ -0,0 +1,28 @@ +import USER_PROMPT_TEMPLATE_GRADES_3_4 from '../../../../../evals/prompts/vocabulary/grades-3-4-user.txt'; +import USER_PROMPT_TEMPLATE_OTHER_GRADES from '../../../../../evals/prompts/vocabulary/other-grades-user.txt'; + +/** + * Generate the user prompt for vocabulary complexity evaluation + * @param text - The text to evaluate + * @param studentGradeLevel - The student's grade level + * @param studentBackgroundKnowledge - Background knowledge assumption + * @param fkLevel - Flesch-Kincaid grade level + * @returns The formatted user prompt + */ +export function getUserPrompt( + text: string, + studentGradeLevel: string, + studentBackgroundKnowledge: string, + fkLevel: number +): string { + // Select the appropriate template based on grade + const template = studentGradeLevel === '3' || studentGradeLevel === '4' + ? 
USER_PROMPT_TEMPLATE_GRADES_3_4 + : USER_PROMPT_TEMPLATE_OTHER_GRADES; + + return template + .replaceAll('{student_grade_level}', studentGradeLevel) + .replaceAll('{student_background_knowledge}', studentBackgroundKnowledge) + .replaceAll('{fk_level}', fkLevel.toString()) + .replaceAll('{text}', text); +} diff --git a/sdks/typescript/src/providers/ai-sdk-provider.ts b/sdks/typescript/src/providers/ai-sdk-provider.ts new file mode 100644 index 0000000..e482f35 --- /dev/null +++ b/sdks/typescript/src/providers/ai-sdk-provider.ts @@ -0,0 +1,144 @@ +import { generateText as aiGenerateText, Output } from 'ai'; +import type { + LLMProvider, + LLMRequest, + LLMResponse, + Message, + ProviderConfig, +} from './base.js'; + +/** + * Default models for each provider based on Python implementation + */ +const DEFAULT_MODELS = { + openai: 'gpt-4o', + anthropic: 'claude-sonnet-4-5-20250929', + google: 'gemini-2.5-pro', +} as const; + +/** + * Vercel AI SDK provider implementation + * Supports OpenAI, Anthropic, and Google Gemini + */ +export class VercelAIProvider implements LLMProvider { + constructor(private config: ProviderConfig) { + if (config.type === 'custom') { + throw new Error( + 'VercelAIProvider does not support custom type. Use config.customProvider directly.' + ); + } + } + + /** + * Generate structured output using Vercel AI SDK's generateText with output + */ + async generateStructured(request: LLMRequest): Promise> { + const model = await this.getModel(request.model); + const startTime = Date.now(); + + const { output, usage } = await aiGenerateText({ + model, + messages: request.messages, + output: Output.object({ schema: request.schema }), + temperature: request.temperature ?? 0, + maxRetries: this.config.maxRetries ?? 0, + ...(request.maxTokens !== undefined ? 
{ maxTokens: request.maxTokens } : {}), + }); + + return { + data: output as T, + model: request.model || this.getDefaultModel(), + usage: { + inputTokens: usage.inputTokens || 0, + outputTokens: usage.outputTokens || 0, + }, + latencyMs: Date.now() - startTime, + }; + } + + /** + * Generate plain text using Vercel AI SDK's generateText + */ + async generateText(messages: Message[], temperature?: number): Promise { + const model = await this.getModel(); + const startTime = Date.now(); + + const { text, usage } = await aiGenerateText({ + model, + messages, + temperature: temperature ?? this.config.temperature ?? 0, + maxRetries: this.config.maxRetries ?? 0, + }); + + return { + text, + usage: { + inputTokens: usage.inputTokens || 0, + outputTokens: usage.outputTokens || 0, + }, + latencyMs: Date.now() - startTime, + }; + } + + /** + * Get the configured language model. + * Uses dynamic imports so consumers only need to install the provider packages they use. + */ + private async getModel(requestModel?: string) { + const modelId = requestModel || this.config.model || this.getDefaultModel(); + const apiKey = this.config.apiKey; + + switch (this.config.type) { + case 'openai': { + const { createOpenAI } = await import('@ai-sdk/openai').catch(() => { + throw new Error( + 'To use the OpenAI provider, install its adapter: npm install @ai-sdk/openai' + ); + }); + return createOpenAI(apiKey ? { apiKey } : {})(modelId); + } + case 'anthropic': { + const { createAnthropic } = await import('@ai-sdk/anthropic').catch(() => { + throw new Error( + 'To use the Anthropic provider, install its adapter: npm install @ai-sdk/anthropic' + ); + }); + return createAnthropic(apiKey ? { apiKey } : {})(modelId); + } + case 'google': { + const { createGoogleGenerativeAI } = await import('@ai-sdk/google').catch(() => { + throw new Error( + 'To use the Google provider, install its adapter: npm install @ai-sdk/google' + ); + }); + return createGoogleGenerativeAI(apiKey ? 
{ apiKey } : {})(modelId); + } + default: + throw new Error(`Unsupported provider type: ${this.config.type}`); + } + } + + /** + * Get default model for the configured provider + */ + private getDefaultModel(): string { + const providerType = this.config.type; + + if (providerType === 'custom') { + throw new Error('Cannot get default model for custom provider type'); + } + + return DEFAULT_MODELS[providerType]; + } +} + +/** + * Factory function to create a provider instance + */ +export function createProvider(config: ProviderConfig): LLMProvider { + if (config.type === 'custom' && config.customProvider) { + return config.customProvider; + } + + return new VercelAIProvider(config); +} diff --git a/sdks/typescript/src/providers/base.ts b/sdks/typescript/src/providers/base.ts new file mode 100644 index 0000000..5b6dee9 --- /dev/null +++ b/sdks/typescript/src/providers/base.ts @@ -0,0 +1,73 @@ +import type { z } from 'zod'; + +/** + * Message format for LLM conversations + */ +export interface Message { + role: 'system' | 'user' | 'assistant'; + content: string; +} + +/** + * Request configuration for structured LLM generation + */ +export interface LLMRequest { + messages: Message[]; + schema: z.ZodSchema; + temperature?: number; + maxTokens?: number; + model?: string; +} + +/** + * Response from LLM with usage metadata + */ +export interface LLMResponse { + data: T; + model: string; + usage: { + inputTokens: number; + outputTokens: number; + }; + latencyMs: number; +} + +/** + * Response from plain text generation + */ +export interface TextGenerationResponse { + text: string; + usage: { + inputTokens: number; + outputTokens: number; + }; + latencyMs: number; +} + +/** + * Base interface for LLM provider implementations + */ +export interface LLMProvider { + /** + * Generate structured output from LLM using Zod schema + */ + generateStructured(request: LLMRequest): Promise>; + + /** + * Generate plain text from LLM + */ + generateText(messages: Message[], 
temperature?: number): Promise; +} + +/** + * Configuration for LLM provider + */ +export interface ProviderConfig { + type: 'openai' | 'anthropic' | 'google' | 'custom'; + apiKey?: string; + model?: string; + temperature?: number; + baseURL?: string; + customProvider?: LLMProvider; + maxRetries?: number; +} diff --git a/sdks/typescript/src/providers/index.ts b/sdks/typescript/src/providers/index.ts new file mode 100644 index 0000000..f32e5e3 --- /dev/null +++ b/sdks/typescript/src/providers/index.ts @@ -0,0 +1,10 @@ +export type { + LLMProvider, + LLMRequest, + LLMResponse, + TextGenerationResponse, + Message, + ProviderConfig, +} from './base.js'; + +export { VercelAIProvider, createProvider } from './ai-sdk-provider.js'; diff --git a/sdks/typescript/src/schemas/index.ts b/sdks/typescript/src/schemas/index.ts new file mode 100644 index 0000000..ded6c72 --- /dev/null +++ b/sdks/typescript/src/schemas/index.ts @@ -0,0 +1,10 @@ +export { + ComplexityLevel, + GradeLevel, + type EvaluationResult, + type EvaluationMetadata, + type BatchEvaluationResult, + type BatchSummary, + type EvaluationError, +} from './outputs.js'; + diff --git a/sdks/typescript/src/schemas/outputs.ts b/sdks/typescript/src/schemas/outputs.ts new file mode 100644 index 0000000..9ab807e --- /dev/null +++ b/sdks/typescript/src/schemas/outputs.ts @@ -0,0 +1,75 @@ +import { z } from 'zod'; + +/** + * Complexity levels for sentence structure evaluation + */ +export const ComplexityLevel = z.enum([ + 'Slightly Complex', + 'Moderately Complex', + 'Very Complex', + 'Exceedingly Complex', +]); + +export type ComplexityLevel = z.infer; + +/** + * Grade levels for vocabulary evaluation + */ +export const GradeLevel = z.enum([ + 'Below Grade Level', + 'At Grade Level', + 'Above Grade Level', +]); + +export type GradeLevel = z.infer; + +/** + * Metadata attached to all evaluation results + */ +export interface EvaluationMetadata { + evaluatorVersion?: string; + promptVersion: string; + model: string; + 
timestamp: Date; + processingTimeMs: number; +} + +/** + * Base evaluation result structure + */ +export interface EvaluationResult { + score: TScore; + reasoning: string; + metadata: EvaluationMetadata; + _internal?: TInternal; +} + +/** + * Batch evaluation summary statistics + */ +export interface BatchSummary { + total: number; + successful: number; + failed: number; + averageProcessingTimeMs: number; +} + +/** + * Error type for failed evaluations + */ +export interface EvaluationError { + error: string; + input: { + text: string; + grade?: string; + }; + timestamp: Date; +} + +/** + * Batch evaluation result + */ +export interface BatchEvaluationResult { + results: Array; + summary: BatchSummary; +} diff --git a/sdks/typescript/src/schemas/vocabulary.ts b/sdks/typescript/src/schemas/vocabulary.ts new file mode 100644 index 0000000..f5b80d0 --- /dev/null +++ b/sdks/typescript/src/schemas/vocabulary.ts @@ -0,0 +1,39 @@ +import { z } from 'zod'; + +/** + * Vocabulary complexity levels matching Qual Text Complexity rubric (SAP) + */ +export const VocabularyComplexityLevel = z.enum([ + 'slightly complex', + 'moderately complex', + 'very complex', + 'exceedingly complex', +]); + +export type VocabularyComplexityLevel = z.infer; + +/** + * Vocabulary complexity evaluation output + * Ported from Python Output BaseModel + */ +export const VocabularyComplexitySchema = z.object({ + tier_2_words: z.string().describe('List of Tier 2 words (academic words)'), + tier_3_words: z.string().describe('List of Tier 3 words (domain-specific)'), + archaic_words: z.string().describe('List of Archaic words'), + other_complex_words: z.string().describe('List of Other Complex words'), + complexity_score: VocabularyComplexityLevel.describe( + 'The complexity of the text vocabulary' + ), + reasoning: z.string().describe('Detailed reasoning for the complexity rating'), +}); + +export type VocabularyComplexity = z.infer; + +/** + * Background knowledge assumption for a student at a given 
grade level + * This is generated in Stage 1 and used as input for Stage 2 + */ +export interface BackgroundKnowledge { + assumption: string; + grade: string; +} diff --git a/sdks/typescript/src/telemetry/client.ts b/sdks/typescript/src/telemetry/client.ts new file mode 100644 index 0000000..d4db550 --- /dev/null +++ b/sdks/typescript/src/telemetry/client.ts @@ -0,0 +1,64 @@ +import type { TelemetryConfig, TelemetryEvent } from './types.js'; +import type { Logger } from '../logger.js'; + +/** + * Telemetry client for sending analytics events + * + * Fire-and-forget implementation that never blocks SDK operations. + * Errors are logged but don't fail evaluations. + */ +export class TelemetryClient { + private config: TelemetryConfig; + private logger: Logger; + + constructor(config: TelemetryConfig) { + this.config = config; + this.logger = config.logger; + } + + /** + * Send telemetry event to analytics service + * + * Fire-and-forget: Errors are logged but don't throw. + */ + async send(event: TelemetryEvent): Promise { + // Skip if telemetry disabled + if (!this.config.enabled) { + return; + } + + try { + const headers: Record = { + 'Content-Type': 'application/json', + 'X-Client-ID': this.config.clientId, + }; + + // Add partner key if provided + if (this.config.partnerKey) { + headers['X-API-Key'] = this.config.partnerKey; + } + + const response = await fetch(this.config.endpoint, { + method: 'POST', + headers, + body: JSON.stringify(event), + // Don't block SDK operations on slow networks + signal: AbortSignal.timeout(5000), // 5 second timeout + }); + + if (!response.ok) { + this.logger.warn( + `[Telemetry] Failed to send event: ${response.status} ${response.statusText}` + ); + } + } catch (error) { + // Log error but never throw (fire-and-forget) + if (error instanceof Error) { + // Don't log timeout errors (expected on slow networks) + if (error.name !== 'TimeoutError' && error.name !== 'AbortError') { + this.logger.warn(`[Telemetry] Error sending event: 
${error.message}`); + } + } + } + } +} diff --git a/sdks/typescript/src/telemetry/index.ts b/sdks/typescript/src/telemetry/index.ts new file mode 100644 index 0000000..ae1cbb7 --- /dev/null +++ b/sdks/typescript/src/telemetry/index.ts @@ -0,0 +1,10 @@ +export { TelemetryClient } from './client.js'; +export { generateClientId, getSDKVersion } from './utils.js'; +export type { + TelemetryConfig, + TelemetryEvent, + EvaluationStatus, + TokenUsage, + StageDetail, + TelemetryMetadata, +} from './types.js'; diff --git a/sdks/typescript/src/telemetry/types.ts b/sdks/typescript/src/telemetry/types.ts new file mode 100644 index 0000000..31b2920 --- /dev/null +++ b/sdks/typescript/src/telemetry/types.ts @@ -0,0 +1,92 @@ +// TODO: Generate these types from the telemetry service OpenAPI/JSON Schema +// instead of maintaining them manually. This will prevent drift between +// client and server schemas. + +/** + * Evaluation status + */ +export type EvaluationStatus = 'success' | 'error'; + +/** + * Token usage metrics from LLM providers + */ +export interface TokenUsage { + input_tokens: number; + output_tokens: number; +} + +/** + * Per-stage details for multi-stage evaluations + */ +export interface StageDetail { + /** Stage name (e.g., "background_knowledge", "complexity_evaluation") */ + stage: string; + + /** Provider used for this stage (e.g., "openai:gpt-4o") */ + provider: string; + + /** Total latency including all retries (ms) */ + latency_ms: number; + + /** Token usage aggregated across all attempts */ + token_usage?: TokenUsage; + + /** + * Whether schema validation failed (indicates prompt needs clearer instructions) + * + * TODO: Not currently tracked. Vercel AI SDK abstracts validation away. + * To implement: Add custom retry wrapper that catches validation errors. 
+ */ + schema_validation_failed?: boolean; +} + +/** + * Extensible metadata for telemetry events + */ +export interface TelemetryMetadata { + /** Detailed breakdown by stage (for multi-stage evaluations) */ + stage_details?: StageDetail[]; + + // Future fields can be added here: + // cache_hit?: boolean; + // prompt_tokens_breakdown?: {...}; + // etc. +} + +/** + * Telemetry event payload + */ +export interface TelemetryEvent { + timestamp: string; + sdk_version: string; + evaluator_type: string; + grade?: string; + status: EvaluationStatus; + error_code?: string; + latency_ms: number; + text_length_chars: number; + provider: string; // Format: "provider:model" or "provider1+provider2" for multi-provider + token_usage?: TokenUsage; // Aggregated across all stages and attempts + metadata?: TelemetryMetadata; // Optional per-stage breakdown + input_text?: string; // Input text (only if recordInputs enabled) +} + +/** + * Configuration for telemetry client + */ +export interface TelemetryConfig { + /** Analytics service endpoint URL */ + endpoint: string; + + /** Learning Commons partner key (optional, sent as X-API-Key header) */ + partnerKey?: string; + + /** Client ID for anonymous tracking (persistent UUID from ~/.config/learning-commons/config.json) */ + clientId: string; + + /** Enable telemetry (default: true) */ + enabled: boolean; + + /** Logger instance (respects the SDK's configured log level and custom logger) */ + logger: import('../logger.js').Logger; +} diff --git a/sdks/typescript/src/telemetry/utils.ts b/sdks/typescript/src/telemetry/utils.ts new file mode 100644 index 0000000..eaef3d9 --- /dev/null +++ b/sdks/typescript/src/telemetry/utils.ts @@ -0,0 +1,93 @@ +import { randomUUID } from 'node:crypto'; +import { readFileSync, writeFileSync, mkdirSync } from 'node:fs'; +import { join, dirname } from 'node:path'; +import { homedir } from 'node:os'; +import { fileURLToPath } from 'node:url'; + +const __filename = fileURLToPath(import.meta.url); +const 
__dirname = dirname(__filename); + +/** Cached client ID — populated on first call, reused for process lifetime */ +let cachedClientId: string | undefined; + +/** + * Get or create a persistent client ID for anonymous tracking. + * + * On first run, generates a UUID and tries to save it to: + * - Windows: %APPDATA%\learning-commons\config.json + * - macOS/Linux: ~/.config/learning-commons/config.json + * + * On subsequent runs, reads the saved UUID from disk. + * Falls back to an in-memory UUID (per-process) if the filesystem + * is unavailable (e.g., serverless, read-only containers). + */ +export function generateClientId(): string { + if (cachedClientId) { + return cachedClientId; + } + + const configFile = getConfigFilePath(); + + // Try to read existing client ID from disk + try { + const data = JSON.parse(readFileSync(configFile, 'utf-8')) as { + telemetry?: { clientId?: string }; + }; + if (data?.telemetry?.clientId) { + cachedClientId = data.telemetry.clientId; + return cachedClientId; + } + } catch { + // File doesn't exist yet — fall through to generate + } + + // Generate new UUID and try to persist it + const clientId = randomUUID(); + try { + mkdirSync(dirname(configFile), { recursive: true }); + writeFileSync(configFile, JSON.stringify({ telemetry: { clientId } }, null, 2)); + } catch { + // Filesystem unavailable — use in-memory UUID for this process + } + + cachedClientId = clientId; + return cachedClientId; +} + +function getConfigFilePath(): string { + const configDir = + process.platform === 'win32' + ? join(process.env.APPDATA ?? 
homedir(), 'learning-commons') + : join(homedir(), '.config', 'learning-commons'); + return join(configDir, 'config.json'); +} + +let cachedVersion: string | undefined; + +/** + * Get SDK version from package.json + */ +export function getSDKVersion(): string { + if (cachedVersion) { + return cachedVersion; + } + + const possiblePaths = [ + join(__dirname, '../../package.json'), // From src/ + join(__dirname, '../package.json'), // From dist/ + ]; + + for (const path of possiblePaths) { + try { + const pkg = JSON.parse(readFileSync(path, 'utf-8')) as { version?: string }; + cachedVersion = pkg.version || '0.0.0'; + return cachedVersion; + } catch { + continue; + } + } + + // Fallback if no package.json found + cachedVersion = '0.0.0'; + return cachedVersion; +} diff --git a/sdks/typescript/tests/README.md b/sdks/typescript/tests/README.md new file mode 100644 index 0000000..0f06c3e --- /dev/null +++ b/sdks/typescript/tests/README.md @@ -0,0 +1,221 @@ +# Test Suite + +This directory contains unit and integration tests for the Evaluators SDK. + +## Structure + +``` +tests/ +├── unit/ # Fast tests, no API calls +├── integration/ # Real API calls +└── utils/ # Shared test utilities +``` + +## Running Tests + +### Unit Tests +```bash +npm run test:unit # Fast, no API keys needed +``` + +### Integration Tests +Requires API keys in `.env`: +```bash +OPENAI_API_KEY=sk-... +GOOGLE_API_KEY=... +``` + +Run tests: +```bash +RUN_INTEGRATION_TESTS=true npm run test:integration +``` + +### All Tests +```bash +RUN_INTEGRATION_TESTS=true npm run test:all +``` + +### CI Tests +```bash +npm run test:ci # Tests built dist/ package +``` + +## Key Patterns + +### 1. Acceptable Values for LLM Non-Determinism + +LLMs are non-deterministic. 
Tests use **expected** values with **acceptable** adjacent values: + +```typescript +{ + id: 'V3', + grade: '3', + text: 'Sample text...', + expected: 'very complex', // Try to match this first + acceptable: ['moderately complex'], // Accept if no expected match +} +``` + +**Strategy:** +- Try up to 3 attempts to match expected value (short-circuit on match) +- If no expected match, check if any result is in acceptable range +- Pass test if either expected or acceptable match found + +### 2. Parallel Test Execution + +All tests run concurrently using `it.concurrent()`: + +```typescript +describeIntegration.concurrent('Test Suite', () => { + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}`, async () => { + // Test runs in parallel + }, TEST_TIMEOUT_MS); + }); +}); +``` + +**Benefits**: 3-4x faster test execution + +### 3. Buffered Logging + +Logs are buffered and printed atomically to prevent interleaving: + +```typescript +const logBuffer: string[] = []; +logBuffer.push('Test output...'); +// ... collect all logs +console.log(logBuffer.join('\n')); // Print once at end +``` + +## Writing New Integration Tests + +### Basic Template + +```typescript +import { describe, it, expect, beforeAll } from 'vitest'; +import { MyEvaluator } from '../../src/evaluators/my-evaluator.js'; +import { runEvaluatorTest, type BaseTestCase } from '../utils/index.js'; +import { config } from 'dotenv'; + +config(); + +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && !process.env.MY_API_KEY; +const describeIntegration = SKIP_INTEGRATION ? 
describe.skip : describe; +const TEST_TIMEOUT_MS = 2 * 60 * 1000; // 2 minutes + +const TEST_CASES: BaseTestCase[] = [ + { + id: 'TEST1', + grade: '3', // Optional, if evaluator needs it + text: 'Sample text...', + expected: 'expected result', + acceptable: ['acceptable alternative'], + }, +]; + +describeIntegration.concurrent('My Evaluator - Test Suite', () => { + let evaluator: MyEvaluator; + + beforeAll(() => { + if (SKIP_INTEGRATION) { + console.log('⏭️ Skipping integration tests'); + return; + } + + evaluator = new MyEvaluator({ + partnerKey: process.env.MY_PARTNER_KEY!, + retry: false, // We handle retries in test logic + }); + }); + + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}: ${testCase.expected}`, async () => { + const logBuffer: string[] = []; + + logBuffer.push('\n' + '='.repeat(80)); + logBuffer.push(`Test Case ${testCase.id}`); + logBuffer.push('='.repeat(80)); + + const maxAttempts = 3; + const result = await runEvaluatorTest(testCase, { + evaluator, + extractResult: (r) => r.score, // Extract the field to compare + maxAttempts, + }); + + logBuffer.push(...result.logs); + console.log(logBuffer.join('\n')); + + expect(result.matched).toBe(true); + expect(result.matchedOnAttempt).toBeLessThanOrEqual(maxAttempts); + }, TEST_TIMEOUT_MS); + }); +}); +``` + +### Test Configuration + +```typescript +// Test timeout (2 minutes per test) +const TEST_TIMEOUT_MS = 2 * 60 * 1000; + +// Max retry attempts +const maxAttempts = 3; + +// Skip integration tests if no API keys +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && !process.env.API_KEY; +``` + +## Test Utilities + +### `runEvaluatorTest(testCase, config)` + +Generic test runner for all evaluators: + +```typescript +const result = await runEvaluatorTest(testCase, { + evaluator: myEvaluator, + extractResult: (r) => r.score, // How to extract result from evaluation + maxAttempts: 3, // Default: 3 +}); + +// Result structure +interface TestResult { + matched: boolean; // 
Did test pass? + matchedOnAttempt?: number; // Which attempt matched? + matchType?: 'expected' | 'acceptable'; // How did it match? + totalAttempts: number; + allResults: string[]; // All attempt results + logs: string[]; // Buffered log messages +} +``` + +## Test Strategy + +### Local Development +Tests run against `src/` with prompts copied from `../../evals/prompts/`: +```bash +npm run test:unit +npm run test:integration +``` + +### CI/CD +Tests run against built `dist/` package to validate published code: +```bash +npm run test:ci +``` + +## Troubleshooting + +**Tests skipped?** +- Check API keys: `echo $OPENAI_API_KEY` +- Set: `RUN_INTEGRATION_TESTS=true npm run test:integration` + +**Tests timeout?** +- Increase `TEST_TIMEOUT_MS = 3 * 60 * 1000` (3 minutes) + +**Tests flaky?** +- Add more acceptable values based on actual LLM output +- Increase `maxAttempts` from 3 to 5 +- Check if test case is ambiguous diff --git a/sdks/typescript/tests/integration/vocabulary.integration.test.ts b/sdks/typescript/tests/integration/vocabulary.integration.test.ts new file mode 100644 index 0000000..49b4662 --- /dev/null +++ b/sdks/typescript/tests/integration/vocabulary.integration.test.ts @@ -0,0 +1,141 @@ +import { describe, it, expect, beforeAll } from 'vitest'; +import { VocabularyEvaluator } from '../../src/evaluators/vocabulary.js'; +import { + runEvaluatorTest, + type BaseTestCase, +} from '../utils/index.js'; + +/** + * Vocabulary Evaluator Integration Tests + * + * Test cases cover grades 3-9 with varying complexity levels. + * + * Each test uses a retry mechanism (up to 3 attempts) to account for LLM non-determinism, + * with short-circuiting on first expected match. If no expected match is found after all + * attempts, the test checks if any result falls within the acceptable value range. 
+ * + * To run these tests: + * ```bash + * RUN_INTEGRATION_TESTS=true npm run test:integration + * ``` + */ + +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && + (!process.env.OPENAI_API_KEY || !process.env.GOOGLE_API_KEY); + +const describeIntegration = SKIP_INTEGRATION ? describe.skip : describe; + +// Test timeout: 2 minutes per test case (allows for 3 attempts with API latency) +const TEST_TIMEOUT_MS = 2 * 60 * 1000; + +// Test cases from PR #6 +const TEST_CASES: BaseTestCase[] = [ + { + id: 'V3', + grade: '3', + text: 'Civil rights are rights that all people in a country have. The civil rights of a country apply to all the citizens within its borders. These rights are given by the laws of the country. Civil rights are sometimes thought to be the same as natural rights. In many countries civil rights include freedom of speech, freedom of the press, freedom of religion, and freedom of assembly. Civil rights also include the right to own property and the right to get fair and equal treatment from the government, from other citizens, and from private groups.', + expected: 'very complex', + acceptable: ['moderately complex', 'exceedingly complex'], + }, + { + id: 'V4', + grade: '4', + text: 'Bluetooth is a protocol for wireless communication over short distances. It was developed in the 1990s, to reduce the number of cables. Devices such as mobile phones, laptops, PCs, printers, digital cameras and video game consoles can connect to each other, and exchange information. This is done using radio waves. It can be done securely. Bluetooth is only used for relatively short distances, like a few metres. There are different standards. Data rates vary. Currently, they are at 1-3 MBit per second.', + expected: 'exceedingly complex', + acceptable: ['very complex'], + }, + { + id: 'V5', + grade: '5', + text: `The scientific method is a way to learn about the world around us. It helps us figure out how things work. 
Scientists use the scientific method to test their ideas. They start by making observations and asking questions. Then, they make a guess, or a hypothesis, about what might be the answer. They use their hypothesis to make predictions about what will happen in an experiment. Scientists then test their predictions by doing experiments. If the results of the experiment match their predictions, then their hypothesis is supported. If the results don't match, then they need to change their hypothesis. Scientists repeat this process many times to make sure their hypothesis is correct. The scientific method is important because it helps us learn new things. It helps us understand the world around us. Scientists use the scientific method to make new discoveries and solve problems.`, + expected: 'slightly complex', + acceptable: ['moderately complex'], + }, + { + id: 'V6', + grade: '6', + text: `Chicago in 1871 was a city ready to burn. The city boasted having 59,500 buildings, many of them—such as the Courthouse and the Tribune Building—large and ornately decorated. The trouble was that about two-thirds of all these structures were made entirely of wood. Many of the remaining buildings (even the ones proclaimed to be 'fireproof') looked solid, but were actually jerrybuilt affairs; the stone or brick exteriors hid wooden frames and floors, all topped with highly flammable tar or shingle roofs. It was also a common practice to disguise wood as another kind of building material. The fancy exterior decorations on just about every building were carved from wood, then painted to look like stone or marble.`, + expected: 'very complex', + acceptable: ['moderately complex', 'exceedingly complex'], + }, + { + id: 'V7', + grade: '7', + text: `The scientific method is a way of learning about the world around us. It's a process that helps us understand how things work and why they happen. 
It's not just for scientists; we all use the scientific method in our everyday lives, even if we don't realize it. The scientific method starts with an observation. We notice something interesting and want to know more about it. For example, you might notice that your plant is wilting. You might wonder why this is happening. Next, we form a hypothesis, which is a possible explanation for our observation. In our plant example, you might hypothesize that the plant is wilting because it needs more water. Then, we test our hypothesis by doing an experiment. We change something in our experiment to see if it affects the outcome. In our plant example, you could water the plant and see if it recovers. Based on the results of our experiment, we can either support or reject our hypothesis. If the plant recovers after being watered, then your hypothesis is supported. If the plant doesn't recover, then you need to come up with a new hypothesis. The scientific method is a powerful tool for learning and understanding the world around us. It's a process of asking questions, testing ideas, and drawing conclusions based on evidence. It's a way of thinking that helps us to be curious, to be critical, and to be open to new ideas.`, + expected: 'slightly complex', + acceptable: ['moderately complex'], + }, + { + id: 'V8', + grade: '8', + text: 'The American Revolution was a war for independence between the thirteen American colonies and Great Britain. The war started in 1775 and ended in 1783. The colonists wanted to be free from British rule. They wanted to make their own laws and govern themselves. The colonists were angry about new taxes that the British Parliament imposed on them. They felt that they were being taxed without having a say in how the money was spent. The colonists also felt that the British government was not treating them fairly. The war began with the Battles of Lexington and Concord in April 1775. 
The colonists, led by General George Washington, fought against the British army. The war was long and difficult, but the colonists eventually won. The colonists won the war because they had the support of the French. The French helped the colonists by providing them with soldiers, ships, and money. The colonists also had a strong leader in George Washington. He was a skilled military leader and he inspired the colonists to fight for their freedom. The American Revolution was a turning point in history. It showed that colonies could break free from their mother countries and become independent nations. The American Revolution also inspired other revolutions around the world.', + expected: 'slightly complex', + acceptable: ['moderately complex'], + }, + { + id: 'V9', + grade: '9', + text: `Mr. President: I would like to speak briefly and simply about a serious national condition. It is a national feeling of fear and frustration that could result in national suicide and the end of everything that we Americans hold dear. It is a condition that comes from the lack of effective leadership in either the Legislative Branch or the Executive Branch of our Government. That leadership is so lacking that serious and responsible proposals are being made that national advisory commissions be appointed to provide such critically needed leadership. I speak as briefly as possible because too much harm has already been done with irresponsible words of bitterness and selfish political opportunism. I speak as briefly as possible because the issue is too great to be obscured by eloquence. I speak simply and briefly in the hope that my words will be taken to heart. I speak as a Republican. I speak as a woman. I speak as a United States Senator. I speak as an American. The United States Senate has long enjoyed worldwide respect as the greatest deliberative body in the world. 
But recently that deliberative character has too often been debased to the level of a forum of hate and character assassination sheltered by the shield of congressional immunity. It is ironical that we Senators can in debate in the Senate directly or indirectly, by any form of words, impute to any American who is not a Senator any conduct or motive unworthy or unbecoming an American—and without that non-Senator American having any legal redress against us—yet if we say the same thing in the Senate about our colleagues we can be stopped on the grounds of being out of order. It is strange that we can verbally attack anyone else without restraint and with full protection and yet we hold ourselves above the same type of criticism here on the Senate Floor. Surely the United States Senate is big enough to take self-criticism and self-appraisal. Surely we should be able to take the same kind of character attacks that we "dish out" to outsiders. I think that it is high time for the United States Senate and its members to do some soul-searching—for us to weigh our consciences—on the manner in which we are performing our duty to the people of America—on the manner in which we are using or abusing our individual powers and privileges. I think that it is high time that we remembered that we have sworn to uphold and defend the Constitution. I think that it is high time that we remembered that the Constitution, as amended, speaks not only of the freedom of speech but also of trial by jury instead of trial by accusation. Whether it be a criminal prosecution in court or a character prosecution in the Senate, there is little practical distinction when the life of a person has been ruined. 
Those of us who shout the loudest about Americanism in making character assassinations are all too frequently those who, by our own words and acts, ignore some of the basic principles of Americanism: The right to criticize; The right to hold unpopular beliefs; The right to protest; The right of independent thought. The exercise of these rights should not cost one single American citizen his reputation or his right to a livelihood nor should he be in danger of losing his reputation or livelihood merely because he happens to know someone who holds unpopular beliefs. Who of us doesn't? Otherwise none of us could call our souls our own. Otherwise thought control would have set in. The American people are sick and tired of being afraid to speak their minds lest they be politically smeared as "Communists" or "Fascists" by their opponents. Freedom of speech is not what it used to be in America. It has been so abused by some that it is not exercised by others. The American people are sick and tired of seeing innocent people smeared and guilty people whitewashed. But there have been enough proved cases, such as the Amerasia case, the Hiss case, the Coplon case, the Gold case, to cause the nationwide distrust and strong suspicion that there may be something to the unproved, sensational accusations. I doubt if the Republican Party could—simply because I don't believe the American people will uphold any political party that puts political exploitation above national interest. Surely we Republicans aren't that desperate for victory. I don't want to see the Republican Party win that way. While it might be a fleeting victory for the Republican Party, it would be a more lasting defeat for the American people. Surely it would ultimately be suicide for the Republican Party and the two-party system that has protected our American liberties from the dictatorship of a one-party system. 
As members of the Minority Party, we do not have the primary authority to formulate the policy of our Government. But we do have the responsibility of rendering constructive criticism, of clarifying issues, of allaying fears by acting as responsible citizens. As a woman, I wonder how the mothers, wives, sisters, and daughters feel about the way in which members of their families have been politically mangled in the Senate debate—and I use the word "debate" advisedly. As a United States Senator, I am not proud of the way in which the Senate has been made a publicity platform for irresponsible sensationalism. I am not proud of the reckless abandon in which unproved charges have been hurled from the side of the aisle. I am not proud of the obviously staged, undignified countercharges that have been attempted in retaliation from the other side of the aisle. I don't like the way the Senate has been made a rendezvous for vilification, for selfish political gain at the sacrifice of individual reputations and national unity. I am not proud of the way we smear outsiders from the Floor of the Senate and hide behind the cloak of congressional immunity and still place ourselves beyond criticism on the Floor of the Senate. As an American, I am shocked at the way Republicans and Democrats alike are playing directly into the Communist design of "confuse, divide, and conquer." As an American, I don't want a Democratic Administration "whitewash" or "cover-up" any more than I want a Republican smear or witch hunt. As an American, I condemn a Republican "Fascist" just as much I condemn a Democratic "Communist." I condemn a Democrat "Fascist" just as much as I condemn a Republican "Communist." They are equally dangerous to you and me and to our country. As an American, I want to see our nation recapture the strength and unity it once had when we fought the enemy instead of ourselves. It is with these thoughts that I have drafted what I call a "Declaration of Conscience." 
I am gratified that Senator Tobey, Senator Aiken, Senator Morse, Senator Ives, Senator Thye, and Senator Hendrickson have concurred in that declaration and have authorized me to announce their concurrence.`, + expected: 'very complex', + acceptable: ['moderately complex', 'exceedingly complex'], + }, +]; + +describeIntegration.concurrent('Vocabulary Evaluator - Comprehensive Test Suite', () => { + let evaluator: VocabularyEvaluator; + + beforeAll(() => { + if (SKIP_INTEGRATION) { + console.log('⏭️ Skipping integration tests (no API keys or RUN_INTEGRATION_TESTS not set)'); + return; + } + + evaluator = new VocabularyEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY!, + openaiApiKey: process.env.OPENAI_API_KEY!, + }); + + console.log('\n' + '='.repeat(80)); + console.log('VOCABULARY EVALUATOR - TEST SUITE (PARALLEL)'); + console.log('='.repeat(80)); + console.log(`Running ${TEST_CASES.length} test cases with up to 3 attempts each`); + console.log('Short-circuiting on first expected match'); + console.log('Checking acceptable values if no expected match'); + console.log('='.repeat(80)); + }); + + // Generate individual test for each case + TEST_CASES.forEach((testCase) => { + it.concurrent(`${testCase.id}: Grade ${testCase.grade} - ${testCase.expected}`, async () => { + // Buffer all logs to print atomically at the end (prevents interleaving in parallel tests) + const logBuffer: string[] = []; + + // Test header + logBuffer.push('\n' + '='.repeat(80)); + logBuffer.push(`Test Case ${testCase.id} | Grade: ${testCase.grade}`); + logBuffer.push('='.repeat(80)); + logBuffer.push(`Expected Complexity: ${testCase.expected}`); + logBuffer.push(`Text Preview: ${testCase.text.substring(0, 100)}...`); + logBuffer.push(''); + + // Run the evaluation (returns logs instead of printing) + const maxAttempts = 3; + const result = await runEvaluatorTest(testCase, { + evaluator, + extractResult: (r) => r.score, + maxAttempts, + }); + + // Add evaluation logs to buffer (includes 
detailed summary) + logBuffer.push(...result.logs); + + // Print all logs atomically at the end - single console.log to prevent interleaving + console.log(logBuffer.join('\n')); + + // Assert that we got a match within maxAttempts (expected or acceptable) + expect(result.matched).toBe(true); + expect(result.matchedOnAttempt).toBeDefined(); + expect(result.matchedOnAttempt).toBeLessThanOrEqual(maxAttempts); + }, TEST_TIMEOUT_MS); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/validation.test.ts b/sdks/typescript/tests/unit/evaluators/validation.test.ts new file mode 100644 index 0000000..74c095a --- /dev/null +++ b/sdks/typescript/tests/unit/evaluators/validation.test.ts @@ -0,0 +1,142 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { VocabularyEvaluator } from '../../../src/evaluators/vocabulary.js'; +import { VALIDATION_LIMITS } from '../../../src/evaluators/base.js'; +import { ConfigurationError } from '../../../src/errors.js'; +import type { LLMProvider } from '../../../src/providers/base.js'; + +/** + * Comprehensive validation tests for input validation + * + * Tests the base evaluator validation logic that all evaluators inherit. + * Uses VocabularyEvaluator as the test subject since it extends BaseEvaluator. + * + * All tests use mocked providers to avoid real API calls. 
+ */ + +// Mock providers +const createMockProvider = (): LLMProvider => ({ + generateStructured: vi.fn(), + generateText: vi.fn(), +}); + +// Mock the createProvider factory +vi.mock('../../../src/providers/index.js', () => ({ + createProvider: vi.fn(() => createMockProvider()), +})); + +// Mock telemetry to avoid real HTTP calls +vi.mock('../../../src/telemetry/client.js', () => { + return { + TelemetryClient: class MockTelemetryClient { + send = vi.fn().mockResolvedValue(undefined); + }, + }; +}); + +describe('Configuration Validation', () => { + it('should throw ConfigurationError when googleApiKey is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: '', + openaiApiKey: 'test-openai-key', + })).toThrow(ConfigurationError); + }); + + it('should throw ConfigurationError when openaiApiKey is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: '', + })).toThrow(ConfigurationError); + }); +}); + +describe('Input Validation - Text Validation', () => { + let evaluator: VocabularyEvaluator; + + beforeEach(() => { + vi.clearAllMocks(); + + evaluator = new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: 'test-openai-key', + telemetry: false, + }); + }); + + describe('Empty text validation', () => { + it.each([ + ['empty string', ''], + ['spaces only', ' '], + ['tabs only', '\t\t\t'], + ['newlines only', '\n\n\n'], + ['mixed whitespace', ' \t\n '], + ])('should reject %s', async (_label, text) => { + await expect(evaluator.evaluate(text, '5')) + .rejects.toThrow('Text cannot be empty or contain only whitespace'); + }); + }); + + describe('Minimum length validation', () => { + it(`should reject text shorter than ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters`, async () => { + const shortText = 'Hello wo'; // 8 chars after trim + await expect(evaluator.evaluate(shortText, '5')) + .rejects.toThrow(`Text is too short. 
Minimum length is ${VALIDATION_LIMITS.MIN_TEXT_LENGTH} characters, received 8 characters`); + }); + }); + + describe('Maximum length validation', () => { + it(`should reject text longer than ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters`, async () => { + const longText = 'a'.repeat(VALIDATION_LIMITS.MAX_TEXT_LENGTH + 1); + + await expect(evaluator.evaluate(longText, '5')) + .rejects.toThrow(new RegExp(`Text is too long\\. Maximum length is ${VALIDATION_LIMITS.MAX_TEXT_LENGTH.toLocaleString()} characters, received ${(VALIDATION_LIMITS.MAX_TEXT_LENGTH + 1).toLocaleString()} characters`)); + }); + }); +}); + +describe('Input Validation - Grade Validation', () => { + let evaluator: VocabularyEvaluator; + + beforeEach(() => { + vi.clearAllMocks(); + + evaluator = new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: 'test-openai-key', + telemetry: false, + }); + }); + + describe('Valid grade range', () => { + it.each([ + ['K', 'K'], + ['1', '1'], + ['2', '2'], + ])('should reject grade %s (below minimum)', async (_label, grade) => { + const validText = 'This is a sample text for testing.'; + + await expect(evaluator.evaluate(validText, grade)) + .rejects.toThrow(`Invalid grade "${grade}". Supported grades for this evaluator: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12`); + }); + + it.each([ + ['13', '13'], + ['99', '99'], + ])('should reject grade %s (above maximum)', async (_label, grade) => { + const validText = 'This is a sample text for testing.'; + + await expect(evaluator.evaluate(validText, grade)) + .rejects.toThrow(`Invalid grade "${grade}". 
Supported grades for this evaluator: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12`); + }); + + it.each([ + ['invalid', 'invalid'], + ['grade5', 'grade5'], + ['empty string', ''], + ])('should reject grade %s (invalid format)', async (_label, grade) => { + const validText = 'This is a sample text for testing.'; + + await expect(evaluator.evaluate(validText, grade)) + .rejects.toThrow(`Invalid grade "${grade}". Supported grades for this evaluator: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12`); + }); + }); +}); diff --git a/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts new file mode 100644 index 0000000..2ce906a --- /dev/null +++ b/sdks/typescript/tests/unit/evaluators/vocabulary.test.ts @@ -0,0 +1,250 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { VocabularyEvaluator } from '../../../src/evaluators/vocabulary.js'; +import type { LLMProvider } from '../../../src/providers/base.js'; + +/** + * Comprehensive unit tests for VocabularyEvaluator + * + * These tests verify: + * - Constructor validation + * - Successful evaluation flow (both stages) + * - Error handling (LLM failures, validation errors) + * - Telemetry behavior (success/error cases) + * - Token usage aggregation + * - Edge cases + */ + +// Mock providers +const createMockProvider = (): LLMProvider => ({ + generateStructured: vi.fn(), + generateText: vi.fn(), +}); + +// Mock the createProvider factory +vi.mock('../../../src/providers/index.js', () => ({ + createProvider: vi.fn(() => createMockProvider()), +})); + +// Mock telemetry to avoid real HTTP calls +vi.mock('../../../src/telemetry/client.js', () => { + return { + TelemetryClient: class MockTelemetryClient { + send = vi.fn().mockResolvedValue(undefined); + }, + }; +}); + +describe('VocabularyEvaluator - Constructor Validation', () => { + it('should throw error when Google API key is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: '', + 
openaiApiKey: 'test-openai-key', + })).toThrow('Google API key is required. Pass googleApiKey in config.'); + }); + + it('should throw error when OpenAI API key is missing', () => { + expect(() => new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: '', + })).toThrow('OpenAI API key is required. Pass openaiApiKey in config.'); + }); + +}); + +describe('VocabularyEvaluator - Evaluation Flow', () => { + let evaluator: VocabularyEvaluator; + let mockBackgroundProvider: LLMProvider; + let mockComplexityProvider: LLMProvider; + + beforeEach(() => { + vi.clearAllMocks(); + + // Create evaluator (providers will be mocked) + evaluator = new VocabularyEvaluator({ + googleApiKey: 'test-google-key', + openaiApiKey: 'test-openai-key', + telemetry: false, // Disable telemetry for most tests + }); + + // Get references to the mocked providers + // @ts-expect-error Accessing private property for testing + mockBackgroundProvider = evaluator.backgroundKnowledgeProvider; + // @ts-expect-error Accessing private property for testing + // Tests use grade 5+, which routes to otherGradesComplexityProvider (GPT-4.1) + mockComplexityProvider = evaluator.otherGradesComplexityProvider; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe('Successful Evaluation Flow', () => { + it('should successfully evaluate text through both stages', async () => { + const testText = 'The mitochondria is the powerhouse of the cell.'; + const testGrade = '5'; + + // Mock background knowledge response + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Students at grade 5 typically understand basic cell biology concepts.', + usage: { + inputTokens: 100, + outputTokens: 50, + }, + latencyMs: 500, + }); + + // Mock complexity evaluation response + vi.mocked(mockComplexityProvider.generateStructured).mockResolvedValue({ + data: { + complexity_score: 'moderately complex', + reasoning: 'The text uses grade-appropriate vocabulary.', + factors: 
['Academic terminology', 'Clear structure'], + }, + model: 'gemini-2.5-pro', + usage: { + inputTokens: 200, + outputTokens: 100, + }, + latencyMs: 800, + }); + + // Execute evaluation + const result = await evaluator.evaluate(testText, testGrade); + + // Verify result structure + expect(result.score).toBe('moderately complex'); + expect(result.reasoning).toContain('grade-appropriate vocabulary'); + expect(result.metadata).toBeDefined(); + expect(result.metadata.model).toBe('openai:gpt-4o-2024-11-20 + openai:gpt-4.1-2025-04-14'); + expect(result.metadata.processingTimeMs).toBeGreaterThan(0); + + // Verify both providers were called + expect(mockBackgroundProvider.generateText).toHaveBeenCalledTimes(1); + expect(mockComplexityProvider.generateStructured).toHaveBeenCalledTimes(1); + + // Verify background knowledge call + const bgCall = vi.mocked(mockBackgroundProvider.generateText).mock.calls[0]; + expect(bgCall[0][0].content).toContain(testText); + expect(bgCall[1]).toBe(0); // temperature = 0 + + // Verify complexity call includes background knowledge + const complexityCall = vi.mocked(mockComplexityProvider.generateStructured).mock.calls[0]; + expect(complexityCall[0].messages[1].content).toContain(testText); + expect(complexityCall[0].schema).toBeDefined(); + expect(complexityCall[0].temperature).toBe(0); + }); + +}); + + describe('Error Handling', () => { + it('should handle background knowledge API failure', async () => { + const testText = 'Test text here for API failure'; + const testGrade = '5'; + + // Mock background knowledge failure + vi.mocked(mockBackgroundProvider.generateText).mockRejectedValue( + new Error('API timeout') + ); + + // Should propagate the error + await expect(evaluator.evaluate(testText, testGrade)) + .rejects.toThrow('API timeout'); + + // Verify complexity provider was never called + expect(mockComplexityProvider.generateStructured).not.toHaveBeenCalled(); + }); + + it('should handle complexity evaluation API failure', async () => { 
+ const testText = 'Test text here for complexity failure'; + const testGrade = '6'; + + // Mock successful background knowledge + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Background knowledge', + usage: { inputTokens: 100, outputTokens: 50 }, + latencyMs: 500, + }); + + // Mock complexity evaluation failure + vi.mocked(mockComplexityProvider.generateStructured).mockRejectedValue( + new Error('Schema validation failed') + ); + + // Should propagate the error + await expect(evaluator.evaluate(testText, testGrade)) + .rejects.toThrow('Schema validation failed'); + + // Verify background provider was called (stage 1 completed) + expect(mockBackgroundProvider.generateText).toHaveBeenCalledTimes(1); + }); + + }); + + describe('Response Structure', () => { + it('should return correct result structure', async () => { + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Background knowledge', + usage: { inputTokens: 100, outputTokens: 50 }, + latencyMs: 500, + }); + + vi.mocked(mockComplexityProvider.generateStructured).mockResolvedValue({ + data: { + complexity_score: 'moderately complex', + reasoning: 'Detailed reasoning here', + factors: ['Factor 1', 'Factor 2'], + }, + model: 'gemini-2.5-pro', + usage: { inputTokens: 200, outputTokens: 100 }, + latencyMs: 800, + }); + + const result = await evaluator.evaluate('Test text here', '5'); + + // Verify result structure + expect(result).toHaveProperty('score'); + expect(result).toHaveProperty('reasoning'); + expect(result).toHaveProperty('metadata'); + expect(result).toHaveProperty('_internal'); + + // Verify metadata structure + expect(result.metadata).toHaveProperty('promptVersion'); + expect(result.metadata).toHaveProperty('model'); + expect(result.metadata).toHaveProperty('timestamp'); + expect(result.metadata).toHaveProperty('processingTimeMs'); + + // Verify metadata values + expect(result.metadata.promptVersion).toBe('1.2.0'); + 
expect(result.metadata.model).toBe('openai:gpt-4o-2024-11-20 + openai:gpt-4.1-2025-04-14'); + expect(result.metadata.timestamp).toBeInstanceOf(Date); + expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); // Mocked calls can be instant (0ms) + }); + + it('should include internal data', async () => { + vi.mocked(mockBackgroundProvider.generateText).mockResolvedValue({ + text: 'Background knowledge', + usage: { inputTokens: 100, outputTokens: 50 }, + latencyMs: 500, + }); + + const mockComplexityData = { + complexity_score: 'moderately complex', + reasoning: 'Detailed reasoning', + factors: ['Factor 1', 'Factor 2'], + analysis: 'Deep analysis', + }; + + vi.mocked(mockComplexityProvider.generateStructured).mockResolvedValue({ + data: mockComplexityData, + model: 'gemini-2.5-pro', + usage: { inputTokens: 200, outputTokens: 100 }, + latencyMs: 800, + }); + + const result = await evaluator.evaluate('Test text here', '5'); + + // Verify internal data is included + expect(result._internal).toEqual(mockComplexityData); + }); + }); +}); diff --git a/sdks/typescript/tests/unit/features/readability.test.ts b/sdks/typescript/tests/unit/features/readability.test.ts new file mode 100644 index 0000000..2c1eda6 --- /dev/null +++ b/sdks/typescript/tests/unit/features/readability.test.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from 'vitest'; +import { calculateFleschKincaidGrade } from '../../../src/features/readability.js'; + +describe('calculateFleschKincaidGrade', () => { + it('should calculate FK grade for simple text', () => { + const text = 'The cat sat on the mat. 
The dog ran away.'; + const grade = calculateFleschKincaidGrade(text); + + expect(grade).toBeLessThan(5); + expect(typeof grade).toBe('number'); + }); + + it('should handle empty text', () => { + const grade = calculateFleschKincaidGrade(''); + expect(grade).toBe(0); + }); + + it('should calculate higher grade for complex text', () => { + const simpleText = 'The cat sat.'; + const complexText = 'The mitochondria, known as the powerhouse of cellular respiration, facilitates biochemical processes.'; + + const simpleGrade = calculateFleschKincaidGrade(simpleText); + const complexGrade = calculateFleschKincaidGrade(complexText); + + expect(complexGrade).toBeGreaterThan(simpleGrade); + }); +}); diff --git a/sdks/typescript/tests/unit/telemetry/utils.test.ts b/sdks/typescript/tests/unit/telemetry/utils.test.ts new file mode 100644 index 0000000..77e50d2 --- /dev/null +++ b/sdks/typescript/tests/unit/telemetry/utils.test.ts @@ -0,0 +1,121 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { getSDKVersion } from '../../../src/telemetry/utils.js'; + +// UUID v4 pattern +const UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i; + +describe('Telemetry Utils', () => { + describe('generateClientId', () => { + // Reset module cache between tests so cachedClientId doesn't leak across tests + beforeEach(() => { + vi.resetModules(); + }); + + it('should generate a new UUID, create the config directory, and persist it when no config file exists', async () => { + const writeFileSync = vi.fn(); + const mkdirSync = vi.fn(); + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => { throw new Error('ENOENT'); }), + writeFileSync, + mkdirSync, + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + const id = generateClientId(); + + expect(id).toMatch(UUID_REGEX); + 
expect(mkdirSync).toHaveBeenCalledWith(expect.any(String), { recursive: true }); + expect(writeFileSync).toHaveBeenCalledOnce(); + const written = JSON.parse(writeFileSync.mock.calls[0][1] as string) as { + telemetry: { clientId: string }; + }; + expect(written.telemetry.clientId).toBe(id); + }); + + it('should not re-read from disk on repeated calls', async () => { + const readFileSync = vi.fn(() => { throw new Error('ENOENT'); }); + vi.doMock('node:fs', () => ({ + readFileSync, + writeFileSync: vi.fn(), + mkdirSync: vi.fn(), + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + + generateClientId(); + generateClientId(); + + expect(readFileSync).toHaveBeenCalledOnce(); + }); + + it('should read and return an existing client ID from config file without writing to disk', async () => { + const existingId = 'a1b2c3d4-e5f6-4789-ab01-cd23ef456789'; + const writeFileSync = vi.fn(); + const mkdirSync = vi.fn(); + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => JSON.stringify({ telemetry: { clientId: existingId } })), + writeFileSync, + mkdirSync, + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + + expect(generateClientId()).toBe(existingId); + expect(mkdirSync).not.toHaveBeenCalled(); + expect(writeFileSync).not.toHaveBeenCalled(); + }); + + it('should generate and persist a new UUID if config file exists but clientId is missing', async () => { + const writeFileSync = vi.fn(); + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => JSON.stringify({ telemetry: {} })), + writeFileSync, + mkdirSync: vi.fn(), + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + const id = generateClientId(); + + expect(id).toMatch(UUID_REGEX); + 
expect(writeFileSync).toHaveBeenCalledOnce(); + const written = JSON.parse(writeFileSync.mock.calls[0][1] as string) as { + telemetry: { clientId: string }; + }; + expect(written.telemetry.clientId).toBe(id); + }); + + it('should return a valid UUID without throwing when filesystem is read-only', async () => { + vi.doMock('node:fs', () => ({ + readFileSync: vi.fn(() => { throw new Error('ENOENT'); }), + writeFileSync: vi.fn(() => { throw new Error('EROFS'); }), + mkdirSync: vi.fn(() => { throw new Error('EROFS'); }), + })); + vi.doMock('node:os', () => ({ homedir: vi.fn(() => '/home/user') })); + + const { generateClientId } = await import('../../../src/telemetry/utils.js'); + + let id: string | undefined; + expect(() => { id = generateClientId(); }).not.toThrow(); + expect(id).toMatch(UUID_REGEX); + }); + }); + + describe('getSDKVersion', () => { + it('should return a valid version string', () => { + const version = getSDKVersion(); + + expect(version).toMatch(/^\d+\.\d+\.\d+$/); + }); + + it('should return same version on repeated calls (cached)', () => { + const version1 = getSDKVersion(); + const version2 = getSDKVersion(); + + expect(version1).toBe(version2); + }); + }); +}); diff --git a/sdks/typescript/tests/utils/index.ts b/sdks/typescript/tests/utils/index.ts new file mode 100644 index 0000000..e01a630 --- /dev/null +++ b/sdks/typescript/tests/utils/index.ts @@ -0,0 +1,18 @@ +/** + * Test utilities for evaluator testing + * + * @example + * ```typescript + * import { runTestWithRetry, runEvaluatorTest } from '../utils'; + * ``` + */ + +export { + runTestWithRetry, + runEvaluatorTest, + type TestAttempt, + type TestResult, + type RetryTestOptions, + type BaseTestCase, + type EvaluatorTestConfig, +} from './test-helpers.js'; diff --git a/sdks/typescript/tests/utils/test-helpers.ts b/sdks/typescript/tests/utils/test-helpers.ts new file mode 100644 index 0000000..91a6be7 --- /dev/null +++ b/sdks/typescript/tests/utils/test-helpers.ts @@ -0,0 +1,254 @@ +/** + * 
Streamlined test utilities for evaluator testing
+ */
+
+export interface TestAttempt<T> {
+  attempt: number;
+  result: T;
+  matched: boolean;
+}
+
+export interface TestResult<T> {
+  matched: boolean;
+  matchedOnAttempt?: number;
+  matchType?: 'expected' | 'acceptable'; // How the match occurred
+  totalAttempts: number;
+  attempts: TestAttempt<T>[];
+  allResults: T[];
+  logs: string[]; // Buffered log messages for atomic printing
+}
+
+export interface RetryTestOptions<TInput, TOutput> {
+  /** Function that executes the test and returns the actual output */
+  testFn: (input: TInput) => Promise<TOutput>;
+
+  /** Input to pass to the test function */
+  input: TInput;
+
+  /** Expected output value */
+  expected: TOutput;
+
+  /** Maximum number of attempts (default: 3) */
+  maxAttempts?: number;
+
+  /** Custom comparison function (default: strict equality) */
+  compareFn?: (actual: TOutput, expected: TOutput) => boolean;
+
+  /** Optional callback after each attempt */
+  onAttempt?: (attempt: number, result: TOutput, matched: boolean) => void;
+}
+
+/**
+ * Default comparison function (case-insensitive string comparison)
+ */
+function defaultCompareFn<T>(actual: T, expected: T): boolean {
+  if (typeof actual === 'string' && typeof expected === 'string') {
+    return actual.toLowerCase() === expected.toLowerCase();
+  }
+  return actual === expected;
+}
+
+/**
+ * Runs a test function multiple times with retry logic and short-circuiting.
+ */
+export async function runTestWithRetry<TInput, TOutput>(
+  options: RetryTestOptions<TInput, TOutput>
+): Promise<TestResult<TOutput>> {
+  const {
+    testFn,
+    input,
+    expected,
+    maxAttempts = 3,
+    compareFn = defaultCompareFn,
+    onAttempt,
+  } = options;
+
+  const attempts: TestAttempt<TOutput>[] = [];
+  let matched = false;
+  let matchedOnAttempt: number | undefined;
+
+  for (let attemptNum = 1; attemptNum <= maxAttempts; attemptNum++) {
+    const result = await testFn(input);
+    const isMatch = compareFn(result, expected);
+
+    attempts.push({
+      attempt: attemptNum,
+      result,
+      matched: isMatch,
+    });
+
+    if (onAttempt) {
+      onAttempt(attemptNum, result, isMatch);
+    }
+
+    // Short-circuit on match
+    if (isMatch) {
+      matched = true;
+      matchedOnAttempt = attemptNum;
+      break;
+    }
+  }
+
+  return {
+    matched,
+    matchedOnAttempt,
+    totalAttempts: attempts.length,
+    attempts,
+    allResults: attempts.map(a => a.result),
+    logs: [], // No logs for this simple retry function
+  };
+}
+
+/**
+ * Generic test case structure
+ * All evaluator-specific test cases extend this
+ */
+export interface BaseTestCase {
+  id: string;
+  text: string;
+  grade?: string; // Optional: some evaluators need it, some don't
+  expected: string; // Expected output value (checked on each attempt)
+  acceptable?: string[]; // Acceptable adjacent values (checked if no expected match after all retries)
+}
+
+/**
+ * Configuration for running evaluator tests
+ */
+export interface EvaluatorTestConfig<TEvaluator> {
+  /** The evaluator instance to test */
+  evaluator: TEvaluator;
+
+  /** Function to extract the result to compare from evaluation output */
+  extractResult: (evalResult: any) => string;
+
+  /** Maximum retry attempts (default: 3) */
+  maxAttempts?: number;
+}
+
+/**
+ * Generic evaluator test runner
+ * Works for any evaluator with retry logic
+ *
+ * @example
+ * ```typescript
+ * // Vocabulary evaluator
+ * const result = await runEvaluatorTest(
+ *   {
+ *     id: 'V1',
+ *     text: 'Sample text...',
+ *     grade: '3',
+ *     expected: 'very complex'
+ *   },
+ *   {
+ *     
evaluator: vocabularyEvaluator,
+ *     extractResult: (r) => r.score
+ *   }
+ * );
+ *
+ * // Grade level evaluator
+ * const result = await runEvaluatorTest(
+ *   {
+ *     id: 'GLA1',
+ *     text: 'Sample text...',
+ *     expected: '6-8'
+ *   },
+ *   {
+ *     evaluator: gradeLevelEvaluator,
+ *     extractResult: (r) => r.score.grade
+ *   }
+ * );
+ * ```
+ */
+// NOTE(review): the generic parameters below were reconstructed after tag-stripping
+// mangled the original `<...>` spans; the minimal `evaluate` constraint is inferred
+// from the call sites in the body — confirm against the original source.
+export async function runEvaluatorTest<
+  TEvaluator extends { evaluate: (text: string, grade?: string) => Promise<unknown> }
+>(
+  testCase: BaseTestCase,
+  config: EvaluatorTestConfig<TEvaluator>
+): Promise<TestResult<string>> {
+  const { evaluator, extractResult, maxAttempts = 3 } = config;
+  const compareFn = defaultCompareFn;
+
+  // Buffer logs to print atomically at the end (prevents interleaving in parallel tests)
+  const logBuffer: string[] = [];
+
+  // Log test criteria upfront
+  logBuffer.push(`\n Expected: "${testCase.expected}"`);
+  if (testCase.acceptable && testCase.acceptable.length > 0) {
+    logBuffer.push(` Acceptable: [${testCase.acceptable.map(v => `"${v}"`).join(', ')}]`);
+  }
+  logBuffer.push('');
+
+  const attempts: TestAttempt<string>[] = [];
+  let matched = false;
+  let matchedOnAttempt: number | undefined;
+  let matchType: 'expected' | 'acceptable' | undefined;
+
+  // Phase 1: Try to match expected value (short-circuit on match)
+  for (let attemptNum = 1; attemptNum <= maxAttempts; attemptNum++) {
+    const result = testCase.grade
+      ? await evaluator.evaluate(testCase.text, testCase.grade)
+      : await evaluator.evaluate(testCase.text);
+
+    const actualValue = extractResult(result);
+    const isExpectedMatch = compareFn(actualValue, testCase.expected);
+
+    attempts.push({
+      attempt: attemptNum,
+      result: actualValue,
+      matched: isExpectedMatch,
+    });
+
+    logBuffer.push(` Attempt ${attemptNum}: "${actualValue}" ${isExpectedMatch ? 
'✓ EXPECTED MATCH' : '✗'}`); + + // Short-circuit on expected match + if (isExpectedMatch) { + matched = true; + matchedOnAttempt = attemptNum; + matchType = 'expected'; + break; + } + } + + // Phase 2: If no expected match, check if any result is in acceptable range + // Only check acceptable values if they are defined and non-empty + if (!matched && testCase.acceptable?.length) { + logBuffer.push('\n No expected match. Checking acceptable values...'); + + for (let i = 0; i < attempts.length; i++) { + const attemptResult = attempts[i].result; + const isAcceptable = testCase.acceptable.some(acceptable => + compareFn(attemptResult, acceptable) + ); + + if (isAcceptable) { + matched = true; + matchedOnAttempt = i + 1; + matchType = 'acceptable'; + logBuffer.push(` ✓ ACCEPTABLE MATCH: Attempt ${matchedOnAttempt} result "${attemptResult}" is in acceptable range`); + break; + } + } + + if (!matched) { + logBuffer.push(` ✗ NO MATCH: None of the attempts matched expected or acceptable values`); + } + } + + // Summary logging + logBuffer.push('\n Summary:'); + logBuffer.push(` All Results: [${attempts.map(a => `"${a.result}"`).join(', ')}]`); + if (matched) { + logBuffer.push(` Status: ✓ PASS (matched ${matchType} on attempt ${matchedOnAttempt})`); + } else { + logBuffer.push(` Status: ✗ FAIL (no match after ${attempts.length} attempts)`); + } + + // Return logs for atomic printing by the caller + return { + matched, + matchedOnAttempt, + matchType, + totalAttempts: attempts.length, + attempts, + allResults: attempts.map(a => a.result), + logs: logBuffer, + }; +} \ No newline at end of file diff --git a/sdks/typescript/vitest.config.ts b/sdks/typescript/vitest.config.ts index 9eb9a49..06f43e3 100644 --- a/sdks/typescript/vitest.config.ts +++ b/sdks/typescript/vitest.config.ts @@ -1,4 +1,5 @@ import { defineConfig } from 'vitest/config'; +import { loadEnv } from 'vite'; import { readFileSync } from 'fs'; import { resolve, dirname } from 'path'; import type { Plugin } from 
'vite'; @@ -18,12 +19,13 @@ function txtPlugin(): Plugin { }; } -export default defineConfig({ +export default defineConfig(({ mode }) => ({ plugins: [txtPlugin()], test: { globals: true, environment: 'node', passWithNoTests: true, + env: loadEnv(mode, process.cwd(), ''), coverage: { provider: 'v8', reporter: ['text', 'json', 'html'], @@ -36,4 +38,4 @@ export default defineConfig({ ], }, }, -}); +}));