From 52628b062235e88a63f042cf45d75b19a62056b4 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 9 Feb 2026 09:47:14 -0800 Subject: [PATCH 1/4] feat: Implement Batch Evaluator --- sdks/typescript/package-lock.json | 49 ++ sdks/typescript/package.json | 7 + sdks/typescript/src/batch/README.md | 361 +++++++++++++++ sdks/typescript/src/batch/evaluator.ts | 323 ++++++++++++++ sdks/typescript/src/batch/formatters.ts | 399 +++++++++++++++++ sdks/typescript/src/batch/index.ts | 422 ++++++++++++++++++ sdks/typescript/src/batch/progress.ts | 167 +++++++ sdks/typescript/src/batch/types.ts | 71 +++ sdks/typescript/src/evaluators/base.ts | 11 + sdks/typescript/tests/fixtures/batch-test.csv | 3 + .../integration/batch.integration.test.ts | 192 ++++++++ .../tests/unit/batch/csv-parsing.test.ts | 91 ++++ .../tests/unit/batch/formatters.test.ts | 252 +++++++++++ .../tests/unit/batch/limits.test.ts | 107 +++++ sdks/typescript/tsup.config.ts | 2 +- 15 files changed, 2456 insertions(+), 1 deletion(-) create mode 100644 sdks/typescript/src/batch/README.md create mode 100644 sdks/typescript/src/batch/evaluator.ts create mode 100644 sdks/typescript/src/batch/formatters.ts create mode 100644 sdks/typescript/src/batch/index.ts create mode 100644 sdks/typescript/src/batch/progress.ts create mode 100644 sdks/typescript/src/batch/types.ts create mode 100644 sdks/typescript/tests/fixtures/batch-test.csv create mode 100644 sdks/typescript/tests/integration/batch.integration.test.ts create mode 100644 sdks/typescript/tests/unit/batch/csv-parsing.test.ts create mode 100644 sdks/typescript/tests/unit/batch/formatters.test.ts create mode 100644 sdks/typescript/tests/unit/batch/limits.test.ts diff --git a/sdks/typescript/package-lock.json b/sdks/typescript/package-lock.json index d8815a0..80f178f 100644 --- a/sdks/typescript/package-lock.json +++ b/sdks/typescript/package-lock.json @@ -10,7 +10,10 @@ "license": "MIT", "dependencies": { "compromise": "^14.13.0", + "csv-parse": "^6.1.0", 
+ "csv-stringify": "^6.6.0", "p-limit": "^5.0.0", + "prompts": "^2.4.2", "syllable": "^5.0.1", "zod": "^3.22.4" }, @@ -19,6 +22,7 @@ "@ai-sdk/google": "^3.0.7", "@ai-sdk/openai": "^3.0.9", "@types/node": "^20.11.5", + "@types/prompts": "^2.4.9", "@typescript-eslint/eslint-plugin": "^6.19.0", "@typescript-eslint/parser": "^6.19.0", "@vitest/coverage-v8": "^4.0.17", @@ -1190,6 +1194,16 @@ "resolved": "https://registry.npmjs.org/@types/pluralize/-/pluralize-0.0.29.tgz", "integrity": "sha512-BYOID+l2Aco2nBik+iYS4SZX0Lf20KPILP5RGmM1IgzdwNdTs0eebiFriOPcej1sX9mLnSoiNte5zcFxssgpGA==" }, + "node_modules/@types/prompts": { + "version": "2.4.9", + "resolved": "https://registry.npmjs.org/@types/prompts/-/prompts-2.4.9.tgz", + "integrity": "sha512-qTxFi6Buiu8+50/+3DGIWLHM6QuWsEKugJnnP6iv2Mc4ncxE4A/OJkjuVOA+5X0X1S/nq5VJRa8Lu+nwcvbrKA==", + "dev": true, + "dependencies": { + "@types/node": "*", + "kleur": "^3.0.3" + } + }, "node_modules/@types/semver": { "version": "7.7.1", "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.7.1.tgz", @@ -1830,6 +1844,16 @@ "node": ">= 8" } }, + "node_modules/csv-parse": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-6.1.0.tgz", + "integrity": "sha512-CEE+jwpgLn+MmtCpVcPtiCZpVtB6Z2OKPTr34pycYYoL7sxdOkXDdQ4lRiw6ioC0q6BLqhc6cKweCVvral8yhw==" + }, + "node_modules/csv-stringify": { + "version": "6.6.0", + "resolved": "https://registry.npmjs.org/csv-stringify/-/csv-stringify-6.6.0.tgz", + "integrity": "sha512-YW32lKOmIBgbxtu3g5SaiqWNwa/9ISQt2EcgOq0+RAIFufFp9is6tqNnKahqE5kuKvrnYAzs28r+s6pXJR8Vcw==" + }, "node_modules/debug": { "version": "4.4.3", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", @@ -2593,6 +2617,14 @@ "json-buffer": "3.0.1" } }, + "node_modules/kleur": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/kleur/-/kleur-3.0.3.tgz", + "integrity": "sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==", + "engines": 
{ + "node": ">=6" + } + }, "node_modules/levn": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", @@ -3064,6 +3096,18 @@ "node": ">= 0.8.0" } }, + "node_modules/prompts": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz", + "integrity": "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==", + "dependencies": { + "kleur": "^3.0.3", + "sisteransi": "^1.0.5" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/punycode": { "version": "2.3.1", "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", @@ -3247,6 +3291,11 @@ "integrity": "sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==", "dev": true }, + "node_modules/sisteransi": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/sisteransi/-/sisteransi-1.0.5.tgz", + "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==" + }, "node_modules/slash": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", diff --git a/sdks/typescript/package.json b/sdks/typescript/package.json index d293235..5f1fbc3 100644 --- a/sdks/typescript/package.json +++ b/sdks/typescript/package.json @@ -12,6 +12,9 @@ } }, "sideEffects": false, + "bin": { + "evaluators-batch": "./dist/batch/index.js" + }, "files": [ "dist", "README.md", @@ -60,7 +63,10 @@ }, "dependencies": { "compromise": "^14.13.0", + "csv-parse": "^6.1.0", + "csv-stringify": "^6.6.0", "p-limit": "^5.0.0", + "prompts": "^2.4.2", "syllable": "^5.0.1", "zod": "^3.22.4" }, @@ -69,6 +75,7 @@ "@ai-sdk/google": "^3.0.7", "@ai-sdk/openai": "^3.0.9", "@types/node": "^20.11.5", + "@types/prompts": "^2.4.9", "@typescript-eslint/eslint-plugin": "^6.19.0", "@typescript-eslint/parser": "^6.19.0", "@vitest/coverage-v8": "^4.0.17", diff --git a/sdks/typescript/src/batch/README.md 
b/sdks/typescript/src/batch/README.md new file mode 100644 index 0000000..8b4fc91 --- /dev/null +++ b/sdks/typescript/src/batch/README.md @@ -0,0 +1,361 @@ +# Batch CSV Evaluator + +Evaluate multiple texts from a CSV file using one or more evaluators, with results output in CSV, JSON, and HTML formats. + +## Usage + +### Installation + +After publishing to npm: + +```bash +# Install globally +npm install -g @learning-commons/evaluators + +# Or run directly with npx (the CLI ships inside the @learning-commons/evaluators package) +npx -p @learning-commons/evaluators evaluators-batch +``` + +### Interactive Mode + +Run the batch evaluator interactively from any directory: + +```bash +# If installed globally +evaluators-batch + +# Or with npx +npx -p @learning-commons/evaluators evaluators-batch +``` + +**Important:** Run this command from the directory containing your CSV file, or provide an absolute path to your CSV. + +The CLI will guide you through: +1. **CSV File Path**: Location of your input CSV file +2. **Evaluator Selection**: Choose which evaluators to run (multi-select) +3. **API Keys**: Enter required API keys (only prompted for needed keys) +4. **Output Directory**: Where to save results (default: timestamped folder in current directory) +5. **Confirmation**: Review summary before starting + +The output directory is automatically created with a human-readable timestamp: +``` +batch-results-2024-02-07_14-30-22/ +├── results.csv +├── results.json +└── results.html +``` + +### Input CSV Format + +Your CSV must have these columns (header names are matched case-insensitively, ignoring surrounding whitespace): +- `text` (e.g. `Text` or `TEXT`): The text content to evaluate +- `grade` (e.g. `Grade` or `GRADE`): The grade level for evaluation + +Example `input.csv`: +```csv +text,grade +"The cat sat on the mat.",3 +"Photosynthesis is the process by which plants convert sunlight into energy.",5 +"The mitochondria are the powerhouse of the cell.",8 +``` + +See `tests/fixtures/sample-batch-input.csv` for a complete example. 
+ +### Available Evaluators + +- **vocabulary**: Analyzes vocabulary complexity (requires Google + OpenAI keys) +- **sentence-structure**: Analyzes sentence structure complexity (requires OpenAI key) +- **grade-level-appropriateness**: Determines appropriate grade level (requires Google key) + +### Output Files + +Three files are generated in the output directory: + +1. **CSV** (`results.csv`): + - Spreadsheet-compatible format, one row per input row + - Columns: all original input columns, followed by `<evaluator>_score`, `<evaluator>_reasoning`, and `<evaluator>_status` for each evaluator (hyphens in the evaluator ID become underscores; for failed evaluations the reasoning column holds the error message) + +2. **JSON** (`results.json`): + - Structured data with full results and summary statistics + - Easy to parse programmatically + +3. **HTML** (`results.html`): + - Interactive table with sorting and filtering (AG Grid) + - Color-coded status indicators + - Summary statistics dashboard + - Self-contained (works offline) + +### Progress Display + +During evaluation, you'll see real-time progress: + +``` +Processing evaluations... +████████████░░░░ 60% (180/300) + ✓ vocabulary: 95/100 successful + ✓ sentence-structure: 85/100 successful + ⏳ grade-level-appropriateness: 0/100 successful + +⏱ Elapsed: 2m 15s | Estimated remaining: 1m 30s +``` + +### Batch Size Limits + +**Hard Limit: 500 tasks maximum** +- Tasks = Rows × Evaluators +- Example: 166 rows × 3 evaluators = 498 tasks ✓ +- Example: 167 rows × 3 evaluators = 501 tasks ❌ + +**Warnings:** +- Batches > 100 tasks show estimated time and cost +- Large batches default to "No" in confirmation prompt + +If you exceed the limit: +``` +❌ Batch too large! 
+ + Maximum allowed: 500 tasks + Your batch: 600 tasks (200 rows × 3 evaluators) + +Suggestions: + • Reduce number of rows in CSV + • Select fewer evaluators + • Split into multiple smaller batches +``` + +### Parallelization + +The batch evaluator runs tasks in parallel with a concurrency limit of 3: +- **3 evaluators per row**: All 3 run simultaneously for each row +- **1 evaluator per row**: 3 different rows processed simultaneously +- Optimizes throughput while respecting API rate limits + +### API Keys + +You can provide API keys in three ways: +1. **Environment variables**: `GOOGLE_API_KEY`, `OPENAI_API_KEY` +2. **Interactive prompts**: Enter when prompted (keys are masked) +3. **Pre-filled prompts**: If env vars exist, they're used as defaults + +Only required keys are prompted: +- Select only `sentence-structure`: Only OpenAI key needed +- Select only `grade-level-appropriateness`: Only Google key needed +- Select multiple evaluators: All required keys prompted + +### Example Session + +```bash +$ npx evaluators-batch + +📊 Batch CSV Evaluator + +This tool will evaluate multiple texts using one or more evaluators. + +? Where is your CSV file? ./input.csv +✓ Found 10 rows in CSV + +? Which evaluators do you want to run? + ◉ Vocabulary + ◯ Sentence Structure + ◉ Grade Level Appropriateness + +✓ Selected: vocabulary, grade-level-appropriateness + +? Google API Key: •••••••••••• +? Output directory: ./batch-results-2024-02-07_14-30-22 + +📝 Summary: + Input rows: 10 + Evaluators: 2 + Total tasks: 20 + Output: ./batch-results-2024-02-07_14-30-22 + +? Start batch evaluation? Yes + +# For larger batches (>100 tasks): +⚠️ Warning: Large batch detected + + API calls: 150 + Estimated time: ~5 minutes + Estimated cost: ~$2.25 + +? Start batch evaluation? (y/N) + +Processing evaluations... 
+████████████████████ 100% (20/20) + ✓ vocabulary: 10/10 successful + ✓ grade-level-appropriateness: 10/10 successful + +⏱ Elapsed: 45s | Estimated remaining: 0s + +✅ Batch evaluation completed! + +Total tasks: 20 +Successful: 20 ✓ +Failed: 0 ✗ +Duration: 45s + +📄 Output files generated: + ./batch-results-2024-02-07_14-30-22/ + ├── results.csv + ├── results.json + └── results.html +``` + +### Error Handling + +- **Individual failures**: Continue processing, mark as failed in output +- **Invalid CSV**: Validation before starting +- **Missing API keys**: Prompt for required keys +- **Partial results**: Save all results even if some fail + +### Graceful Shutdown + +Press `Ctrl+C` during evaluation to shut down gracefully: + +1. **In-flight tasks complete**: Running evaluations finish processing +2. **New tasks cancelled**: Pending tasks are not started; each is recorded as an error with the message `Cancelled by user` +3. **Partial results saved**: All completed results are saved to `results-partial.*` files +4. **Progress preserved**: No loss of work done so far + +Example: +```bash +# Press Ctrl+C during a long batch evaluation + +⚠️ Shutdown requested. Saving partial results... + (Press Ctrl+C again to force quit) + +✓ Saved 15 results to: + ./batch-results-2024-02-07_14-30-22/ + ├── results-partial.csv + ├── results-partial.json + └── results-partial.html +``` + +Press `Ctrl+C` twice to force quit immediately (not recommended - may lose in-flight results). 
+ +## Programmatic API + +You can also use the batch evaluator programmatically: + +```typescript +import { BatchEvaluator } from '@learning-commons/evaluators/batch'; + +const evaluator = new BatchEvaluator({ + googleApiKey: process.env.GOOGLE_API_KEY, + openaiApiKey: process.env.OPENAI_API_KEY, + concurrency: 3, + maxRetries: 2, + telemetry: false, +}); + +const inputs = [ + { text: 'Sample text 1', grade: '3', rowIndex: 1 }, + { text: 'Sample text 2', grade: '5', rowIndex: 2 }, +]; + +const output = await evaluator.evaluate( + inputs, + ['vocabulary', 'sentence-structure'], + (result) => { + console.log(`Completed: ${result.evaluatorId} for row ${result.rowIndex}`); + } +); + +console.log(output.summary); + +// Programmatic cancellation example (sentence-structure only needs an OpenAI key) +const evaluator2 = new BatchEvaluator({ openaiApiKey: 'key' }); + +const evaluationPromise = evaluator2.evaluate(inputs, ['sentence-structure']); + +// Cancel after 5 seconds +setTimeout(() => { + const partialResults = evaluator2.cancel(); + console.log(`Cancelled with ${partialResults.length} completed results`); +}, 5000); + +await evaluationPromise; +``` + +## Cross-Platform Compatibility + +The batch evaluator works on: +- **macOS** ✓ +- **Windows** ✓ +- **Linux** ✓ + +All file paths are handled with the Node.js `path` module for cross-platform compatibility. + +--- + +## Development & Testing + +### Running Locally (Before Publishing) + +When developing or testing the batch evaluator locally: + +```bash +# From the SDK root directory (sdks/typescript/) +cd sdks/typescript + +# Build the project +npm run build + +# Run the batch CLI directly +node dist/batch/index.js + +# Or test with a sample CSV +node dist/batch/index.js +# When prompted, enter: test-input.csv +``` + +**Important:** Always run from the SDK root directory (`sdks/typescript/`) so the CLI can find the prompt files at `dist/prompts/`. 
+ +### Creating Test CSVs + +Create a test CSV in the SDK root: + +```bash +# From sdks/typescript/ +cat > test-input.csv << 'EOF' +text,grade +"The cat sat on the mat.",3 +"Photosynthesis converts light energy into chemical energy.",5 +EOF + +# Then run the CLI +node dist/batch/index.js +# Enter: test-input.csv +``` + +### Testing the Package + +Test the package locally before publishing: + +```bash +# Build the package +npm run build + +# Create a tarball +npm pack +# Creates: learning-commons-evaluators-0.1.0.tgz + +# Test installation in another directory +cd /tmp +npm install /path/to/learning-commons-evaluators-0.1.0.tgz + +# Test the CLI +npx @learning-commons/evaluators-batch +``` + +### After Publishing to npm + +Once published, users can run from any directory: + +```bash +# Run from anywhere +cd ~/Documents/my-data +npx @learning-commons/evaluators-batch +# Works! Prompts are bundled with the package +``` diff --git a/sdks/typescript/src/batch/evaluator.ts b/sdks/typescript/src/batch/evaluator.ts new file mode 100644 index 0000000..09942cd --- /dev/null +++ b/sdks/typescript/src/batch/evaluator.ts @@ -0,0 +1,323 @@ +import pLimit from 'p-limit'; +import { + VocabularyEvaluator, + SentenceStructureEvaluator, + GradeLevelAppropriatenessEvaluator, +} from '../evaluators/index.js'; +import type { BaseEvaluator } from '../evaluators/base.js'; +import type { + BatchInput, + BatchTask, + BatchResult, + BatchOutput, + BatchConfig, + BatchSummary, +} from './types.js'; + +/** + * Available evaluators for batch processing + */ +const EVALUATORS = [ + VocabularyEvaluator, + SentenceStructureEvaluator, + GradeLevelAppropriatenessEvaluator, +] as const; + +/** + * Map of evaluator IDs to their constructors + */ +const EVALUATOR_MAP = new Map(EVALUATORS.map((E) => [E.metadata.id, E])); + +/** + * Get all available evaluator IDs + */ +export function getAvailableEvaluators(): Array<{ + id: string; + name: string; + requiresGoogleKey: boolean; + requiresOpenAIKey: 
boolean; +}> { + return EVALUATORS.map((E) => ({ + id: E.metadata.id, + name: E.metadata.name, + requiresGoogleKey: E.metadata.requiresGoogleKey, + requiresOpenAIKey: E.metadata.requiresOpenAIKey, + })); +} + +/** + * Validate that selected evaluators exist + */ +export function validateEvaluators(evaluatorIds: string[]): void { + const invalid = evaluatorIds.filter((id) => !EVALUATOR_MAP.has(id)); + if (invalid.length > 0) { + throw new Error( + `Invalid evaluator IDs: ${invalid.join(', ')}. Available: ${Array.from(EVALUATOR_MAP.keys()).join(', ')}` + ); + } +} + +/** + * Determine required API keys for selected evaluators + */ +export function getRequiredApiKeys(evaluatorIds: string[]): { + requiresGoogle: boolean; + requiresOpenAI: boolean; +} { + const requiresGoogle = evaluatorIds.some( + (id) => EVALUATOR_MAP.get(id)!.metadata.requiresGoogleKey + ); + const requiresOpenAI = evaluatorIds.some( + (id) => EVALUATOR_MAP.get(id)!.metadata.requiresOpenAIKey + ); + + return { requiresGoogle, requiresOpenAI }; +} + +/** + * Batch evaluator class + * + * Processes multiple texts with multiple evaluators in parallel + */ +export class BatchEvaluator { + private config: BatchConfig; + private limit: ReturnType; + private evaluatorInstances = new Map(); + private isCancelled = false; + private completedResults: BatchResult[] = []; + + constructor(config: BatchConfig) { + this.config = { + concurrency: 3, + maxRetries: 2, + telemetry: false, + ...config, + }; + + this.limit = pLimit(this.config.concurrency!); + } + + /** + * Cancel ongoing evaluation + * Returns partial results collected so far + */ + cancel(): BatchResult[] { + this.isCancelled = true; + return [...this.completedResults]; + } + + /** + * Initialize evaluator instances + */ + private initializeEvaluators(evaluatorIds: string[]): void { + for (const id of evaluatorIds) { + if (this.evaluatorInstances.has(id)) continue; + + const EvaluatorClass = EVALUATOR_MAP.get(id); + if (!EvaluatorClass) { + throw new 
Error(`Unknown evaluator: ${id}`); + } + + const evaluator = new EvaluatorClass({ + googleApiKey: this.config.googleApiKey, + openaiApiKey: this.config.openaiApiKey, + maxRetries: this.config.maxRetries, + telemetry: this.config.telemetry, + }); + + this.evaluatorInstances.set(id, evaluator); + } + } + + /** + * Create tasks from inputs and evaluator IDs + */ + // eslint-disable-next-line @typescript-eslint/no-explicit-any + private createTasks(inputs: BatchInput[], evaluatorIds: string[]): Array }> { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const tasks: Array }> = []; + + for (const input of inputs) { + for (const evaluatorId of evaluatorIds) { + tasks.push({ + text: input.text, + grade: input.grade, + evaluatorId, + rowIndex: input.rowIndex, + originalRow: input.originalRow, + }); + } + } + + return tasks; + } + + /** + * Execute a single evaluation task + */ + private async executeTask( + // eslint-disable-next-line @typescript-eslint/no-explicit-any + task: BatchTask & { originalRow: Record }, + onProgress?: (result: BatchResult) => void + ): Promise { + // Check if cancelled before starting + if (this.isCancelled) { + const batchResult: BatchResult = { + rowIndex: task.rowIndex, + text: task.text, + grade: task.grade, + evaluatorId: task.evaluatorId, + status: 'error', + error: 'Cancelled by user', + processingTimeMs: 0, + originalRow: task.originalRow, + }; + return batchResult; + } + + const startTime = Date.now(); + const evaluator = this.evaluatorInstances.get(task.evaluatorId); + + if (!evaluator) { + throw new Error(`Evaluator not initialized: ${task.evaluatorId}`); + } + + try { + const result = await evaluator.evaluate(task.text, task.grade); + + // Handle different score types + let scoreString: string; + if (typeof result.score === 'string') { + scoreString = result.score; + } else if (typeof result.score === 'object' && result.score !== null) { + // For grade-level-appropriateness, extract the grade field + if ('grade' in 
result.score) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + scoreString = (result.score as any).grade; + } else { + // Fallback: JSON stringify the object + scoreString = JSON.stringify(result.score); + } + } else { + scoreString = String(result.score); + } + + const batchResult: BatchResult = { + rowIndex: task.rowIndex, + text: task.text, + grade: task.grade, + evaluatorId: task.evaluatorId, + status: 'success', + score: scoreString, + reasoning: result.reasoning, + processingTimeMs: Date.now() - startTime, + originalRow: task.originalRow, + }; + + // Store completed result + this.completedResults.push(batchResult); + + // Report progress + if (onProgress) { + onProgress(batchResult); + } + + return batchResult; + } catch (error) { + const batchResult: BatchResult = { + rowIndex: task.rowIndex, + text: task.text, + grade: task.grade, + evaluatorId: task.evaluatorId, + status: 'error', + error: error instanceof Error ? error.message : String(error), + processingTimeMs: Date.now() - startTime, + originalRow: task.originalRow, + }; + + // Store completed result (even errors) + this.completedResults.push(batchResult); + + // Report progress + if (onProgress) { + onProgress(batchResult); + } + + return batchResult; + } + } + + /** + * Calculate summary statistics + */ + private calculateSummary(results: BatchResult[], durationMs: number): BatchSummary { + const summary: BatchSummary = { + totalTasks: results.length, + successful: results.filter((r) => r.status === 'success').length, + failed: results.filter((r) => r.status === 'error').length, + durationMs, + resultsPerEvaluator: {}, + }; + + // Calculate per-evaluator stats + const evaluatorIds = Array.from(new Set(results.map((r) => r.evaluatorId))); + for (const id of evaluatorIds) { + const evalResults = results.filter((r) => r.evaluatorId === id); + summary.resultsPerEvaluator[id] = { + successful: evalResults.filter((r) => r.status === 'success').length, + failed: evalResults.filter((r) => 
r.status === 'error').length, + }; + } + + return summary; + } + + /** + * Run batch evaluation + * + * @param inputs - Array of input rows + * @param evaluatorIds - Array of evaluator IDs to run + * @param onProgress - Optional callback for progress updates + * @returns Batch evaluation results and summary + */ + async evaluate( + inputs: BatchInput[], + evaluatorIds: string[], + onProgress?: (result: BatchResult) => void + ): Promise { + const startTime = Date.now(); + + // Reset state + this.isCancelled = false; + this.completedResults = []; + + // Validate evaluators + validateEvaluators(evaluatorIds); + + // Initialize evaluator instances + this.initializeEvaluators(evaluatorIds); + + // Create all tasks (flattened) + const tasks = this.createTasks(inputs, evaluatorIds); + + // Execute all tasks with concurrency control + // Use allSettled to get partial results even if cancelled + const settledResults = await Promise.allSettled( + tasks.map((task) => this.limit(() => this.executeTask(task, onProgress))) + ); + + // Extract fulfilled results (skip rejected) + const results = settledResults + .filter((r): r is PromiseFulfilledResult => r.status === 'fulfilled') + .map((r) => r.value); + + // Calculate summary + const durationMs = Date.now() - startTime; + const summary = this.calculateSummary(results, durationMs); + + return { + results, + summary, + }; + } +} diff --git a/sdks/typescript/src/batch/formatters.ts b/sdks/typescript/src/batch/formatters.ts new file mode 100644 index 0000000..f80a63e --- /dev/null +++ b/sdks/typescript/src/batch/formatters.ts @@ -0,0 +1,399 @@ +import type { BatchOutput, BatchResult } from './types.js'; + +/** + * Group results by row index + */ +function groupResultsByRow(results: BatchResult[]): Map { + const grouped = new Map(); + + for (const result of results) { + if (!grouped.has(result.rowIndex)) { + grouped.set(result.rowIndex, []); + } + grouped.get(result.rowIndex)!.push(result); + } + + return grouped; +} + +/** + * 
Format evaluator ID as column prefix (kebab-case to snake_case) + */ +function formatEvaluatorPrefix(evaluatorId: string): string { + return evaluatorId.replace(/-/g, '_'); +} + +/** + * Format results as CSV with columns per evaluator + */ +export function formatAsCSV(output: BatchOutput): string { + if (output.results.length === 0) { + return ''; + } + + // Group results by row + const groupedByRow = groupResultsByRow(output.results); + + // Get unique evaluator IDs (sorted for consistent column order) + const evaluatorIds = Array.from( + new Set(output.results.map(r => r.evaluatorId)) + ).sort(); + + // Get original column names from first result + const firstResult = output.results[0]; + const originalColumns = Object.keys(firstResult.originalRow); + + // Build headers: original columns + evaluator columns (score, reasoning, status) + const evaluatorColumns: string[] = []; + for (const evalId of evaluatorIds) { + const prefix = formatEvaluatorPrefix(evalId); + evaluatorColumns.push(`${prefix}_score`); + evaluatorColumns.push(`${prefix}_reasoning`); + evaluatorColumns.push(`${prefix}_status`); + } + const headers = [...originalColumns, ...evaluatorColumns]; + + // Build rows (one per input row) + const rows: string[][] = []; + const sortedRowIndices = Array.from(groupedByRow.keys()).sort((a, b) => a - b); + + for (const rowIndex of sortedRowIndices) { + const resultsForRow = groupedByRow.get(rowIndex)!; + const firstResultForRow = resultsForRow[0]; + + // Original column values + const originalValues = originalColumns.map(col => + escapeCSV(String(firstResultForRow.originalRow[col] || '')) + ); + + // Evaluator column values + const evaluatorValues: string[] = []; + for (const evalId of evaluatorIds) { + const result = resultsForRow.find(r => r.evaluatorId === evalId); + + if (result) { + // Score + if (result.status === 'success') { + evaluatorValues.push(escapeCSV(result.score || '')); + } else { + evaluatorValues.push(''); // Empty for errors + } + + // 
Reasoning + if (result.status === 'success') { + evaluatorValues.push(escapeCSV(result.reasoning || '')); + } else { + evaluatorValues.push(escapeCSV(result.error || '')); + } + + // Status + evaluatorValues.push(result.status); + } else { + // Evaluator not run for this row + evaluatorValues.push('', '', 'not_run'); + } + } + + rows.push([...originalValues, ...evaluatorValues]); + } + + return [headers, ...rows].map(row => row.join(',')).join('\n'); +} + +/** + * Escape CSV field (handle quotes and commas) + */ +function escapeCSV(field: string): string { + if (field.includes(',') || field.includes('"') || field.includes('\n')) { + return `"${field.replace(/"/g, '""')}"`; + } + return field; +} + +/** + * Format results as JSON + */ +export function formatAsJSON(output: BatchOutput): string { + return JSON.stringify(output, null, 2); +} + +/** + * Format results as HTML with AG Grid + */ +export function formatAsHTML(output: BatchOutput): string { + if (output.results.length === 0) { + return '

No results to display

'; + } + + // Group results by row + const groupedByRow = groupResultsByRow(output.results); + + // Get unique evaluator IDs (sorted) + const evaluatorIds = Array.from( + new Set(output.results.map(r => r.evaluatorId)) + ).sort(); + + // Convert grouped results to grid data (one row per input row) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const gridData: any[] = []; + const sortedRowIndices = Array.from(groupedByRow.keys()).sort((a, b) => a - b); + + for (const rowIndex of sortedRowIndices) { + const resultsForRow = groupedByRow.get(rowIndex)!; + const firstResult = resultsForRow[0]; + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const rowData: any = { + row: rowIndex, + text: firstResult.text.substring(0, 100) + (firstResult.text.length > 100 ? '...' : ''), + textFull: firstResult.text, + grade: firstResult.grade, + }; + + // Add evaluator-specific fields + for (const evalId of evaluatorIds) { + const result = resultsForRow.find(r => r.evaluatorId === evalId); + const prefix = formatEvaluatorPrefix(evalId); + + if (result) { + rowData[`${prefix}_status`] = result.status; + rowData[`${prefix}_score`] = result.status === 'success' ? (result.score || '') : ''; + rowData[`${prefix}_reasoning`] = result.status === 'success' ? (result.reasoning || '') : (result.error || ''); + } else { + rowData[`${prefix}_status`] = 'not_run'; + rowData[`${prefix}_score`] = ''; + rowData[`${prefix}_reasoning`] = ''; + } + } + + gridData.push(rowData); + } + + return ` + + + + + Batch Evaluation Results + + + + + + + + + + +
+
+

📊 Batch Evaluation Results

+

Generated on ${new Date().toLocaleString()}

+
+ +
+
+
${output.summary.totalTasks}
+
Total Tasks
+
+
+
${output.summary.successful}
+
Successful
+
+
+
${output.summary.failed}
+
Failed
+
+
+
${Math.round(output.summary.durationMs / 1000)}s
+
Duration
+
+
+ +
+ + +
/**
 * Read a CSV file and convert its usable rows into batch inputs.
 *
 * The file is parsed with a header row; the `text` and `grade` columns are
 * located on the first record via `findColumn` (case-insensitive, whitespace
 * trimmed). Rows whose text or grade cell is empty are skipped.
 *
 * Note: `rowIndex` numbers the rows that are kept (1, 2, 3, ...); it is not
 * the line number of the row in the CSV file.
 *
 * @param csvPath - Path to the CSV file to read
 * @returns One `BatchInput` per row that has both a text and a grade value
 * @throws {Error} If the CSV has no data rows, or lacks a text/grade column
 */
function readCSV(csvPath: string): BatchInput[] {
  const content = fs.readFileSync(csvPath, 'utf-8');
  const records = parse(content, {
    columns: true,
    skip_empty_lines: true,
    trim: true,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
  }) as Record<string, any>[];

  // Without this guard, records[0] is undefined and findColumn would fail
  // with an opaque TypeError. Use the same message as validateCSV.
  if (records.length === 0) {
    throw new Error('CSV file is empty');
  }

  // Find column names (case-insensitive, whitespace-trimmed)
  const firstRow = records[0];
  const textColumn = findColumn(firstRow, 'text');
  const gradeColumn = findColumn(firstRow, 'grade');

  if (!textColumn || !gradeColumn) {
    throw new Error('CSV missing required columns');
  }

  const inputs: BatchInput[] = [];

  for (const row of records) {
    const text = row[textColumn];
    const grade = row[gradeColumn];

    // Skip rows with missing text or grade (empty rows)
    if (!text || !grade) {
      continue;
    }

    inputs.push({
      text: String(text).trim(),
      grade: String(grade).trim(),
      rowIndex: inputs.length + 1, // sequential over kept rows, starting at 1
      originalRow: row, // Preserve all original columns
    });
  }

  return inputs;
}
error.message : 'Invalid CSV file'; + } + }, + }); + + if (!csvPath) { + console.log('Cancelled.'); + process.exit(0); + } + + // Read CSV to show info + const inputs = readCSV(csvPath); + console.log(`\n✓ Found ${inputs.length} rows in CSV\n`); + + // Step 2: Select evaluators + const availableEvaluators = getAvailableEvaluators(); + const { evaluatorIds } = await prompts({ + type: 'multiselect', + name: 'evaluatorIds', + message: 'Which evaluators do you want to run?', + choices: availableEvaluators.map((e) => ({ + title: e.name, + value: e.id, + selected: false, + })), + min: 1, + hint: 'Use space to select, enter to confirm', + }); + + if (!evaluatorIds || evaluatorIds.length === 0) { + console.log('No evaluators selected. Cancelled.'); + process.exit(0); + } + + console.log(`\n✓ Selected: ${evaluatorIds.join(', ')}\n`); + + // Step 3: Get API keys (only required ones) + const { requiresGoogle, requiresOpenAI } = getRequiredApiKeys(evaluatorIds); + + let googleApiKey: string | undefined; + let openaiApiKey: string | undefined; + + if (requiresGoogle) { + const result = await prompts({ + type: 'password', + name: 'key', + message: 'Google API Key:', + initial: process.env.GOOGLE_API_KEY || '', + validate: (value) => (value ? true : 'Google API key is required'), + }); + + if (!result.key) { + console.log('Cancelled.'); + process.exit(0); + } + + googleApiKey = result.key; + } + + if (requiresOpenAI) { + const result = await prompts({ + type: 'password', + name: 'key', + message: 'OpenAI API Key:', + initial: process.env.OPENAI_API_KEY || '', + validate: (value) => (value ? 
true : 'OpenAI API key is required'), + }); + + if (!result.key) { + console.log('Cancelled.'); + process.exit(0); + } + + openaiApiKey = result.key; + } + + // Step 4: Get output directory (with human-readable timestamp in local time) + const now = new Date(); + const timestamp = `${now.getFullYear()}-${String(now.getMonth() + 1).padStart(2, '0')}-${String(now.getDate()).padStart(2, '0')}_${String(now.getHours()).padStart(2, '0')}-${String(now.getMinutes()).padStart(2, '0')}-${String(now.getSeconds()).padStart(2, '0')}`; + const defaultOutputDir = path.join(process.cwd(), `batch-results-${timestamp}`); + + const { outputDir } = await prompts({ + type: 'text', + name: 'outputDir', + message: 'Output directory:', + initial: defaultOutputDir, + validate: (value) => { + // Check if parent directory exists + const parentDir = path.dirname(value); + if (!fs.existsSync(parentDir)) { + return `Parent directory does not exist: ${parentDir}`; + } + + // Check write permissions by attempting to create output directory + try { + if (!fs.existsSync(value)) { + fs.mkdirSync(value, { recursive: true }); + } + + // Test write permission with a temporary file + const testFile = path.join(value, '.write-test'); + fs.writeFileSync(testFile, ''); + fs.unlinkSync(testFile); + + return true; + } catch (error) { + if (error instanceof Error) { + if (error.message.includes('EACCES')) { + return `No write permission for directory: ${value}`; + } + if (error.message.includes('EROFS')) { + return `Directory is read-only: ${value}`; + } + return `Cannot write to directory: ${error.message}`; + } + return `Cannot write to directory`; + } + }, + }); + + if (!outputDir) { + console.log('Cancelled.'); + process.exit(0); + } + + // Ensure output directory exists + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + // Step 5: Confirm and run + const totalTasks = inputs.length * evaluatorIds.length; + const MAX_TASKS = 500; + + console.log(`\n📝 Summary:`); + 
console.log(` Input rows: ${inputs.length}`); + console.log(` Evaluators: ${evaluatorIds.length}`); + console.log(` Total tasks: ${totalTasks}`); + console.log(` Output: ${outputDir}\n`); + + // Hard limit check + if (totalTasks > MAX_TASKS) { + console.log(`❌ Batch too large!\n`); + console.log(` Maximum allowed: ${MAX_TASKS} tasks`); + console.log(` Your batch: ${totalTasks} tasks (${inputs.length} rows × ${evaluatorIds.length} evaluators)\n`); + console.log(`Suggestions:`); + console.log(` • Reduce number of rows in CSV`); + console.log(` • Select fewer evaluators`); + console.log(` • Split into multiple smaller batches\n`); + process.exit(1); + } + + // Warning for large batches (100-500 tasks) + if (totalTasks > 100) { + // Estimate time: ~2 seconds per task with concurrency=3 + const estimatedMinutes = Math.ceil((totalTasks * 2) / 60); + // Estimate cost: ~$0.01-0.02 per task (rough average) + const estimatedCost = (totalTasks * 0.015).toFixed(2); + + console.log(`⚠️ Warning: Large batch detected\n`); + console.log(` API calls: ${totalTasks}`); + console.log(` Estimated time: ~${estimatedMinutes} minute${estimatedMinutes > 1 ? 's' : ''}`); + console.log(` Estimated cost: ~$${estimatedCost}\n`); + } + + const { confirm } = await prompts({ + type: 'confirm', + name: 'confirm', + message: 'Start batch evaluation?', + initial: totalTasks <= 100, // Default to No for large batches + }); + + if (!confirm) { + console.log('Cancelled.'); + process.exit(0); + } + + // Step 6: Run batch evaluation + console.log('\n' + '='.repeat(60)); + const tracker = new ProgressTracker(totalTasks); + const evaluationStartTime = Date.now(); + + const evaluator = new BatchEvaluator({ + googleApiKey, + openaiApiKey, + concurrency: 3, + maxRetries: 2, + telemetry: false, + }); + + // Handle Ctrl+C gracefully + let isShuttingDown = false; + const handleShutdown = () => { + if (isShuttingDown) { + console.log('\n\n⚠️ Force quit detected. 
Exiting immediately...'); + process.exit(1); + } + + isShuttingDown = true; + console.log('\n\n⚠️ Shutdown requested. Saving partial results...'); + console.log(' (Press Ctrl+C again to force quit)\n'); + + // Get partial results + const partialResults = evaluator.cancel(); + + if (partialResults.length > 0) { + // Calculate summary for partial results + const durationMs = Date.now() - evaluationStartTime; + const partialOutput = { + results: partialResults, + summary: { + totalTasks: totalTasks, + successful: partialResults.filter((r) => r.status === 'success').length, + failed: partialResults.filter((r) => r.status === 'error').length, + durationMs, + resultsPerEvaluator: {}, + }, + }; + + // Save partial results + try { + const csvPath_partial = path.join(outputDir, 'results-partial.csv'); + const jsonPath_partial = path.join(outputDir, 'results-partial.json'); + const htmlPath_partial = path.join(outputDir, 'results-partial.html'); + + fs.writeFileSync(csvPath_partial, formatAsCSV(partialOutput)); + fs.writeFileSync(jsonPath_partial, formatAsJSON(partialOutput)); + fs.writeFileSync(htmlPath_partial, formatAsHTML(partialOutput)); + + console.log(`✓ Saved ${partialResults.length} results to:`); + console.log(` ${outputDir}/`); + console.log(` ├── results-partial.csv`); + console.log(` ├── results-partial.json`); + console.log(` └── results-partial.html`); + console.log(); + } catch (error) { + console.error('❌ Error saving partial results:', error instanceof Error ? 
error.message : String(error)); + } + } else { + console.log('No results to save yet.\n'); + } + + process.exit(0); + }; + + process.on('SIGINT', handleShutdown); + process.on('SIGTERM', handleShutdown); + + let output; + try { + output = await evaluator.evaluate(inputs, evaluatorIds, (result) => { + tracker.update(result); + tracker.display(); + }); + } finally { + // Remove signal handlers + process.off('SIGINT', handleShutdown); + process.off('SIGTERM', handleShutdown); + } + + // Display final summary + tracker.displaySummary(); + + // Step 7: Write output files + const csvPath_out = path.join(outputDir, 'results.csv'); + const jsonPath = path.join(outputDir, 'results.json'); + const htmlPath = path.join(outputDir, 'results.html'); + + try { + fs.writeFileSync(csvPath_out, formatAsCSV(output)); + fs.writeFileSync(jsonPath, formatAsJSON(output)); + fs.writeFileSync(htmlPath, formatAsHTML(output)); + + console.log('📄 Output files generated:'); + console.log(` ${outputDir}/`); + console.log(` ├── results.csv`); + console.log(` ├── results.json`); + console.log(` └── results.html`); + console.log(); + } catch (error) { + console.error('\n❌ Error writing output files:'); + if (error instanceof Error) { + console.error(` ${error.message}`); + } + console.error('\n⚠️ Evaluation completed but outputs could not be saved.'); + process.exit(1); + } + } catch (error) { + console.error('\n❌ Error:', error instanceof Error ? 
/**
 * Progress tracker for batch evaluation.
 *
 * Keeps overall and per-evaluator counts of completed, successful and failed
 * results, and renders a progress block to the terminal that is redrawn in
 * place using ANSI cursor-up / erase-display escape sequences.
 */
export class ProgressTracker {
  private totalTasks: number;
  private completed = 0;
  private successful = 0;
  private failed = 0;
  private startTime: number;
  private perEvaluator = new Map<
    string,
    { completed: number; successful: number; failed: number }
  >();
  // Number of terminal lines written by the most recent display() call.
  // Redraws must erase exactly this many lines: the block grows whenever a
  // new evaluator reports its first result, so the count cannot be derived
  // from the current map size.
  private renderedLines = 0;

  /**
   * @param totalTasks - Total number of results expected over the whole batch
   */
  constructor(totalTasks: number) {
    this.totalTasks = totalTasks;
    this.startTime = Date.now();
  }

  /**
   * Update progress with a new result
   *
   * @param result - A finished evaluation; any status other than 'success'
   *   is counted as a failure
   */
  update(result: BatchResult): void {
    const succeeded = result.status === 'success';

    this.completed++;
    if (succeeded) {
      this.successful++;
    } else {
      this.failed++;
    }

    // Track per-evaluator stats
    let stats = this.perEvaluator.get(result.evaluatorId);
    if (!stats) {
      stats = { completed: 0, successful: 0, failed: 0 };
      this.perEvaluator.set(result.evaluatorId, stats);
    }

    stats.completed++;
    if (succeeded) {
      stats.successful++;
    } else {
      stats.failed++;
    }
  }

  /**
   * Get current progress percentage
   *
   * @returns Whole-number percentage clamped to 0..100; 0 when there are no
   *   tasks (avoids NaN from dividing by zero)
   */
  getPercentage(): number {
    if (this.totalTasks <= 0) return 0;

    const percentage = Math.round((this.completed / this.totalTasks) * 100);
    return Math.min(100, Math.max(0, percentage));
  }

  /**
   * Get elapsed time in seconds
   */
  getElapsedSeconds(): number {
    return Math.round((Date.now() - this.startTime) / 1000);
  }

  /**
   * Estimate remaining time in seconds
   *
   * @returns Average time per completed task multiplied by the tasks still
   *   outstanding; 0 before the first result arrives
   */
  getEstimatedRemainingSeconds(): number {
    if (this.completed === 0) return 0;

    const elapsed = Date.now() - this.startTime;
    const avgTimePerTask = elapsed / this.completed;
    // Never report a negative estimate if more results arrive than expected
    const remaining = Math.max(0, this.totalTasks - this.completed);

    return Math.round((avgTimePerTask * remaining) / 1000);
  }

  /**
   * Format elapsed time as human-readable string
   */
  formatElapsed(): string {
    return this.formatSeconds(this.getElapsedSeconds());
  }

  /**
   * Format estimated remaining time as human-readable string
   */
  formatEstimatedRemaining(): string {
    return this.formatSeconds(this.getEstimatedRemainingSeconds());
  }

  /**
   * Generate progress bar
   *
   * @param width - Total number of bar characters
   * @returns `width` characters: filled blocks followed by empty blocks
   */
  getProgressBar(width = 20): string {
    const percentage = this.getPercentage();
    const filled = Math.round((percentage / 100) * width);
    const empty = width - filled;

    return '█'.repeat(filled) + '░'.repeat(empty);
  }

  /**
   * Display progress in terminal, replacing the previously drawn block
   */
  display(): void {
    this.clearRendered();

    const lines: string[] = [
      '',
      'Processing evaluations...',
      `${this.getProgressBar()} ${this.getPercentage()}% (${this.completed}/${this.totalTasks})`,
    ];

    // Show per-evaluator progress
    for (const [evalId, stats] of this.perEvaluator.entries()) {
      // completed === successful + failed, so "no failures" is the only
      // condition needed to pick the icon
      const status = stats.failed === 0 ? '✓' : '✗';
      lines.push(
        ` ${status} ${evalId}: ${stats.successful}/${stats.completed} successful`
      );
    }

    lines.push(
      '',
      `⏱ Elapsed: ${this.formatElapsed()} | Estimated remaining: ${this.formatEstimatedRemaining()}`
    );

    console.log(lines.join('\n'));
    this.renderedLines = lines.length;
  }

  /**
   * Display final summary, erasing the live progress block if one was drawn
   */
  displaySummary(): void {
    this.clearRendered();

    console.log('\n✅ Batch evaluation completed!\n');
    console.log(`Total tasks: ${this.totalTasks}`);
    console.log(`Successful: ${this.successful} ✓`);
    console.log(`Failed: ${this.failed} ✗`);
    console.log(`Duration: ${this.formatElapsed()}`);

    // Show per-evaluator summary
    if (this.perEvaluator.size > 1) {
      console.log('\nResults per evaluator:');
      for (const [evalId, stats] of this.perEvaluator.entries()) {
        console.log(
          ` ${evalId}: ${stats.successful} successful, ${stats.failed} failed`
        );
      }
    }
    console.log();
  }

  /**
   * Render a duration as `Ns` below one minute, otherwise `Mm Ss`
   */
  private formatSeconds(seconds: number): string {
    if (seconds < 60) return `${seconds}s`;

    const minutes = Math.floor(seconds / 60);
    const remainingSeconds = seconds % 60;
    return `${minutes}m ${remainingSeconds}s`;
  }

  /**
   * Erase the block written by the last display() call, if any
   */
  private clearRendered(): void {
    if (this.renderedLines === 0) return;

    process.stdout.write(`\x1b[${this.renderedLines}A`); // Move cursor up
    process.stdout.write('\x1b[J'); // Clear from cursor to end of screen
    this.renderedLines = 0;
  }
}
/**
 * Summary statistics for batch evaluation
 */
export interface BatchSummary {
  /** Number of evaluation tasks in the batch */
  totalTasks: number;
  /** Number of results whose status is 'success' */
  successful: number;
  /** Number of results whose status is 'error' */
  failed: number;
  /** Duration of the evaluation run in milliseconds */
  durationMs: number;
  /** Success/failure counts keyed by evaluator id */
  resultsPerEvaluator: Record<string, { successful: number; failed: number }>;
}

/**
 * Complete batch evaluation output
 */
export interface BatchOutput {
  /** Individual evaluation results */
  results: BatchResult[];
  /** Aggregate statistics for the run */
  summary: BatchSummary;
}

/**
 * Configuration for batch evaluation
 */
export interface BatchConfig {
  /** Google API key, for evaluators that require one */
  googleApiKey?: string;
  /** OpenAI API key, for evaluators that require one */
  openaiApiKey?: string;
  /** Concurrency setting for the batch run */
  concurrency?: number;
  /** Retry setting for the batch run */
  maxRetries?: number;
  /** Telemetry flag passed to the batch evaluator */
  telemetry?: boolean;
}
It was a warm, sunny day.",3,textbook,simple +2,"The photosynthesis process converts light energy into chemical energy.",5,science,biology diff --git a/sdks/typescript/tests/integration/batch.integration.test.ts b/sdks/typescript/tests/integration/batch.integration.test.ts new file mode 100644 index 0000000..e4c5111 --- /dev/null +++ b/sdks/typescript/tests/integration/batch.integration.test.ts @@ -0,0 +1,192 @@ +import { describe, it, expect, beforeAll } from 'vitest'; +import { config } from 'dotenv'; +import { BatchEvaluator } from '../../src/batch/evaluator.js'; +import type { BatchInput } from '../../src/batch/types.js'; +import * as fs from 'fs'; +import { parse } from 'csv-parse/sync'; +import * as path from 'path'; + +// Load .env file for testing convenience +config(); + +/** + * Batch Evaluator Integration Tests + * + * Lightweight integration test with 2 rows and 1 evaluator (sentence-structure). + * Verifies the full batch evaluation flow works end-to-end with real API calls. + * + * To run: + * ```bash + * RUN_INTEGRATION_TESTS=true npm run test:integration + * ``` + */ + +const SKIP_INTEGRATION = !process.env.RUN_INTEGRATION_TESTS && + !process.env.OPENAI_API_KEY; + +const describeIntegration = SKIP_INTEGRATION ? 
describe.skip : describe; + +// Test timeout: 2 minutes (generous for API calls) +const TEST_TIMEOUT_MS = 2 * 60 * 1000; + +describeIntegration('Batch Evaluator - Integration', () => { + let evaluator: BatchEvaluator; + + beforeAll(() => { + if (SKIP_INTEGRATION) { + console.log('⏭️ Skipping batch integration tests (no API keys or RUN_INTEGRATION_TESTS not set)'); + return; + } + + evaluator = new BatchEvaluator({ + openaiApiKey: process.env.OPENAI_API_KEY!, + concurrency: 2, // Process both rows in parallel + maxRetries: 2, + telemetry: false, + }); + + console.log('\n' + '='.repeat(80)); + console.log('BATCH EVALUATOR - INTEGRATION TEST'); + console.log('='.repeat(80)); + console.log('Testing with 2 rows, 1 evaluator (sentence-structure)'); + console.log('='.repeat(80)); + }); + + it( + 'should process sample CSV end-to-end', + async () => { + // Read test CSV + const csvPath = path.join(__dirname, '../fixtures/batch-test.csv'); + const content = fs.readFileSync(csvPath, 'utf-8'); + const records = parse(content, { + columns: true, + skip_empty_lines: true, + trim: true, + }); + + // Helper to find column case-insensitively + function findColumn(row: any, columnName: string): string | undefined { + const normalizedTarget = columnName.toLowerCase().trim(); + for (const key of Object.keys(row)) { + if (key.toLowerCase().trim() === normalizedTarget) { + return key; + } + } + return undefined; + } + + const firstRow = records[0]; + const textColumn = findColumn(firstRow, 'text')!; + const gradeColumn = findColumn(firstRow, 'grade')!; + + const inputs: BatchInput[] = records.map((row: any, index: number) => ({ + text: row[textColumn], + grade: row[gradeColumn], + rowIndex: index + 1, + originalRow: row, + })); + + console.log(`\n📊 Processing ${inputs.length} rows...`); + + // Run batch evaluation + const startTime = Date.now(); + const output = await evaluator.evaluate( + inputs, + ['sentence-structure'], + (result) => { + console.log(` ✓ Row ${result.rowIndex} - 
${result.status}: ${result.score || result.error}`); + } + ); + const duration = Date.now() - startTime; + + console.log(`\n⏱ Completed in ${Math.round(duration / 1000)}s\n`); + + // Verify results structure + expect(output).toBeDefined(); + expect(output.results).toBeDefined(); + expect(output.summary).toBeDefined(); + + // Should have 2 results (2 rows × 1 evaluator) + expect(output.results).toHaveLength(2); + + // Verify each result has expected fields + for (const result of output.results) { + expect(result.rowIndex).toBeGreaterThan(0); + expect(result.text).toBeTruthy(); + expect(result.grade).toBeTruthy(); + expect(result.evaluatorId).toBe('sentence-structure'); + expect(result.status).toMatch(/success|error/); + expect(result.processingTimeMs).toBeGreaterThan(0); + + if (result.status === 'success') { + expect(result.score).toBeTruthy(); + expect(result.reasoning).toBeTruthy(); + } else { + expect(result.error).toBeTruthy(); + } + } + + // Verify summary + expect(output.summary.totalTasks).toBe(2); + expect(output.summary.successful + output.summary.failed).toBe(2); + expect(output.summary.durationMs).toBeGreaterThan(0); + expect(output.summary.resultsPerEvaluator).toHaveProperty('sentence-structure'); + + // Log summary + console.log('📊 Summary:'); + console.log(` Total: ${output.summary.totalTasks}`); + console.log(` Successful: ${output.summary.successful} ✓`); + console.log(` Failed: ${output.summary.failed} ✗`); + console.log(` Duration: ${Math.round(output.summary.durationMs / 1000)}s`); + + // At least 1 should succeed (allow for occasional API issues) + expect(output.summary.successful).toBeGreaterThan(0); + }, + TEST_TIMEOUT_MS + ); + + it( + 'should handle multiple evaluators with same inputs', + async () => { + // Skip if Google key not available + if (!process.env.GOOGLE_API_KEY) { + console.log('⏭️ Skipping multi-evaluator test (no GOOGLE_API_KEY)'); + return; + } + + // Single row, two evaluators + const inputs: BatchInput[] = [ + { text: 'The 
/**
 * Resolve the actual key in `row` that matches `columnName`, ignoring case
 * and surrounding whitespace (same helper as in batch/index.ts).
 *
 * @param row - Parsed CSV record whose keys are the header names
 * @param columnName - Column to look for
 * @returns The first matching key exactly as it appears in `row`, or
 *   `undefined` when no key matches
 */
function findColumn(row: any, columnName: string): string | undefined {
  const wanted = columnName.toLowerCase().trim();
  const matches = (header: string): boolean =>
    header.toLowerCase().trim() === wanted;

  return Object.keys(row).find(matches);
}
expect(findColumn(row, 'Text')).toBe('TEXT'); + expect(findColumn(row, 'TEXT')).toBe('TEXT'); + expect(findColumn(row, 'grade')).toBe('GRADE'); + }); + + it('should find columns with whitespace in name', () => { + const row = { ' text ': 'sample', ' GRADE ': '3' }; + + expect(findColumn(row, 'text')).toBe(' text '); + expect(findColumn(row, 'grade')).toBe(' GRADE '); + }); + + it('should handle mixed case CSV', () => { + const row = { Text: 'sample', Grade: '3' }; + + expect(findColumn(row, 'text')).toBe('Text'); + expect(findColumn(row, 'TEXT')).toBe('Text'); + expect(findColumn(row, 'grade')).toBe('Grade'); + }); + + it('should return undefined for missing columns', () => { + const row = { foo: 'bar' }; + + expect(findColumn(row, 'text')).toBeUndefined(); + expect(findColumn(row, 'grade')).toBeUndefined(); + }); + + it('should parse CSV with column name variants', () => { + const csvPath = path.join(__dirname, '../../fixtures/batch-test.csv'); + const content = fs.readFileSync(csvPath, 'utf-8'); + const records = parse(content, { + columns: true, + skip_empty_lines: true, + trim: true, + }); + + expect(records.length).toBe(2); + + // Find columns (CSV has "TEXT" and "Grade" - trim option normalizes whitespace) + const firstRow = records[0]; + const textColumn = findColumn(firstRow, 'text'); + const gradeColumn = findColumn(firstRow, 'grade'); + + expect(textColumn).toBe('TEXT'); // Uppercase in CSV + expect(gradeColumn).toBe('Grade'); // Mixed case in CSV (whitespace trimmed) + + // Should be able to read values + expect(firstRow[textColumn!]).toBeTruthy(); + expect(firstRow[gradeColumn!]).toBe('3'); + }); + + it('should filter out empty rows', () => { + const records = [ + { text: 'Row 1', grade: '3' }, + { text: '', grade: '4' }, // Empty text + { text: 'Row 3', grade: '' }, // Empty grade + { text: 'Row 4', grade: '5' }, + ]; + + const filtered = records.filter((row) => row.text && row.grade); + + expect(filtered).toHaveLength(2); + 
expect(filtered[0].text).toBe('Row 1'); + expect(filtered[1].text).toBe('Row 4'); + }); +}); diff --git a/sdks/typescript/tests/unit/batch/formatters.test.ts b/sdks/typescript/tests/unit/batch/formatters.test.ts new file mode 100644 index 0000000..3ee84db --- /dev/null +++ b/sdks/typescript/tests/unit/batch/formatters.test.ts @@ -0,0 +1,252 @@ +import { describe, it, expect } from 'vitest'; +import { formatAsCSV, formatAsJSON, formatAsHTML } from '../../../src/batch/formatters.js'; +import type { BatchOutput, BatchResult } from '../../../src/batch/types.js'; + +describe('Batch Formatters', () => { + const sampleResults: BatchResult[] = [ + { + rowIndex: 1, + text: 'The cat sat on the mat.', + grade: '3', + evaluatorId: 'vocabulary', + status: 'success', + score: 'slightly complex', + reasoning: 'Simple vocabulary', + processingTimeMs: 1250, + originalRow: { row_id: '1', text: 'The cat sat on the mat.', grade: '3', source: 'test' }, + }, + { + rowIndex: 1, + text: 'The cat sat on the mat.', + grade: '3', + evaluatorId: 'sentence-structure', + status: 'success', + score: 'Moderately Complex', + reasoning: 'Simple sentence structure', + processingTimeMs: 1100, + originalRow: { row_id: '1', text: 'The cat sat on the mat.', grade: '3', source: 'test' }, + }, + { + rowIndex: 2, + text: 'The quick brown fox jumps over the lazy dog.', + grade: '4', + evaluatorId: 'vocabulary', + status: 'error', + error: 'API timeout', + processingTimeMs: 5000, + originalRow: { row_id: '2', text: 'The quick brown fox jumps over the lazy dog.', grade: '4', source: 'test' }, + }, + ]; + + const sampleOutput: BatchOutput = { + results: sampleResults, + summary: { + totalTasks: 3, + successful: 2, + failed: 1, + durationMs: 7500, + resultsPerEvaluator: { + vocabulary: { successful: 1, failed: 1 }, + 'sentence-structure': { successful: 1, failed: 0 }, + }, + }, + }; + + describe('formatAsCSV', () => { + it('should format results as CSV with columns per evaluator', () => { + const csv = 
formatAsCSV(sampleOutput); + + // Should include original columns + expect(csv).toContain('row_id'); + expect(csv).toContain('source'); + + // Should have evaluator-specific columns (not "evaluator" column) + expect(csv).toContain('vocabulary_score'); + expect(csv).toContain('vocabulary_reasoning'); + expect(csv).toContain('vocabulary_status'); + expect(csv).toContain('sentence_structure_score'); + expect(csv).toContain('sentence_structure_reasoning'); + expect(csv).toContain('sentence_structure_status'); + + // Should have one row per input row (not per evaluator) + expect(csv.split('\n')).toHaveLength(3); // Header + 2 data rows + }); + + it('should escape CSV fields with quotes', () => { + const resultsWithCommas: BatchResult[] = [ + { + rowIndex: 1, + text: 'Text with, comma', + grade: '3', + evaluatorId: 'vocabulary', + status: 'success', + score: 'slightly complex', + reasoning: 'Reasoning with, comma', + processingTimeMs: 1000, + originalRow: { text: 'Text with, comma', grade: '3' }, + }, + ]; + + const output: BatchOutput = { + results: resultsWithCommas, + summary: { + totalTasks: 1, + successful: 1, + failed: 0, + durationMs: 1000, + resultsPerEvaluator: { vocabulary: { successful: 1, failed: 0 } }, + }, + }; + + const csv = formatAsCSV(output); + expect(csv).toContain('"Text with, comma"'); + expect(csv).toContain('"Reasoning with, comma"'); + }); + + it('should handle errors in evaluator columns', () => { + const csv = formatAsCSV(sampleOutput); + + // Row 2 has vocabulary error - should have empty score, error as reasoning, status=error + expect(csv).toContain('API timeout'); // Error message in reasoning column + expect(csv).toContain('error'); // Status column + }); + + it('should preserve original columns in order', () => { + const csv = formatAsCSV(sampleOutput); + const lines = csv.split('\n'); + + // First line should be headers with original columns first + const headers = lines[0]; + expect(headers).toContain('row_id'); + 
expect(headers).toContain('text'); + expect(headers).toContain('grade'); + expect(headers).toContain('source'); + + // Should have evaluator columns (not single "evaluator" column) + expect(headers).toContain('vocabulary_score'); + + // Data rows should have original data first - now one row per input row + expect(lines[1].startsWith('1,')).toBe(true); // Row 1 + expect(lines[2].startsWith('2,')).toBe(true); // Row 2 + }); + }); + + describe('formatAsJSON', () => { + it('should format results as valid JSON', () => { + const json = formatAsJSON(sampleOutput); + + expect(() => JSON.parse(json)).not.toThrow(); + }); + + it('should include results and summary', () => { + const json = formatAsJSON(sampleOutput); + const parsed = JSON.parse(json); + + expect(parsed).toHaveProperty('results'); + expect(parsed).toHaveProperty('summary'); + expect(parsed.results).toHaveLength(3); + }); + + it('should preserve all result fields', () => { + const json = formatAsJSON(sampleOutput); + const parsed = JSON.parse(json); + + const firstResult = parsed.results[0]; + expect(firstResult).toHaveProperty('rowIndex'); + expect(firstResult).toHaveProperty('text'); + expect(firstResult).toHaveProperty('grade'); + expect(firstResult).toHaveProperty('evaluatorId'); + expect(firstResult).toHaveProperty('status'); + expect(firstResult).toHaveProperty('processingTimeMs'); + }); + + it('should include summary statistics', () => { + const json = formatAsJSON(sampleOutput); + const parsed = JSON.parse(json); + + expect(parsed.summary.totalTasks).toBe(3); + expect(parsed.summary.successful).toBe(2); + expect(parsed.summary.failed).toBe(1); + expect(parsed.summary.durationMs).toBe(7500); + }); + }); + + describe('formatAsHTML', () => { + it('should generate valid HTML', () => { + const html = formatAsHTML(sampleOutput); + + expect(html).toContain(''); + expect(html).toContain(''); + }); + + it('should include AG Grid script', () => { + const html = formatAsHTML(sampleOutput); + + 
expect(html).toContain('ag-grid-community'); + }); + + it('should include summary statistics', () => { + const html = formatAsHTML(sampleOutput); + + expect(html).toContain('3'); // Total tasks + expect(html).toContain('2'); // Successful + expect(html).toContain('1'); // Failed + }); + + it('should include grid data as JSON', () => { + const html = formatAsHTML(sampleOutput); + + expect(html).toContain('const rowData'); + expect(html).toContain('vocabulary_status'); + expect(html).toContain('sentence_structure_status'); + }); + + it('should include HTML-like content in JSON data', () => { + const resultsWithHTML: BatchResult[] = [ + { + rowIndex: 1, + text: 'Text with ', + grade: '3', + evaluatorId: 'vocabulary', + status: 'success', + score: 'slightly complex', + reasoning: 'Reasoning with bold', + processingTimeMs: 1000, + originalRow: { text: 'Text with ', grade: '3' }, + }, + ]; + + const output: BatchOutput = { + results: resultsWithHTML, + summary: { + totalTasks: 1, + successful: 1, + failed: 0, + durationMs: 1000, + resultsPerEvaluator: { vocabulary: { successful: 1, failed: 0 } }, + }, + }; + + const html = formatAsHTML(output); + + // JSON.stringify automatically escapes HTML, so it's safe + // The content will be in the JSON data but escaped + expect(html).toContain('const rowData'); + expect(html).toContain('vocabulary'); + }); + + it('should include column definitions with evaluator columns', () => { + const html = formatAsHTML(sampleOutput); + + expect(html).toContain('columnDefs'); + expect(html).toContain('field: \'row\''); + expect(html).toContain('field: \'text\''); + + // Should have evaluator-specific columns (not single "status" column) + expect(html).toContain('vocabulary_status'); + expect(html).toContain('vocabulary_score'); + expect(html).toContain('sentence_structure_status'); + }); + }); +}); diff --git a/sdks/typescript/tests/unit/batch/limits.test.ts b/sdks/typescript/tests/unit/batch/limits.test.ts new file mode 100644 index 
0000000..7db5f4f --- /dev/null +++ b/sdks/typescript/tests/unit/batch/limits.test.ts @@ -0,0 +1,107 @@ +import { describe, it, expect } from 'vitest'; + +describe('Batch Size Limits', () => { + const MAX_TASKS = 500; + + it('should calculate total tasks correctly', () => { + const inputs = 100; + const evaluators = 3; + const totalTasks = inputs * evaluators; + + expect(totalTasks).toBe(300); + }); + + it('should accept batch under limit', () => { + const totalTasks = 100; + + expect(totalTasks).toBeLessThanOrEqual(MAX_TASKS); + }); + + it('should accept batch at limit', () => { + const totalTasks = 500; + + expect(totalTasks).toBeLessThanOrEqual(MAX_TASKS); + }); + + it('should reject batch over limit', () => { + const totalTasks = 501; + + expect(totalTasks).toBeGreaterThan(MAX_TASKS); + }); + + it('should show warning for batch > 100 tasks', () => { + const totalTasks = 150; + + expect(totalTasks).toBeGreaterThan(100); + expect(totalTasks).toBeLessThanOrEqual(MAX_TASKS); + }); + + it('should estimate time correctly', () => { + // Rough estimate: 2 seconds per task with concurrency=3 + const totalTasks = 300; + const estimatedSeconds = (totalTasks * 2) / 3; // Parallel processing + const estimatedMinutes = Math.ceil(estimatedSeconds / 60); + + expect(estimatedMinutes).toBeGreaterThan(0); + expect(estimatedMinutes).toBe(4); // 300 * 2 / 3 = 200s = 3.33m → 4m + }); + + it('should estimate cost correctly', () => { + const totalTasks = 300; + const estimatedCost = (totalTasks * 0.015).toFixed(2); + + expect(estimatedCost).toBe('4.50'); + }); + + describe('Edge Cases', () => { + it('should handle 1 row × 1 evaluator', () => { + const totalTasks = 1 * 1; + expect(totalTasks).toBe(1); + expect(totalTasks).toBeLessThanOrEqual(MAX_TASKS); + }); + + it('should handle max rows with 1 evaluator', () => { + const totalTasks = 500 * 1; + expect(totalTasks).toBe(500); + expect(totalTasks).toBeLessThanOrEqual(MAX_TASKS); + }); + + it('should reject 167 rows × 3 evaluators', () 
=> { + const totalTasks = 167 * 3; + expect(totalTasks).toBe(501); + expect(totalTasks).toBeGreaterThan(MAX_TASKS); + }); + + it('should accept 166 rows × 3 evaluators', () => { + const totalTasks = 166 * 3; + expect(totalTasks).toBe(498); + expect(totalTasks).toBeLessThanOrEqual(MAX_TASKS); + }); + }); + + describe('Suggestions for Over Limit', () => { + it('should suggest reducing rows', () => { + const currentRows = 200; + const evaluators = 3; + const totalTasks = currentRows * evaluators; // 600 + + expect(totalTasks).toBeGreaterThan(MAX_TASKS); + + // Calculate max rows for current evaluators + const maxRows = Math.floor(MAX_TASKS / evaluators); + expect(maxRows).toBe(166); + }); + + it('should suggest reducing evaluators', () => { + const rows = 300; + const currentEvaluators = 3; + const totalTasks = rows * currentEvaluators; // 900 + + expect(totalTasks).toBeGreaterThan(MAX_TASKS); + + // Calculate max evaluators for current rows + const maxEvaluators = Math.floor(MAX_TASKS / rows); + expect(maxEvaluators).toBe(1); + }); + }); +}); diff --git a/sdks/typescript/tsup.config.ts b/sdks/typescript/tsup.config.ts index 1a81469..de81c72 100644 --- a/sdks/typescript/tsup.config.ts +++ b/sdks/typescript/tsup.config.ts @@ -1,7 +1,7 @@ import { defineConfig } from 'tsup'; export default defineConfig({ - entry: ['src/index.ts'], + entry: ['src/index.ts', 'src/batch/index.ts'], format: ['esm', 'cjs'], dts: true, splitting: false, From b33922613e66f1dc012fb4de1458221ecf9cf4a7 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Wed, 4 Mar 2026 20:53:15 -0800 Subject: [PATCH 2/4] fix typing --- sdks/typescript/tests/unit/batch/csv-parsing.test.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sdks/typescript/tests/unit/batch/csv-parsing.test.ts b/sdks/typescript/tests/unit/batch/csv-parsing.test.ts index 414b815..4b0fcfb 100644 --- a/sdks/typescript/tests/unit/batch/csv-parsing.test.ts +++ 
b/sdks/typescript/tests/unit/batch/csv-parsing.test.ts @@ -70,8 +70,10 @@ describe('CSV Parsing Robustness', () => { expect(gradeColumn).toBe('Grade'); // Mixed case in CSV (whitespace trimmed) // Should be able to read values - expect(firstRow[textColumn!]).toBeTruthy(); - expect(firstRow[gradeColumn!]).toBe('3'); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + expect((firstRow as any)[textColumn!]).toBeTruthy(); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + expect((firstRow as any)[gradeColumn!]).toBe('3'); }); it('should filter out empty rows', () => { From 177f18057775cf23f7cd6117bde5e46e98346821 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Wed, 4 Mar 2026 21:01:15 -0800 Subject: [PATCH 3/4] remove json output --- sdks/typescript/src/batch/README.md | 15 ++++----------- sdks/typescript/src/batch/formatters.ts | 7 ------- sdks/typescript/src/batch/index.ts | 8 +------- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/sdks/typescript/src/batch/README.md b/sdks/typescript/src/batch/README.md index 8b4fc91..d3636cc 100644 --- a/sdks/typescript/src/batch/README.md +++ b/sdks/typescript/src/batch/README.md @@ -1,6 +1,6 @@ # Batch CSV Evaluator -Evaluate multiple texts from a CSV file using one or more evaluators, with results output in CSV, JSON, and HTML formats. +Evaluate multiple texts from a CSV file using one or more evaluators, with results output in CSV and HTML formats. ## Usage @@ -41,7 +41,6 @@ The output directory is automatically created with a human-readable timestamp: ``` batch-results-2024-02-07_14-30-22/ ├── results.csv -├── results.json └── results.html ``` @@ -69,17 +68,13 @@ See `tests/fixtures/sample-batch-input.csv` for a complete example. ### Output Files -Three files are generated: +Two files are generated: -1. **CSV** (`batch-results-YYYY-MM-DD.csv`): +1. 
**CSV** (`results.csv`): - Spreadsheet-compatible format - Columns: Row, Text, Grade, Evaluator, Status, Score, Reasoning, Error, Processing Time -2. **JSON** (`batch-results-YYYY-MM-DD.json`): - - Structured data with full results and summary statistics - - Easy to parse programmatically - -3. **HTML** (`batch-results-YYYY-MM-DD.html`): +2. **HTML** (`results.html`): - Interactive table with sorting and filtering (AG Grid) - Color-coded status indicators - Summary statistics dashboard @@ -198,7 +193,6 @@ Duration: 45s 📄 Output files generated: ./batch-results-2024-02-07_14-30-22/ ├── results.csv - ├── results.json └── results.html ``` @@ -228,7 +222,6 @@ Example: ✓ Saved 15 results to: ./batch-results-2024-02-07_14-30-22/ ├── results-partial.csv - ├── results-partial.json └── results-partial.html ``` diff --git a/sdks/typescript/src/batch/formatters.ts b/sdks/typescript/src/batch/formatters.ts index f80a63e..e1ea75f 100644 --- a/sdks/typescript/src/batch/formatters.ts +++ b/sdks/typescript/src/batch/formatters.ts @@ -110,13 +110,6 @@ function escapeCSV(field: string): string { return field; } -/** - * Format results as JSON - */ -export function formatAsJSON(output: BatchOutput): string { - return JSON.stringify(output, null, 2); -} - /** * Format results as HTML with AG Grid */ diff --git a/sdks/typescript/src/batch/index.ts b/sdks/typescript/src/batch/index.ts index 69948a3..e8f5625 100644 --- a/sdks/typescript/src/batch/index.ts +++ b/sdks/typescript/src/batch/index.ts @@ -9,7 +9,7 @@ import { getAvailableEvaluators, getRequiredApiKeys, } from './evaluator.js'; -import { formatAsCSV, formatAsJSON, formatAsHTML } from './formatters.js'; +import { formatAsCSV, formatAsHTML } from './formatters.js'; import { ProgressTracker } from './progress.js'; import type { BatchInput } from './types.js'; @@ -347,17 +347,14 @@ async function main() { // Save partial results try { const csvPath_partial = path.join(outputDir, 'results-partial.csv'); - const jsonPath_partial = 
path.join(outputDir, 'results-partial.json'); const htmlPath_partial = path.join(outputDir, 'results-partial.html'); fs.writeFileSync(csvPath_partial, formatAsCSV(partialOutput)); - fs.writeFileSync(jsonPath_partial, formatAsJSON(partialOutput)); fs.writeFileSync(htmlPath_partial, formatAsHTML(partialOutput)); console.log(`✓ Saved ${partialResults.length} results to:`); console.log(` ${outputDir}/`); console.log(` ├── results-partial.csv`); - console.log(` ├── results-partial.json`); console.log(` └── results-partial.html`); console.log(); } catch (error) { @@ -390,18 +387,15 @@ async function main() { // Step 7: Write output files const csvPath_out = path.join(outputDir, 'results.csv'); - const jsonPath = path.join(outputDir, 'results.json'); const htmlPath = path.join(outputDir, 'results.html'); try { fs.writeFileSync(csvPath_out, formatAsCSV(output)); - fs.writeFileSync(jsonPath, formatAsJSON(output)); fs.writeFileSync(htmlPath, formatAsHTML(output)); console.log('📄 Output files generated:'); console.log(` ${outputDir}/`); console.log(` ├── results.csv`); - console.log(` ├── results.json`); console.log(` └── results.html`); console.log(); } catch (error) { From e6621b55bc10445d3ae47340e639651ee04f214e Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Wed, 4 Mar 2026 23:04:26 -0800 Subject: [PATCH 4/4] implement a html template and formatter --- sdks/typescript/src/batch/formatters.ts | 588 ++++++----- sdks/typescript/src/batch/index.ts | 17 +- .../typescript/src/batch/report-template.html | 914 ++++++++++++++++++ sdks/typescript/src/types/html.d.ts | 4 + .../tests/unit/batch/formatters.test.ts | 510 ++++++---- sdks/typescript/tsup.config.ts | 3 +- sdks/typescript/vitest.config.ts | 17 +- 7 files changed, 1537 insertions(+), 516 deletions(-) create mode 100644 sdks/typescript/src/batch/report-template.html create mode 100644 sdks/typescript/src/types/html.d.ts diff --git a/sdks/typescript/src/batch/formatters.ts b/sdks/typescript/src/batch/formatters.ts 
index e1ea75f..2a08735 100644 --- a/sdks/typescript/src/batch/formatters.ts +++ b/sdks/typescript/src/batch/formatters.ts @@ -1,49 +1,111 @@ import type { BatchOutput, BatchResult } from './types.js'; +import reportTemplate from './report-template.html'; + +// ---- Constants ---- + +const GLA_EVALUATOR_ID = 'grade-level-appropriateness'; + +const GRADE_BANDS = ['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR'] as const; +type GradeBand = typeof GRADE_BANDS[number]; + +// Complexity string scores → numeric (supports both Title Case and lowercase from evaluators) +const COMPLEXITY_SCORE_MAP: Record = { + 'slightly complex': 1, + 'moderately complex': 2, + 'very complex': 3, + 'exceedingly complex': 4, +}; + +// ---- Helpers ---- + +function evaluatorDisplayName(id: string): string { + return id.split('-').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' '); +} + +/** Maps a raw grade string (K, 1, 2 … 12, CCR) to a GRADE_BANDS index (0–5). */ +function gradeToBandIndex(grade: string): number { + const g = String(grade).trim().toUpperCase().replace(/^0+/, ''); + if (g === 'K' || g === 'KINDERGARTEN') return 0; + if (g === '1') return 0; + if (g === '2' || g === '3') return 1; + if (g === '4' || g === '5') return 2; + if (g === '6' || g === '7' || g === '8') return 3; + if (g === '9' || g === '10') return 4; + if (g === '11' || g === '12' || g === 'CCR') return 5; + return -1; +} + +/** Maps a GLA score string (e.g. "4-5") to a GRADE_BANDS index. 
*/ +function glaBandToIndex(band: string): number { + return GRADE_BANDS.indexOf(band as GradeBand); +} + +function getGLAStatus(inputGrade: string, glaBand: string): 'on-band' | 'adjacent' | 'off-target' { + const inputIdx = gradeToBandIndex(inputGrade); + const glaIdx = glaBandToIndex(glaBand); + if (inputIdx === -1 || glaIdx === -1) return 'off-target'; + const diff = Math.abs(inputIdx - glaIdx); + if (diff === 0) return 'on-band'; + if (diff === 1) return 'adjacent'; + return 'off-target'; +} + +function complexityToNumeric(score: string): number | undefined { + return COMPLEXITY_SCORE_MAP[score.toLowerCase().trim()]; +} + +function complexityScoreLabel(avg: number): string { + if (avg < 1.5) return 'Slightly Complex'; + if (avg < 2.5) return 'Moderately Complex'; + if (avg < 3.5) return 'Very Complex'; + return 'Exceedingly Complex'; +} + +/** Stub — returns hard-coded insights. Replace with real logic later. */ +function generateInsights(): string[] { + return [ + 'Review texts marked as Off Target — they may need content revision or grade-level adjustment before distribution.', + 'Texts evaluated as Adjacent may benefit from light scaffolding strategies such as vocabulary pre-teaching.', + 'Higher grade bands tend to show greater text complexity. 
Consider whether complexity aligns with instructional goals.', + ]; +} + +// ---- Shared grouping utility ---- -/** - * Group results by row index - */ function groupResultsByRow(results: BatchResult[]): Map { const grouped = new Map(); - for (const result of results) { if (!grouped.has(result.rowIndex)) { grouped.set(result.rowIndex, []); } grouped.get(result.rowIndex)!.push(result); } - return grouped; } -/** - * Format evaluator ID as column prefix (kebab-case to snake_case) - */ +// ---- CSV Formatter ---- + function formatEvaluatorPrefix(evaluatorId: string): string { return evaluatorId.replace(/-/g, '_'); } -/** - * Format results as CSV with columns per evaluator - */ +function escapeCSV(field: string): string { + if (field.includes(',') || field.includes('"') || field.includes('\n')) { + return `"${field.replace(/"/g, '""')}"`; + } + return field; +} + export function formatAsCSV(output: BatchOutput): string { if (output.results.length === 0) { return ''; } - // Group results by row const groupedByRow = groupResultsByRow(output.results); - - // Get unique evaluator IDs (sorted for consistent column order) - const evaluatorIds = Array.from( - new Set(output.results.map(r => r.evaluatorId)) - ).sort(); - - // Get original column names from first result + const evaluatorIds = Array.from(new Set(output.results.map(r => r.evaluatorId))).sort(); const firstResult = output.results[0]; const originalColumns = Object.keys(firstResult.originalRow); - // Build headers: original columns + evaluator columns (score, reasoning, status) const evaluatorColumns: string[] = []; for (const evalId of evaluatorIds) { const prefix = formatEvaluatorPrefix(evalId); @@ -53,7 +115,6 @@ export function formatAsCSV(output: BatchOutput): string { } const headers = [...originalColumns, ...evaluatorColumns]; - // Build rows (one per input row) const rows: string[][] = []; const sortedRowIndices = Array.from(groupedByRow.keys()).sort((a, b) => a - b); @@ -61,35 +122,20 @@ export function 
formatAsCSV(output: BatchOutput): string { const resultsForRow = groupedByRow.get(rowIndex)!; const firstResultForRow = resultsForRow[0]; - // Original column values const originalValues = originalColumns.map(col => escapeCSV(String(firstResultForRow.originalRow[col] || '')) ); - // Evaluator column values const evaluatorValues: string[] = []; for (const evalId of evaluatorIds) { const result = resultsForRow.find(r => r.evaluatorId === evalId); - if (result) { - // Score - if (result.status === 'success') { - evaluatorValues.push(escapeCSV(result.score || '')); - } else { - evaluatorValues.push(''); // Empty for errors - } - - // Reasoning - if (result.status === 'success') { - evaluatorValues.push(escapeCSV(result.reasoning || '')); - } else { - evaluatorValues.push(escapeCSV(result.error || '')); - } - - // Status + evaluatorValues.push(result.status === 'success' ? escapeCSV(result.score || '') : ''); + evaluatorValues.push(result.status === 'success' + ? escapeCSV(result.reasoning || '') + : escapeCSV(result.error || '')); evaluatorValues.push(result.status); } else { - // Evaluator not run for this row evaluatorValues.push('', '', 'not_run'); } } @@ -100,293 +146,217 @@ export function formatAsCSV(output: BatchOutput): string { return [headers, ...rows].map(row => row.join(',')).join('\n'); } -/** - * Escape CSV field (handle quotes and commas) - */ -function escapeCSV(field: string): string { - if (field.includes(',') || field.includes('"') || field.includes('\n')) { - return `"${field.replace(/"/g, '""')}"`; - } - return field; +// ---- HTML Formatter ---- + +export interface ReportMeta { + csvPath: string; + evaluatorIds: string[]; + reportId: string; + generatedAt: Date; + totalInputRows: number; } -/** - * Format results as HTML with AG Grid - */ -export function formatAsHTML(output: BatchOutput): string { - if (output.results.length === 0) { - return '

No results to display

'; +export function formatAsHTML(output: BatchOutput, meta: ReportMeta): string { + const { results } = output; + const byRow = groupResultsByRow(results); + const allRowIndices = Array.from(byRow.keys()).sort((a, b) => a - b); + + const allEvaluatorIds = Array.from(new Set(results.map(r => r.evaluatorId))).sort(); + const hasGLA = allEvaluatorIds.includes(GLA_EVALUATOR_ID); + const complexityIds = allEvaluatorIds.filter(id => id !== GLA_EVALUATOR_ID); + + // ---- Snapshot ---- + let processedRows = 0; + let erroredRows = 0; + for (const rowResults of byRow.values()) { + if (rowResults.some(r => r.status === 'error')) erroredRows++; + else processedRows++; } - // Group results by row - const groupedByRow = groupResultsByRow(output.results); - - // Get unique evaluator IDs (sorted) - const evaluatorIds = Array.from( - new Set(output.results.map(r => r.evaluatorId)) - ).sort(); - - // Convert grouped results to grid data (one row per input row) - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const gridData: any[] = []; - const sortedRowIndices = Array.from(groupedByRow.keys()).sort((a, b) => a - b); - - for (const rowIndex of sortedRowIndices) { - const resultsForRow = groupedByRow.get(rowIndex)!; - const firstResult = resultsForRow[0]; - - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const rowData: any = { - row: rowIndex, - text: firstResult.text.substring(0, 100) + (firstResult.text.length > 100 ? '...' : ''), - textFull: firstResult.text, - grade: firstResult.grade, - }; - - // Add evaluator-specific fields - for (const evalId of evaluatorIds) { - const result = resultsForRow.find(r => r.evaluatorId === evalId); - const prefix = formatEvaluatorPrefix(evalId); - - if (result) { - rowData[`${prefix}_status`] = result.status; - rowData[`${prefix}_score`] = result.status === 'success' ? (result.score || '') : ''; - rowData[`${prefix}_reasoning`] = result.status === 'success' ? 
(result.reasoning || '') : (result.error || ''); - } else { - rowData[`${prefix}_status`] = 'not_run'; - rowData[`${prefix}_score`] = ''; - rowData[`${prefix}_reasoning`] = ''; + // ---- GLA stats ---- + const glaCounts = { onBand: 0, adjacent: 0, offTarget: 0 }; + const rowGLAStatus = new Map(); + + if (hasGLA) { + for (const [rowIndex, rowResults] of byRow) { + const glaResult = rowResults.find(r => r.evaluatorId === GLA_EVALUATOR_ID); + if (glaResult && glaResult.status === 'success' && glaResult.score) { + const status = getGLAStatus(glaResult.grade, glaResult.score); + rowGLAStatus.set(rowIndex, { status, band: glaResult.score, reasoning: glaResult.reasoning || '' }); + if (status === 'on-band') glaCounts.onBand++; + else if (status === 'adjacent') glaCounts.adjacent++; + else glaCounts.offTarget++; } } - - gridData.push(rowData); } - return ` - - - - - Batch Evaluation Results - - - - - - - - - - -
-
-

📊 Batch Evaluation Results

-

Generated on ${new Date().toLocaleString()}

-
- -
-
-
${output.summary.totalTasks}
-
Total Tasks
-
-
-
${output.summary.successful}
-
Successful
-
-
-
${output.summary.failed}
-
Failed
-
-
-
${Math.round(output.summary.durationMs / 1000)}s
-
Duration
-
-
- -
- - -
- - - -`; + return row; + }); + + // ---- Assemble report data ---- + const reportData = { + meta: { + reportId: meta.reportId, + generatedAt: meta.generatedAt.toLocaleString('en-US', { + month: 'short', day: 'numeric', year: 'numeric', + hour: 'numeric', minute: '2-digit', hour12: true, + }), + csvPath: meta.csvPath, + evaluatorIds: meta.evaluatorIds, + evaluatorNames: meta.evaluatorIds.map(evaluatorDisplayName), + totalRows: meta.totalInputRows, + processedRows, + erroredRows, + }, + gradeLevelStats: { + onBand: glaCounts.onBand, + adjacent: glaCounts.adjacent, + offTarget: glaCounts.offTarget, + onBandPct: pct(glaCounts.onBand), + adjacentPct: pct(glaCounts.adjacent), + offTargetPct: pct(glaCounts.offTarget), + hasData: glaTotal > 0, + }, + complexityStats, + gradeBandDistribution: { + bands: [...GRADE_BANDS], + data: bandDist, + }, + complexityHeatmap: { + bands: [...GRADE_BANDS], + evaluators: complexityIds.map(evaluatorDisplayName), + evaluatorIds: complexityIds, + values: heatmapValues, + }, + insights: generateInsights(), + fullResults: { + originalColumns, + hasGLA, + complexityEvaluators: complexityIds.map(id => ({ + evaluatorId: id, + name: evaluatorDisplayName(id), + prefix: id.replace(/-/g, '_'), + })), + rows: fullResultsRows, + }, + }; + + // Inject serialized data into the template. + // Unicode-escape < > & so the JSON is safe inside a injection). 
+ const safeJson = JSON.stringify(reportData) + .replace(//g, '\\u003e') + .replace(/&/g, '\\u0026'); + + return reportTemplate.replace( + 'var REPORT_DATA = null; // __REPLACED_BY_FORMATTER__', + `var REPORT_DATA = ${safeJson};`, + ); } diff --git a/sdks/typescript/src/batch/index.ts b/sdks/typescript/src/batch/index.ts index e8f5625..3f0ec7b 100644 --- a/sdks/typescript/src/batch/index.ts +++ b/sdks/typescript/src/batch/index.ts @@ -9,7 +9,7 @@ import { getAvailableEvaluators, getRequiredApiKeys, } from './evaluator.js'; -import { formatAsCSV, formatAsHTML } from './formatters.js'; +import { formatAsCSV, formatAsHTML, type ReportMeta } from './formatters.js'; import { ProgressTracker } from './progress.js'; import type { BatchInput } from './types.js'; @@ -255,6 +255,17 @@ async function main() { fs.mkdirSync(outputDir, { recursive: true }); } + // Build report metadata used by the HTML formatter + const csvBasename = path.basename(csvPath, path.extname(csvPath)); + const reportTimestamp = `${now.getFullYear()}${String(now.getMonth() + 1).padStart(2, '0')}${String(now.getDate()).padStart(2, '0')}T${String(now.getHours()).padStart(2, '0')}${String(now.getMinutes()).padStart(2, '0')}`; + const reportMeta: ReportMeta = { + csvPath: path.resolve(csvPath), + evaluatorIds, + reportId: `${csvBasename.replace(/[^a-zA-Z0-9]/g, '_')}_${reportTimestamp}`, + generatedAt: now, + totalInputRows: inputs.length, + }; + // Step 5: Confirm and run const totalTasks = inputs.length * evaluatorIds.length; const MAX_TASKS = 500; @@ -350,7 +361,7 @@ async function main() { const htmlPath_partial = path.join(outputDir, 'results-partial.html'); fs.writeFileSync(csvPath_partial, formatAsCSV(partialOutput)); - fs.writeFileSync(htmlPath_partial, formatAsHTML(partialOutput)); + fs.writeFileSync(htmlPath_partial, formatAsHTML(partialOutput, reportMeta)); console.log(`✓ Saved ${partialResults.length} results to:`); console.log(` ${outputDir}/`); @@ -391,7 +402,7 @@ async function main() { try 
{ fs.writeFileSync(csvPath_out, formatAsCSV(output)); - fs.writeFileSync(htmlPath, formatAsHTML(output)); + fs.writeFileSync(htmlPath, formatAsHTML(output, reportMeta)); console.log('📄 Output files generated:'); console.log(` ${outputDir}/`); diff --git a/sdks/typescript/src/batch/report-template.html b/sdks/typescript/src/batch/report-template.html new file mode 100644 index 0000000..885a0a1 --- /dev/null +++ b/sdks/typescript/src/batch/report-template.html @@ -0,0 +1,914 @@ + + + + + + Evaluation Report + + + + + +
+ + + +
+
+ + + + diff --git a/sdks/typescript/src/types/html.d.ts b/sdks/typescript/src/types/html.d.ts new file mode 100644 index 0000000..448f7d1 --- /dev/null +++ b/sdks/typescript/src/types/html.d.ts @@ -0,0 +1,4 @@ +declare module '*.html' { + const content: string; + export default content; +} diff --git a/sdks/typescript/tests/unit/batch/formatters.test.ts b/sdks/typescript/tests/unit/batch/formatters.test.ts index 3ee84db..b3fcaf6 100644 --- a/sdks/typescript/tests/unit/batch/formatters.test.ts +++ b/sdks/typescript/tests/unit/batch/formatters.test.ts @@ -1,252 +1,358 @@ import { describe, it, expect } from 'vitest'; -import { formatAsCSV, formatAsJSON, formatAsHTML } from '../../../src/batch/formatters.js'; +import { formatAsCSV, formatAsHTML, type ReportMeta } from '../../../src/batch/formatters.js'; import type { BatchOutput, BatchResult } from '../../../src/batch/types.js'; -describe('Batch Formatters', () => { - const sampleResults: BatchResult[] = [ - { - rowIndex: 1, - text: 'The cat sat on the mat.', - grade: '3', - evaluatorId: 'vocabulary', - status: 'success', - score: 'slightly complex', - reasoning: 'Simple vocabulary', - processingTimeMs: 1250, - originalRow: { row_id: '1', text: 'The cat sat on the mat.', grade: '3', source: 'test' }, - }, - { - rowIndex: 1, - text: 'The cat sat on the mat.', - grade: '3', - evaluatorId: 'sentence-structure', - status: 'success', - score: 'Moderately Complex', - reasoning: 'Simple sentence structure', - processingTimeMs: 1100, - originalRow: { row_id: '1', text: 'The cat sat on the mat.', grade: '3', source: 'test' }, - }, - { - rowIndex: 2, - text: 'The quick brown fox jumps over the lazy dog.', - grade: '4', - evaluatorId: 'vocabulary', - status: 'error', - error: 'API timeout', - processingTimeMs: 5000, - originalRow: { row_id: '2', text: 'The quick brown fox jumps over the lazy dog.', grade: '4', source: 'test' }, - }, - ]; +// ---- Test fixtures ---- + +function makeResult(overrides: Partial): BatchResult { + 
return { + rowIndex: 1, + text: 'Sample text.', + grade: '5', + evaluatorId: 'vocabulary', + status: 'success', + score: 'slightly complex', + reasoning: 'ok', + processingTimeMs: 100, + originalRow: { text: 'Sample text.', grade: '5' }, + ...overrides, + }; +} - const sampleOutput: BatchOutput = { - results: sampleResults, +function makeOutput(results: BatchResult[]): BatchOutput { + return { + results, summary: { - totalTasks: 3, - successful: 2, - failed: 1, - durationMs: 7500, - resultsPerEvaluator: { - vocabulary: { successful: 1, failed: 1 }, - 'sentence-structure': { successful: 1, failed: 0 }, - }, + totalTasks: results.length, + successful: results.filter(r => r.status === 'success').length, + failed: results.filter(r => r.status === 'error').length, + durationMs: 1000, + resultsPerEvaluator: {}, }, }; +} + +function makeMeta(overrides?: Partial): ReportMeta { + return { + csvPath: '/data/input.csv', + evaluatorIds: ['vocabulary'], + reportId: 'test_20260301T0000', + generatedAt: new Date('2026-03-01T00:00:00Z'), + totalInputRows: 1, + ...overrides, + }; +} + +/** + * Extracts and parses the REPORT_DATA JSON injected into the HTML by formatAsHTML. + * This lets us make assertions on actual computed values rather than raw string presence. + */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function extractReportData(html: string): any { + const marker = 'var REPORT_DATA = '; + const start = html.indexOf(marker) + marker.length; + const line = html.slice(start, html.indexOf('\n', start)); + const json = line.endsWith(';') ? 
line.slice(0, -1) : line; + return JSON.parse(json); +} + +// ============================================================ +// formatAsCSV +// ============================================================ + +describe('formatAsCSV', () => { + it('returns empty string for empty results', () => { + expect(formatAsCSV(makeOutput([]))).toBe(''); + }); - describe('formatAsCSV', () => { - it('should format results as CSV with columns per evaluator', () => { - const csv = formatAsCSV(sampleOutput); + it('produces one data row per input row, not per evaluator task', () => { + // Row 1 has two evaluators → should collapse into a single CSV row + const output = makeOutput([ + makeResult({ rowIndex: 1, evaluatorId: 'vocabulary', score: 'slightly complex' }), + makeResult({ rowIndex: 1, evaluatorId: 'sentence-structure', score: 'Moderately Complex' }), + ]); - // Should include original columns - expect(csv).toContain('row_id'); - expect(csv).toContain('source'); + const lines = formatAsCSV(output).split('\n'); + expect(lines).toHaveLength(2); // 1 header + 1 data row + }); - // Should have evaluator-specific columns (not "evaluator" column) - expect(csv).toContain('vocabulary_score'); - expect(csv).toContain('vocabulary_reasoning'); - expect(csv).toContain('vocabulary_status'); - expect(csv).toContain('sentence_structure_score'); - expect(csv).toContain('sentence_structure_reasoning'); - expect(csv).toContain('sentence_structure_status'); + it('places evaluator columns in alphabetical order after original columns', () => { + const output = makeOutput([ + makeResult({ evaluatorId: 'vocabulary', originalRow: { id: '1', text: 'txt', grade: '5' } }), + makeResult({ evaluatorId: 'sentence-structure', originalRow: { id: '1', text: 'txt', grade: '5' } }), + ]); - // Should have one row per input row (not per evaluator) - expect(csv.split('\n')).toHaveLength(3); // Header + 2 data rows - }); + const header = formatAsCSV(output).split('\n')[0]; + const cols = header.split(','); - 
it('should escape CSV fields with quotes', () => { - const resultsWithCommas: BatchResult[] = [ - { - rowIndex: 1, - text: 'Text with, comma', - grade: '3', - evaluatorId: 'vocabulary', - status: 'success', - score: 'slightly complex', - reasoning: 'Reasoning with, comma', - processingTimeMs: 1000, - originalRow: { text: 'Text with, comma', grade: '3' }, - }, - ]; - - const output: BatchOutput = { - results: resultsWithCommas, - summary: { - totalTasks: 1, - successful: 1, - failed: 0, - durationMs: 1000, - resultsPerEvaluator: { vocabulary: { successful: 1, failed: 0 } }, - }, - }; - - const csv = formatAsCSV(output); - expect(csv).toContain('"Text with, comma"'); - expect(csv).toContain('"Reasoning with, comma"'); - }); + // Original columns come first + expect(cols[0]).toBe('id'); + // sentence-structure sorts before vocabulary alphabetically + expect(cols.indexOf('sentence_structure_score')).toBeLessThan(cols.indexOf('vocabulary_score')); + }); - it('should handle errors in evaluator columns', () => { - const csv = formatAsCSV(sampleOutput); + it('leaves score empty and puts error message in reasoning for failed evaluations', () => { + const output = makeOutput([ + makeResult({ status: 'error', error: 'API timeout', score: undefined }), + ]); - // Row 2 has vocabulary error - should have empty score, error as reasoning, status=error - expect(csv).toContain('API timeout'); // Error message in reasoning column - expect(csv).toContain('error'); // Status column - }); + const csv = formatAsCSV(output); + const dataRow = csv.split('\n')[1]; + const cols = dataRow.split(','); + const header = csv.split('\n')[0].split(','); - it('should preserve original columns in order', () => { - const csv = formatAsCSV(sampleOutput); - const lines = csv.split('\n'); + const scoreIdx = header.indexOf('vocabulary_score'); + const reasoningIdx = header.indexOf('vocabulary_reasoning'); + const statusIdx = header.indexOf('vocabulary_status'); - // First line should be headers with 
original columns first - const headers = lines[0]; - expect(headers).toContain('row_id'); - expect(headers).toContain('text'); - expect(headers).toContain('grade'); - expect(headers).toContain('source'); + expect(cols[scoreIdx]).toBe(''); // score is blank for errors + expect(cols[reasoningIdx]).toBe('API timeout'); + expect(cols[statusIdx]).toBe('error'); + }); - // Should have evaluator columns (not single "evaluator" column) - expect(headers).toContain('vocabulary_score'); + it('outputs not_run when an evaluator produced no result for a row', () => { + // Row 1: vocabulary ran; sentence-structure did not + const output = makeOutput([ + makeResult({ rowIndex: 1, evaluatorId: 'vocabulary', originalRow: { text: 'x', grade: '5' } }), + ]); + // Manually add sentence-structure to the results so the column exists but not for row 1 + output.results.push(makeResult({ + rowIndex: 2, evaluatorId: 'sentence-structure', + originalRow: { text: 'y', grade: '5' }, + })); + + const csv = formatAsCSV(output); + const [header, row1] = csv.split('\n'); + const cols = header.split(','); + const ssStatusIdx = cols.indexOf('sentence_structure_status'); + + expect(row1.split(',')[ssStatusIdx]).toBe('not_run'); + }); - // Data rows should have original data first - now one row per input row - expect(lines[1].startsWith('1,')).toBe(true); // Row 1 - expect(lines[2].startsWith('2,')).toBe(true); // Row 2 - }); + it('wraps fields containing commas, quotes, or newlines in double-quotes', () => { + const output = makeOutput([ + makeResult({ + score: 'slightly complex', + reasoning: 'Has "quotes" and, comma', + originalRow: { text: 'Line1\nLine2', grade: '5' }, + }), + ]); + + const csv = formatAsCSV(output); + expect(csv).toContain('"Line1\nLine2"'); + expect(csv).toContain('"Has ""quotes"" and, comma"'); }); +}); - describe('formatAsJSON', () => { - it('should format results as valid JSON', () => { - const json = formatAsJSON(sampleOutput); +// 
============================================================ +// formatAsHTML — computed report data +// ============================================================ + +describe('formatAsHTML', () => { + describe('snapshot counts', () => { + it('counts a row as errored if any of its evaluator results failed', () => { + // Row 1: vocabulary ok, sentence-structure errored → should be "errored" + // Row 2: both ok → should be "processed" + const output = makeOutput([ + makeResult({ rowIndex: 1, evaluatorId: 'vocabulary', status: 'success' }), + makeResult({ rowIndex: 1, evaluatorId: 'sentence-structure', status: 'error', error: 'timeout' }), + makeResult({ rowIndex: 2, evaluatorId: 'vocabulary', status: 'success' }), + makeResult({ rowIndex: 2, evaluatorId: 'sentence-structure', status: 'success' }), + ]); + + const { meta } = extractReportData(formatAsHTML(output, makeMeta({ totalInputRows: 2 }))); + expect(meta.processedRows).toBe(1); + expect(meta.erroredRows).toBe(1); + }); + }); - expect(() => JSON.parse(json)).not.toThrow(); + describe('GLA status classification', () => { + function glaOutput(inputGrade: string, glaBand: string) { + return makeOutput([makeResult({ + grade: inputGrade, + evaluatorId: 'grade-level-appropriateness', + score: glaBand, + })]); + } + + it('classifies on-band when input grade falls within the GLA band', () => { + const { gradeLevelStats } = extractReportData( + formatAsHTML(glaOutput('3', '2-3'), makeMeta({ evaluatorIds: ['grade-level-appropriateness'] })) + ); + expect(gradeLevelStats.onBand).toBe(1); + expect(gradeLevelStats.adjacent).toBe(0); + expect(gradeLevelStats.offTarget).toBe(0); }); - it('should include results and summary', () => { - const json = formatAsJSON(sampleOutput); - const parsed = JSON.parse(json); + it('classifies adjacent when input grade is one band away from the GLA result', () => { + // Grade 4 → band index 2 (4-5); GLA "2-3" → band index 1; diff = 1 + const { gradeLevelStats } = extractReportData( + 
formatAsHTML(glaOutput('4', '2-3'), makeMeta({ evaluatorIds: ['grade-level-appropriateness'] })) + ); + expect(gradeLevelStats.onBand).toBe(0); + expect(gradeLevelStats.adjacent).toBe(1); + expect(gradeLevelStats.offTarget).toBe(0); + }); - expect(parsed).toHaveProperty('results'); - expect(parsed).toHaveProperty('summary'); - expect(parsed.results).toHaveLength(3); + it('classifies off-target when input grade is two or more bands away', () => { + // Grade 8 → band index 3 (6-8); GLA "2-3" → band index 1; diff = 2 + const { gradeLevelStats } = extractReportData( + formatAsHTML(glaOutput('8', '2-3'), makeMeta({ evaluatorIds: ['grade-level-appropriateness'] })) + ); + expect(gradeLevelStats.onBand).toBe(0); + expect(gradeLevelStats.adjacent).toBe(0); + expect(gradeLevelStats.offTarget).toBe(1); }); - it('should preserve all result fields', () => { - const json = formatAsJSON(sampleOutput); - const parsed = JSON.parse(json); - - const firstResult = parsed.results[0]; - expect(firstResult).toHaveProperty('rowIndex'); - expect(firstResult).toHaveProperty('text'); - expect(firstResult).toHaveProperty('grade'); - expect(firstResult).toHaveProperty('evaluatorId'); - expect(firstResult).toHaveProperty('status'); - expect(firstResult).toHaveProperty('processingTimeMs'); + it('maps grade K and grade 1 to the same K-1 band (both on-band with K-1 GLA result)', () => { + for (const grade of ['K', '1']) { + const { gradeLevelStats } = extractReportData( + formatAsHTML(glaOutput(grade, 'K-1'), makeMeta({ evaluatorIds: ['grade-level-appropriateness'] })) + ); + expect(gradeLevelStats.onBand).toBe(1); + } }); - it('should include summary statistics', () => { - const json = formatAsJSON(sampleOutput); - const parsed = JSON.parse(json); + it('maps grade 11, 12, and CCR to the same 11-CCR band', () => { + for (const grade of ['11', '12', 'CCR']) { + const { gradeLevelStats } = extractReportData( + formatAsHTML(glaOutput(grade, '11-CCR'), makeMeta({ evaluatorIds: 
['grade-level-appropriateness'] })) + ); + expect(gradeLevelStats.onBand).toBe(1); + } + }); - expect(parsed.summary.totalTasks).toBe(3); - expect(parsed.summary.successful).toBe(2); - expect(parsed.summary.failed).toBe(1); - expect(parsed.summary.durationMs).toBe(7500); + it('treats an unrecognised grade as off-target (tests the -1 guard, not coincidental diff arithmetic)', () => { + // Grade '99' → gradeToBandIndex returns -1. GLA 'K-1' is index 0, so without + // the "inputIdx === -1" guard the diff would be |(-1) - 0| = 1 → 'adjacent'. + // The guard must fire for this to be 'off-target'. + const { gradeLevelStats } = extractReportData( + formatAsHTML(glaOutput('99', 'K-1'), makeMeta({ evaluatorIds: ['grade-level-appropriateness'] })) + ); + expect(gradeLevelStats.offTarget).toBe(1); }); }); - describe('formatAsHTML', () => { - it('should generate valid HTML', () => { - const html = formatAsHTML(sampleOutput); - - expect(html).toContain(''); - expect(html).toContain(''); + describe('complexity stats', () => { + it('normalises score strings case-insensitively (Title Case and lowercase both map to the same numeric value)', () => { + // vocabulary returns lowercase; sentence-structure returns Title Case + const output = makeOutput([ + makeResult({ rowIndex: 1, evaluatorId: 'vocabulary', score: 'slightly complex' }), + makeResult({ rowIndex: 1, evaluatorId: 'sentence-structure', score: 'Slightly Complex' }), + ]); + + const { complexityStats } = extractReportData( + formatAsHTML(output, makeMeta({ evaluatorIds: ['vocabulary', 'sentence-structure'] })) + ); + + // Both evaluators must appear — verifies GLA is excluded and neither evaluator was silently dropped + expect(complexityStats).toHaveLength(2); + for (const stat of complexityStats) { + expect(stat.average).toBe(1.0); + expect(stat.label).toBe('Slightly Complex'); + expect(stat.distribution[0]).toBe(1); // one score of 1 + } }); - it('should include AG Grid script', () => { - const html = 
formatAsHTML(sampleOutput); + it('excludes GLA from complexity stats even when it runs alongside complexity evaluators', () => { + const output = makeOutput([ + makeResult({ rowIndex: 1, evaluatorId: 'grade-level-appropriateness', score: '4-5' }), + makeResult({ rowIndex: 1, evaluatorId: 'vocabulary', score: 'slightly complex' }), + ]); + + const { complexityStats } = extractReportData( + formatAsHTML(output, makeMeta({ evaluatorIds: ['grade-level-appropriateness', 'vocabulary'] })) + ); - expect(html).toContain('ag-grid-community'); + expect(complexityStats).toHaveLength(1); + expect(complexityStats[0].evaluatorId).toBe('vocabulary'); }); - it('should include summary statistics', () => { - const html = formatAsHTML(sampleOutput); + it('computes average and distribution correctly across multiple rows', () => { + // scores: 1, 2, 3 → avg 2.0 + const output = makeOutput([ + makeResult({ rowIndex: 1, evaluatorId: 'vocabulary', score: 'slightly complex' }), + makeResult({ rowIndex: 2, evaluatorId: 'vocabulary', score: 'moderately complex' }), + makeResult({ rowIndex: 3, evaluatorId: 'vocabulary', score: 'very complex' }), + ]); - expect(html).toContain('3'); // Total tasks - expect(html).toContain('2'); // Successful - expect(html).toContain('1'); // Failed + const { complexityStats } = extractReportData(formatAsHTML(output, makeMeta({ totalInputRows: 3 }))); + const vocab = complexityStats[0]; + + expect(vocab.average).toBe(2.0); + expect(vocab.label).toBe('Moderately Complex'); + expect(vocab.distribution).toEqual([1, 1, 1, 0]); // one each of scores 1, 2, 3 }); - it('should include grid data as JSON', () => { - const html = formatAsHTML(sampleOutput); + it('excludes error results from complexity averages', () => { + const output = makeOutput([ + makeResult({ rowIndex: 1, evaluatorId: 'vocabulary', status: 'success', score: 'very complex' }), + makeResult({ rowIndex: 2, evaluatorId: 'vocabulary', status: 'error', error: 'timeout' }), + ]); - 
expect(html).toContain('const rowData'); - expect(html).toContain('vocabulary_status'); - expect(html).toContain('sentence_structure_status'); + const { complexityStats } = extractReportData(formatAsHTML(output, makeMeta({ totalInputRows: 2 }))); + expect(complexityStats[0].average).toBe(3.0); // only the successful score counts + expect(complexityStats[0].distribution).toEqual([0, 0, 1, 0]); }); + }); - it('should include HTML-like content in JSON data', () => { - const resultsWithHTML: BatchResult[] = [ - { - rowIndex: 1, - text: 'Text with ', - grade: '3', - evaluatorId: 'vocabulary', - status: 'success', - score: 'slightly complex', - reasoning: 'Reasoning with bold', - processingTimeMs: 1000, - originalRow: { text: 'Text with ', grade: '3' }, - }, - ]; - - const output: BatchOutput = { - results: resultsWithHTML, - summary: { - totalTasks: 1, - successful: 1, - failed: 0, - durationMs: 1000, - resultsPerEvaluator: { vocabulary: { successful: 1, failed: 0 } }, - }, - }; - - const html = formatAsHTML(output); - - // JSON.stringify automatically escapes HTML, so it's safe - // The content will be in the JSON data but escaped - expect(html).toContain('const rowData'); - expect(html).toContain('vocabulary'); + describe('grade band distribution', () => { + it('groups by the INPUT grade band, not the GLA result band', () => { + // Grade 3 → "2-3" bucket (index 1). GLA says "9-10" (off-target, diff=3). 
+ const output = makeOutput([makeResult({ + grade: '3', + evaluatorId: 'grade-level-appropriateness', + score: '9-10', + })]); + + const { gradeBandDistribution } = extractReportData( + formatAsHTML(output, makeMeta({ evaluatorIds: ['grade-level-appropriateness'] })) + ); + + const band23 = gradeBandDistribution.data[1]; // index 1 = "2-3" (input grade) + const band910 = gradeBandDistribution.data[4]; // index 4 = "9-10" (GLA result) + + expect(band23.total).toBe(1); // row belongs to the "2-3" input bucket + expect(band23.offTarget).toBe(1); + expect(band910.total).toBe(0); // NOT in the GLA result's bucket + }); + }); + + describe('complexity heatmap', () => { + it('produces null for grade bands that have no data', () => { + // Only grade 5 rows → only "4-5" band (index 2) has data; others are null + const output = makeOutput([ + makeResult({ grade: '5', evaluatorId: 'vocabulary', score: 'moderately complex' }), + ]); + + const { complexityHeatmap } = extractReportData(formatAsHTML(output, makeMeta())); + const k1Values = complexityHeatmap.values[0]; // K-1 band + expect(k1Values[0]).toBeNull(); }); - it('should include column definitions with evaluator columns', () => { - const html = formatAsHTML(sampleOutput); + it('computes the correct per-cell average', () => { + // Two grade-5 rows: scores 1 and 3 → average 2.0 + const output = makeOutput([ + makeResult({ rowIndex: 1, grade: '5', evaluatorId: 'vocabulary', score: 'slightly complex' }), + makeResult({ rowIndex: 2, grade: '5', evaluatorId: 'vocabulary', score: 'very complex' }), + ]); + + const { complexityHeatmap } = extractReportData(formatAsHTML(output, makeMeta({ totalInputRows: 2 }))); + const band45Values = complexityHeatmap.values[2]; // "4-5" is index 2 + expect(band45Values[0]).toBe(2.0); + }); + }); - expect(html).toContain('columnDefs'); - expect(html).toContain('field: \'row\''); - expect(html).toContain('field: \'text\''); + describe('XSS safety', () => { + it('Unicode-escapes < > & so injected 
data cannot break out of the script tag', () => { + const output = makeOutput([makeResult({ + text: '', + originalRow: { text: '', grade: '5' }, + })]); - // Should have evaluator-specific columns (not single "status" column) - expect(html).toContain('vocabulary_status'); - expect(html).toContain('vocabulary_score'); - expect(html).toContain('sentence_structure_status'); + const html = formatAsHTML(output, makeMeta()); + expect(html).not.toContain('