From 3cf28a2aea8e8b8bba8d7dc90f4968618ac2ddb8 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Thu, 5 Feb 2026 17:12:17 -0800 Subject: [PATCH 1/4] feat: Implement GLA evaluator --- sdks/typescript/README.md | 40 ++++ .../evaluators/grade-level-appropriateness.ts | 212 ++++++++++++++++++ sdks/typescript/src/evaluators/index.ts | 6 + sdks/typescript/src/index.ts | 11 + .../grade-level-appropriateness/index.ts | 30 +++ .../schemas/grade-level-appropriateness.ts | 27 +++ sdks/typescript/src/schemas/index.ts | 5 + .../grade-level-appropriateness.test.ts | 173 ++++++++++++++ 8 files changed, 504 insertions(+) create mode 100644 sdks/typescript/src/evaluators/grade-level-appropriateness.ts create mode 100644 sdks/typescript/src/prompts/grade-level-appropriateness/index.ts create mode 100644 sdks/typescript/src/schemas/grade-level-appropriateness.ts create mode 100644 sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 1fc51cb..86d022b 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -121,6 +121,46 @@ await evaluator.evaluate(text: string, grade: string) --- +### 3. Grade Level Appropriateness Evaluator + +Determines appropriate grade level for text. + +**No grade parameter required** - evaluates what grade the text is appropriate for. + +**Uses:** Google Gemini 2.5 Pro + +**Constructor:** +```typescript +const evaluator = new GradeLevelAppropriatenessEvaluator({ + googleApiKey: string; // Required - Google API key + maxRetries?: number; // Optional - Max retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Optional (default: true) + logger?: Logger; // Optional - Custom logger + logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN) +}); +``` + +**API:** +```typescript +await evaluator.evaluate(text: string) +``` + +**Returns:** +```typescript +{ + score: { + grade: 'K-1' | '2-3' | '4-5' | '6-8' | '9-10' | '11-CCR'; + alternative_grade: string; + scaffolding_needed: string[]; + reasoning: string; + }; + reasoning: string; + metadata: EvaluationMetadata; +} +``` + +--- + ## Error Handling The SDK provides specific error types to help you handle different scenarios: diff --git a/sdks/typescript/src/evaluators/grade-level-appropriateness.ts b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts new file mode 100644 index 0000000..fa125c4 --- /dev/null +++ b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts @@ -0,0 +1,212 @@ +import type { LLMProvider } from '../providers/index.js'; +import { createProvider } from '../providers/index.js'; +import { + GradeLevelAppropriatenessSchema, + type GradeLevelAppropriateness, +} from '../schemas/grade-level-appropriateness.js'; +import { getSystemPrompt, getUserPrompt } from '../prompts/grade-level-appropriateness/index.js'; +import type { EvaluationResult } from '../schemas/index.js'; +import { BaseEvaluator, type BaseEvaluatorConfig } from './base.js'; +import { ValidationError, wrapProviderError } from '../errors.js'; + +/** + * Configuration for GradeLevelAppropriatenessEvaluator + */ +export interface GradeLevelAppropriatenessEvaluatorConfig extends BaseEvaluatorConfig { + /** Google API key for grade level evaluation (uses Gemini 2.5 Pro) */ + googleApiKey: string; +} + +/** + * Grade Level Appropriateness Evaluator + * + * Evaluates whether AI-generated text is suitable for a given grade band. + * Uses a structured 4-step analysis process: + * 1. Quantitative analysis (word count, Flesch-Kincaid) + * 2. Qualitative complexity (text structure, language, purpose, knowledge demands) + * 3. Background knowledge assessment + * 4. Synthesis and final recommendation + * + * Returns: + * - Target grade band (K-1, 2-3, 4-5, 6-8, 9-10, 11-CCR) + * - Alternative grade band (with scaffolding) + * - Specific scaffolding recommendations + * + * @example + * ```typescript + * const evaluator = new GradeLevelAppropriatenessEvaluator({ + * googleApiKey: process.env.GOOGLE_API_KEY + * }); + * + * const result = await evaluator.evaluate(text); + * console.log(result.score.grade); // "9-10" + * console.log(result.score.alternative_grade); // "6-8" + * console.log(result.score.scaffolding_needed); + * ``` + */ +export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { + private provider: LLMProvider; + private evaluatorConfig: GradeLevelAppropriatenessEvaluatorConfig; + + constructor(config: GradeLevelAppropriatenessEvaluatorConfig) { + // Call base constructor for common setup (telemetry, etc.) + super(config); + + // Validate required API keys + if (!config.googleApiKey) { + throw new ValidationError('Google API key is required. Pass googleApiKey in config.'); + } + + this.evaluatorConfig = config; + + // Create Google Gemini provider + this.provider = createProvider({ + type: 'google', + model: 'gemini-2.5-pro', + apiKey: config.googleApiKey, + temperature: 0.25, + maxRetries: this.config.maxRetries, + }); + } + + // Implement abstract methods from BaseEvaluator + protected getEvaluatorType(): string { + return 'grade-level-appropriateness'; + } + + /** + * Evaluate grade level appropriateness for a given text + * + * @param text - The text to evaluate + * @returns Evaluation result with grade recommendations and scaffolding suggestions + * @throws {Error} If text is empty + */ + async evaluate(text: string): Promise> { + this.logger.info('Starting grade level appropriateness evaluation', { + evaluator: 'grade-level-appropriateness', + operation: 'evaluate', + textLength: text.length, + }); + + // Use inherited validation method + this.validateText(text); + + const startTime = Date.now(); + + try { + this.logger.debug('Evaluating grade level appropriateness', { + evaluator: 'grade-level-appropriateness', + operation: 'grade_evaluation', + }); + const userPrompt = getUserPrompt(text); + + const response = await this.provider.generateStructured({ + messages: [ + { role: 'system', content: getSystemPrompt() }, + { role: 'user', content: userPrompt }, + ], + schema: GradeLevelAppropriatenessSchema, + temperature: 0.25, + }); + + const latencyMs = Date.now() - startTime; + + const tokenUsage = { + input_tokens: response.usage.inputTokens, + output_tokens: response.usage.outputTokens, + }; + + const result = { + score: response.data, + reasoning: response.data.reasoning, + metadata: { + promptVersion: '1.0', + model: 'gemini-2.5-pro', + timestamp: new Date(), + processingTimeMs: latencyMs, + }, + }; + + // Send success telemetry (fire-and-forget) + this.sendTelemetry({ + status: 'success', + latencyMs, + textLength: text.length, + provider: 'google:gemini-2.5-pro', + // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts + // We set -1 to indicate "unknown" (we may have retried, but can't track it) + // To fix: Implement custom retry wrapper that tracks each attempt + retryAttempts: -1, + tokenUsage, + // No metadata.stage_details for single-stage evaluator + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + this.logger.info('Grade level appropriateness evaluation completed successfully', { + evaluator: 'grade-level-appropriateness', + operation: 'evaluate', + grade: result.score.grade, + processingTimeMs: latencyMs, + }); + + return result; + } catch (error) { + const latencyMs = Date.now() - startTime; + + // Log the error + this.logger.error('Grade level appropriateness evaluation failed', { + evaluator: 'grade-level-appropriateness', + operation: 'evaluate', + error: error instanceof Error ? error : undefined, + processingTimeMs: latencyMs, + }); + + // Send failure telemetry (fire-and-forget) + this.sendTelemetry({ + status: 'error', + latencyMs, + textLength: text.length, + provider: 'google:gemini-2.5-pro', + // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts + // We set -1 to indicate "unknown" (we may have retried, but can't track it) + // To fix: Implement custom retry wrapper that tracks each attempt + retryAttempts: -1, + errorCode: error instanceof Error ? error.name : 'UnknownError', + inputText: text, + }).catch(() => { + // Ignore telemetry errors + }); + + // Re-throw validation errors as-is + if (error instanceof ValidationError) { + throw error; + } + + // Wrap provider errors into appropriate error types + throw wrapProviderError(error, 'Grade level appropriateness evaluation failed'); + } + } +} + +/** + * Functional API for grade level appropriateness evaluation + * + * @example + * ```typescript + * const result = await evaluateGradeLevelAppropriateness( + * "Tides are the rise and fall of sea levels...", + * { + * googleApiKey: process.env.GOOGLE_API_KEY + * } + * ); + * ``` + */ +export async function evaluateGradeLevelAppropriateness( + text: string, + config: GradeLevelAppropriatenessEvaluatorConfig +): Promise> { + const evaluator = new GradeLevelAppropriatenessEvaluator(config); + return evaluator.evaluate(text); +} diff --git a/sdks/typescript/src/evaluators/index.ts b/sdks/typescript/src/evaluators/index.ts index e96898e..12b55d6 100644 --- a/sdks/typescript/src/evaluators/index.ts +++ b/sdks/typescript/src/evaluators/index.ts @@ -11,3 +11,9 @@ export { evaluateSentenceStructure, type SentenceStructureEvaluatorConfig, } from './sentence-structure.js'; + +export { + GradeLevelAppropriatenessEvaluator, + evaluateGradeLevelAppropriateness, + type GradeLevelAppropriatenessEvaluatorConfig, +} from './grade-level-appropriateness.js'; diff --git a/sdks/typescript/src/index.ts b/sdks/typescript/src/index.ts index d8d0a08..a84fcc0 100644 --- a/sdks/typescript/src/index.ts +++ b/sdks/typescript/src/index.ts @@ -7,6 +7,8 @@ export type { EvaluationError, } from './schemas/index.js'; +export { ComplexityLevel, GradeLevel, GradeBand } from './schemas/index.js'; + // Error types export { EvaluatorError, @@ -51,6 +53,12 @@ export type { VocabularyComplexityLevel, } from './schemas/vocabulary.js'; +// Grade Level Appropriateness exports +export type { GradeLevelAppropriateness } from './schemas/grade-level-appropriateness.js'; + +export { GradeLevelAppropriatenessSchema } from './schemas/grade-level-appropriateness.js'; + + export { VocabularyEvaluator, evaluateVocabulary, @@ -58,6 +66,9 @@ export { SentenceStructureEvaluator, evaluateSentenceStructure, type SentenceStructureEvaluatorConfig, + GradeLevelAppropriatenessEvaluator, + evaluateGradeLevelAppropriateness, + type GradeLevelAppropriatenessEvaluatorConfig, type BaseEvaluatorConfig, type TelemetryOptions, } from './evaluators/index.js'; diff --git a/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts b/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts new file mode 100644 index 0000000..03831e0 --- /dev/null +++ b/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts @@ -0,0 +1,30 @@ +import { loadPrompt } from '../../utils/prompts.js'; + +/** + * System prompt for grade level appropriateness evaluation + * Loaded from: prompts/grade-level-appropriateness/system.txt + */ +const SYSTEM_PROMPT_TEMPLATE = loadPrompt('grade-level-appropriateness/system.txt'); + +/** + * Get the system prompt for grade level appropriateness evaluation + * @returns The system prompt + */ +export function getSystemPrompt(): string { + return SYSTEM_PROMPT_TEMPLATE; +} + +/** + * User prompt template for grade level appropriateness evaluation + * Loaded from: prompts/grade-level-appropriateness/user.txt + */ +const USER_PROMPT_TEMPLATE = loadPrompt('grade-level-appropriateness/user.txt'); + +/** + * Get the user prompt with the text to evaluate + * @param text - The text to evaluate for grade level appropriateness + * @returns The formatted user prompt + */ +export function getUserPrompt(text: string): string { + return USER_PROMPT_TEMPLATE.replace('{text}', text); +} diff --git a/sdks/typescript/src/schemas/grade-level-appropriateness.ts b/sdks/typescript/src/schemas/grade-level-appropriateness.ts new file mode 100644 index 0000000..d18d207 --- /dev/null +++ b/sdks/typescript/src/schemas/grade-level-appropriateness.ts @@ -0,0 +1,27 @@ +import { z } from 'zod'; + +/** + * Valid grade bands for grade level appropriateness evaluation + */ +export const GradeBand = z.enum(['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR']); + +export type GradeBand = z.infer; + +/** + * Output schema for Grade Level Appropriateness evaluation + * Matches Python OutputRanges model + */ +export const GradeLevelAppropriatenessSchema = z.object({ + reasoning: z + .string() + .describe( + 'Your reasoning for your answer in numbered bullet points for 4 steps with a 4th bullet point for synthesis.' + ), + grade: z.string().describe('The appropriate grade level for the text'), + alternative_grade: z.string().describe('An alternative grade level for the text'), + scaffolding_needed: z + .string() + .describe('Scaffolding needed for the text to be appropriate for the alternative grade'), +}); + +export type GradeLevelAppropriateness = z.infer; diff --git a/sdks/typescript/src/schemas/index.ts b/sdks/typescript/src/schemas/index.ts index ded6c72..f1b73d3 100644 --- a/sdks/typescript/src/schemas/index.ts +++ b/sdks/typescript/src/schemas/index.ts @@ -8,3 +8,8 @@ export { type EvaluationError, } from './outputs.js'; +export { + GradeBand, + GradeLevelAppropriatenessSchema, + type GradeLevelAppropriateness, +} from './grade-level-appropriateness.js'; diff --git a/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts new file mode 100644 index 0000000..0d88be0 --- /dev/null +++ b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts @@ -0,0 +1,173 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { GradeLevelAppropriatenessEvaluator } from '../../../src/evaluators/grade-level-appropriateness.js'; +import type { LLMProvider } from '../../../src/providers/base.js'; + +/** + * Comprehensive unit tests for GradeLevelAppropriatenessEvaluator + * + * These tests verify: + * - Constructor validation + * - Successful evaluation flow (single stage) + * - Error handling (LLM failures) + * - Telemetry behavior + * - Response structure + */ + +// Mock providers +const createMockProvider = (): LLMProvider => ({ + generateStructured: vi.fn(), + generateText: vi.fn(), +}); + +// Mock the createProvider factory +vi.mock('../../../src/providers/index.js', () => ({ + createProvider: vi.fn(() => createMockProvider()), +})); + +// Mock telemetry to avoid real HTTP calls +vi.mock('../../../src/telemetry/client.js', () => { + return { + TelemetryClient: class MockTelemetryClient { + send = vi.fn().mockResolvedValue(undefined); + }, + }; +}); + +describe('GradeLevelAppropriatenessEvaluator - Constructor Validation', () => { + it('should throw error when Google API key is missing', () => { + expect(() => new GradeLevelAppropriatenessEvaluator({ + googleApiKey: '', + })).toThrow('Google API key is required. Pass googleApiKey in config.'); + }); + +}); + +describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { + let evaluator: GradeLevelAppropriatenessEvaluator; + let mockProvider: LLMProvider; + + beforeEach(() => { + vi.clearAllMocks(); + + // Create evaluator (provider will be mocked) + evaluator = new GradeLevelAppropriatenessEvaluator({ + googleApiKey: 'test-google-key', + telemetry: false, + }); + + // Get reference to the mocked provider + // @ts-expect-error Accessing private property for testing + mockProvider = evaluator.provider; + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe('Successful Evaluation Flow', () => { + it('should successfully evaluate text', async () => { + const testText = 'Tides are the rise and fall of sea levels caused by the combined effects of the gravitational forces exerted by the Moon and the Sun.'; + + // Mock grade level response + vi.mocked(mockProvider.generateStructured).mockResolvedValue({ + data: { + grade: '6-8', + alternative_grade: '4-5', + scaffolding_needed: [ + 'Pre-teach gravitational forces', + 'Use visual diagrams of moon-sun-earth system', + ], + reasoning: 'The text discusses gravitational forces and celestial mechanics, which are appropriate for middle school science curriculum.', + }, + model: 'gemini-2.5-pro', + usage: { + inputTokens: 200, + outputTokens: 150, + }, + latencyMs: 800, + }); + + // Execute evaluation (no grade parameter needed) + const result = await evaluator.evaluate(testText); + + // Verify result structure + expect(result.score).toBeDefined(); + expect(result.score.grade).toBe('6-8'); + expect(result.score.alternative_grade).toBe('4-5'); + expect(result.score.scaffolding_needed).toHaveLength(2); + expect(result.reasoning).toContain('gravitational forces'); + expect(result.metadata).toBeDefined(); + expect(result.metadata.model).toBe('gemini-2.5-pro'); + expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); + + // Verify provider was called + expect(mockProvider.generateStructured).toHaveBeenCalledTimes(1); + }); + + }); + + describe('Error Handling', () => { + it('should handle API failure', async () => { + const testText = 'Test text here for API failure'; + + // Mock API failure + vi.mocked(mockProvider.generateStructured).mockRejectedValue( + new Error('API timeout') + ); + + // Should propagate the error + await expect(evaluator.evaluate(testText)) + .rejects.toThrow('API timeout'); + }); + + }); + + describe('Response Structure', () => { + it('should return correct result structure', async () => { + vi.mocked(mockProvider.generateStructured).mockResolvedValue({ + data: { + grade: '9-10', + alternative_grade: '6-8', + scaffolding_needed: [ + 'Pre-teach advanced vocabulary', + 'Provide background context', + ], + reasoning: 'Detailed reasoning about grade appropriateness', + }, + model: 'gemini-2.5-pro', + usage: { inputTokens: 200, outputTokens: 150 }, + latencyMs: 800, + }); + + const result = await evaluator.evaluate('Test text here'); + + // Verify result structure + expect(result).toHaveProperty('score'); + expect(result).toHaveProperty('reasoning'); + expect(result).toHaveProperty('metadata'); + + // Verify score structure (GradeLevelAppropriateness) + expect(result.score).toHaveProperty('grade'); + expect(result.score).toHaveProperty('alternative_grade'); + expect(result.score).toHaveProperty('scaffolding_needed'); + expect(result.score).toHaveProperty('reasoning'); + + // Verify metadata structure + expect(result.metadata).toHaveProperty('promptVersion'); + expect(result.metadata).toHaveProperty('model'); + expect(result.metadata).toHaveProperty('timestamp'); + expect(result.metadata).toHaveProperty('processingTimeMs'); + + // Verify metadata values + expect(result.metadata.promptVersion).toBe('1.0'); + expect(result.metadata.model).toBe('gemini-2.5-pro'); + expect(result.metadata.timestamp).toBeInstanceOf(Date); + expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); + + // Verify score values + expect(result.score.grade).toBe('9-10'); + expect(result.score.alternative_grade).toBe('6-8'); + expect(result.score.scaffolding_needed).toHaveLength(2); + }); + }); +}); From bd68f3da4bc666d57ce68f5061caee2e580a7456 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 22:55:29 -0800 Subject: [PATCH 2/4] re: feedback --- sdks/typescript/README.md | 11 +++++--- .../evaluators/grade-level-appropriateness.ts | 27 ++++++------------- .../grade-level-appropriateness/index.ts | 15 ++--------- .../grade-level-appropriateness.test.ts | 25 +++++++---------- 4 files changed, 28 insertions(+), 50 deletions(-) diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 86d022b..f4c3118 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -149,13 +149,18 @@ await evaluator.evaluate(text: string) ```typescript { score: { - grade: 'K-1' | '2-3' | '4-5' | '6-8' | '9-10' | '11-CCR'; + grade: string; // e.g., 'K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR' alternative_grade: string; - scaffolding_needed: string[]; + scaffolding_needed: string; reasoning: string; }; reasoning: string; - metadata: EvaluationMetadata; + metadata: { + promptVersion: string; + model: string; + timestamp: Date; + processingTimeMs: number; + }; } ``` diff --git a/sdks/typescript/src/evaluators/grade-level-appropriateness.ts b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts index fa125c4..19fcdbc 100644 --- a/sdks/typescript/src/evaluators/grade-level-appropriateness.ts +++ b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts @@ -7,7 +7,7 @@ import { import { getSystemPrompt, getUserPrompt } from '../prompts/grade-level-appropriateness/index.js'; import type { EvaluationResult } from '../schemas/index.js'; import { BaseEvaluator, type BaseEvaluatorConfig } from './base.js'; -import { ValidationError, wrapProviderError } from '../errors.js'; +import { ConfigurationError, ValidationError, wrapProviderError } from '../errors.js'; /** * Configuration for GradeLevelAppropriatenessEvaluator @@ -46,7 +46,6 @@ export interface GradeLevelAppropriatenessEvaluatorConfig extends BaseEvaluatorC */ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { private provider: LLMProvider; - private evaluatorConfig: GradeLevelAppropriatenessEvaluatorConfig; constructor(config: GradeLevelAppropriatenessEvaluatorConfig) { // Call base constructor for common setup (telemetry, etc.) @@ -54,11 +53,9 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { // Validate required API keys if (!config.googleApiKey) { - throw new ValidationError('Google API key is required. Pass googleApiKey in config.'); + throw new ConfigurationError('Google API key is required. Pass googleApiKey in config.'); } - this.evaluatorConfig = config; - // Create Google Gemini provider this.provider = createProvider({ type: 'google', @@ -79,7 +76,8 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { * * @param text - The text to evaluate * @returns Evaluation result with grade recommendations and scaffolding suggestions - * @throws {Error} If text is empty + * @throws {ValidationError} If text is empty or too short/long + * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError) */ async evaluate(text: string): Promise> { this.logger.info('Starting grade level appropriateness evaluation', { @@ -88,12 +86,11 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { textLength: text.length, }); - // Use inherited validation method - this.validateText(text); - const startTime = Date.now(); try { + // Validate inputs — inside try so validation errors are telemetered. + this.validateText(text); this.logger.debug('Evaluating grade level appropriateness', { evaluator: 'grade-level-appropriateness', operation: 'grade_evaluation', @@ -120,8 +117,8 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { score: response.data, reasoning: response.data.reasoning, metadata: { - promptVersion: '1.0', - model: 'gemini-2.5-pro', + promptVersion: '1.2.0', + model: 'google:gemini-2.5-pro', timestamp: new Date(), processingTimeMs: latencyMs, }, @@ -133,10 +130,6 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { latencyMs, textLength: text.length, provider: 'google:gemini-2.5-pro', - // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts - // We set -1 to indicate "unknown" (we may have retried, but can't track it) - // To fix: Implement custom retry wrapper that tracks each attempt - retryAttempts: -1, tokenUsage, // No metadata.stage_details for single-stage evaluator inputText: text, @@ -169,10 +162,6 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { latencyMs, textLength: text.length, provider: 'google:gemini-2.5-pro', - // TODO: Retry tracking - Vercel AI SDK doesn't expose actual retry attempts - // We set -1 to indicate "unknown" (we may have retried, but can't track it) - // To fix: Implement custom retry wrapper that tracks each attempt - retryAttempts: -1, errorCode: error instanceof Error ? error.name : 'UnknownError', inputText: text, }).catch(() => { diff --git a/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts b/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts index 03831e0..0939993 100644 --- a/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts +++ b/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts @@ -1,10 +1,5 @@ -import { loadPrompt } from '../../utils/prompts.js'; - -/** - * System prompt for grade level appropriateness evaluation - * Loaded from: prompts/grade-level-appropriateness/system.txt - */ -const SYSTEM_PROMPT_TEMPLATE = loadPrompt('grade-level-appropriateness/system.txt'); +import SYSTEM_PROMPT_TEMPLATE from '../../../../../evals/prompts/grade-level-appropriateness/system.txt'; +import USER_PROMPT_TEMPLATE from '../../../../../evals/prompts/grade-level-appropriateness/user.txt'; /** * Get the system prompt for grade level appropriateness evaluation @@ -14,12 +9,6 @@ export function getSystemPrompt(): string { return SYSTEM_PROMPT_TEMPLATE; } -/** - * User prompt template for grade level appropriateness evaluation - * Loaded from: prompts/grade-level-appropriateness/user.txt - */ -const USER_PROMPT_TEMPLATE = loadPrompt('grade-level-appropriateness/user.txt'); - /** * Get the user prompt with the text to evaluate * @param text - The text to evaluate for grade level appropriateness diff --git a/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts index 0d88be0..38e5c68 100644 --- a/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts +++ b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts @@ -1,5 +1,6 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; import { GradeLevelAppropriatenessEvaluator } from '../../../src/evaluators/grade-level-appropriateness.js'; +import { ConfigurationError } from '../../../src/errors.js'; import type { LLMProvider } from '../../../src/providers/base.js'; /** @@ -34,10 +35,10 @@ vi.mock('../../../src/telemetry/client.js', () => { }); describe('GradeLevelAppropriatenessEvaluator - Constructor Validation', () => { - it('should throw error when Google API key is missing', () => { + it('should throw ConfigurationError when Google API key is missing', () => { expect(() => new GradeLevelAppropriatenessEvaluator({ googleApiKey: '', - })).toThrow('Google API key is required. Pass googleApiKey in config.'); + })).toThrow(ConfigurationError); }); }); @@ -73,10 +74,7 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { data: { grade: '6-8', alternative_grade: '4-5', - scaffolding_needed: [ - 'Pre-teach gravitational forces', - 'Use visual diagrams of moon-sun-earth system', - ], + scaffolding_needed: 'Pre-teach gravitational forces; Use visual diagrams of moon-sun-earth system', reasoning: 'The text discusses gravitational forces and celestial mechanics, which are appropriate for middle school science curriculum.', }, model: 'gemini-2.5-pro', @@ -94,10 +92,10 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { expect(result.score).toBeDefined(); expect(result.score.grade).toBe('6-8'); expect(result.score.alternative_grade).toBe('4-5'); - expect(result.score.scaffolding_needed).toHaveLength(2); + expect(result.score.scaffolding_needed).toContain('gravitational forces'); expect(result.reasoning).toContain('gravitational forces'); expect(result.metadata).toBeDefined(); - expect(result.metadata.model).toBe('gemini-2.5-pro'); + expect(result.metadata.model).toBe('google:gemini-2.5-pro'); expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); // Verify provider was called @@ -128,10 +126,7 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { data: { grade: '9-10', alternative_grade: '6-8', - scaffolding_needed: [ - 'Pre-teach advanced vocabulary', - 'Provide background context', - ], + scaffolding_needed: 'Pre-teach advanced vocabulary; Provide background context', reasoning: 'Detailed reasoning about grade appropriateness', }, model: 'gemini-2.5-pro', @@ -159,15 +154,15 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { expect(result.metadata).toHaveProperty('processingTimeMs'); // Verify metadata values - expect(result.metadata.promptVersion).toBe('1.0'); - expect(result.metadata.model).toBe('gemini-2.5-pro'); + expect(result.metadata.promptVersion).toBe('1.2.0'); + expect(result.metadata.model).toBe('google:gemini-2.5-pro'); expect(result.metadata.timestamp).toBeInstanceOf(Date); expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); // Verify score values expect(result.score.grade).toBe('9-10'); expect(result.score.alternative_grade).toBe('6-8'); - expect(result.score.scaffolding_needed).toHaveLength(2); + expect(result.score.scaffolding_needed).toBeTruthy(); }); }); }); From 19201c301ef02f6821cb572152cf6a23937023f9 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 23:09:00 -0800 Subject: [PATCH 3/4] re: feedback --- sdks/typescript/README.md | 13 ++++---- .../evaluators/grade-level-appropriateness.ts | 16 +++++----- .../schemas/grade-level-appropriateness.ts | 4 +-- .../grade-level-appropriateness.test.ts | 31 +++++++++++-------- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index f4c3118..3965e82 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -148,12 +148,7 @@ await evaluator.evaluate(text: string) **Returns:** ```typescript { - score: { - grade: string; // e.g., 'K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR' - alternative_grade: string; - scaffolding_needed: string; - reasoning: string; - }; + score: string; // e.g., 'K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR' reasoning: string; metadata: { promptVersion: string; @@ -161,6 +156,12 @@ await evaluator.evaluate(text: string) timestamp: Date; processingTimeMs: number; }; + _internal: { + grade: string; + alternative_grade: string; + scaffolding_needed: string; + reasoning: string; + }; } ``` diff --git a/sdks/typescript/src/evaluators/grade-level-appropriateness.ts b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts index 19fcdbc..525e162 100644 --- a/sdks/typescript/src/evaluators/grade-level-appropriateness.ts +++ b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts @@ -39,9 +39,9 @@ export interface GradeLevelAppropriatenessEvaluatorConfig extends BaseEvaluatorC * }); * * const result = await evaluator.evaluate(text); - * console.log(result.score.grade); // "9-10" - * console.log(result.score.alternative_grade); // "6-8" - * console.log(result.score.scaffolding_needed); + * console.log(result.score); // "9-10" + * console.log(result._internal.alternative_grade); // "6-8" + * console.log(result._internal.scaffolding_needed); * ``` */ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { @@ -61,7 +61,6 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { type: 'google', model: 'gemini-2.5-pro', apiKey: config.googleApiKey, - temperature: 0.25, maxRetries: this.config.maxRetries, }); } @@ -79,7 +78,7 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { * @throws {ValidationError} If text is empty or too short/long * @throws {APIError} If LLM API calls fail (includes AuthenticationError, RateLimitError, NetworkError, TimeoutError) */ - async evaluate(text: string): Promise> { + async evaluate(text: string): Promise> { this.logger.info('Starting grade level appropriateness evaluation', { evaluator: 'grade-level-appropriateness', operation: 'evaluate', @@ -114,7 +113,7 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { }; const result = { - score: response.data, + score: response.data.grade, reasoning: response.data.reasoning, metadata: { promptVersion: '1.2.0', @@ -122,6 +121,7 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { timestamp: new Date(), processingTimeMs: latencyMs, }, + _internal: response.data, }; // Send success telemetry (fire-and-forget) @@ -140,7 +140,7 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { this.logger.info('Grade level appropriateness evaluation completed successfully', { evaluator: 'grade-level-appropriateness', operation: 'evaluate', - grade: result.score.grade, + grade: result.score, processingTimeMs: latencyMs, }); @@ -195,7 +195,7 @@ export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator { export async function evaluateGradeLevelAppropriateness( text: string, config: GradeLevelAppropriatenessEvaluatorConfig -): Promise> { +): Promise> { const evaluator = new GradeLevelAppropriatenessEvaluator(config); return evaluator.evaluate(text); } diff --git a/sdks/typescript/src/schemas/grade-level-appropriateness.ts b/sdks/typescript/src/schemas/grade-level-appropriateness.ts index d18d207..e23e638 100644 --- a/sdks/typescript/src/schemas/grade-level-appropriateness.ts +++ b/sdks/typescript/src/schemas/grade-level-appropriateness.ts @@ -17,8 +17,8 @@ export const GradeLevelAppropriatenessSchema = z.object({ .describe( 'Your reasoning for your answer in numbered bullet points for 4 steps with a 4th bullet point for synthesis.' ), - grade: z.string().describe('The appropriate grade level for the text'), - alternative_grade: z.string().describe('An alternative grade level for the text'), + grade: GradeBand.describe('The appropriate grade level for the text'), + alternative_grade: GradeBand.describe('An alternative grade level for the text'), scaffolding_needed: z .string() .describe('Scaffolding needed for the text to be appropriate for the alternative grade'), diff --git a/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts index 38e5c68..443f0a8 100644 --- a/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts +++ b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts @@ -89,10 +89,11 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { const result = await evaluator.evaluate(testText); // Verify result structure - expect(result.score).toBeDefined(); - expect(result.score.grade).toBe('6-8'); - expect(result.score.alternative_grade).toBe('4-5'); - expect(result.score.scaffolding_needed).toContain('gravitational forces'); + expect(result.score).toBe('6-8'); + expect(result._internal).toBeDefined(); + expect(result._internal!.grade).toBe('6-8'); + expect(result._internal!.alternative_grade).toBe('4-5'); + expect(result._internal!.scaffolding_needed).toContain('gravitational forces'); expect(result.reasoning).toContain('gravitational forces'); expect(result.metadata).toBeDefined(); expect(result.metadata.model).toBe('google:gemini-2.5-pro'); @@ -140,12 +141,16 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { expect(result).toHaveProperty('score'); expect(result).toHaveProperty('reasoning'); expect(result).toHaveProperty('metadata'); + expect(result).toHaveProperty('_internal'); - // Verify score structure (GradeLevelAppropriateness) - expect(result.score).toHaveProperty('grade'); - expect(result.score).toHaveProperty('alternative_grade'); - expect(result.score).toHaveProperty('scaffolding_needed'); - expect(result.score).toHaveProperty('reasoning'); + // Verify score is the grade string + expect(result.score).toBe('9-10'); + + // Verify _internal structure (GradeLevelAppropriateness) + expect(result._internal).toHaveProperty('grade'); + expect(result._internal).toHaveProperty('alternative_grade'); + expect(result._internal).toHaveProperty('scaffolding_needed'); + expect(result._internal).toHaveProperty('reasoning'); // Verify metadata structure expect(result.metadata).toHaveProperty('promptVersion'); @@ -159,10 +164,10 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { expect(result.metadata.timestamp).toBeInstanceOf(Date); expect(result.metadata.processingTimeMs).toBeGreaterThanOrEqual(0); - // Verify score values - expect(result.score.grade).toBe('9-10'); - expect(result.score.alternative_grade).toBe('6-8'); - expect(result.score.scaffolding_needed).toBeTruthy(); + // Verify _internal values + expect(result._internal!.grade).toBe('9-10'); + expect(result._internal!.alternative_grade).toBe('6-8'); + expect(result._internal!.scaffolding_needed).toBeTruthy(); }); }); }); From 53f69f356fa15853de73c8b3bbd6b438de41b356 Mon Sep 17 00:00:00 2001 From: Adnan Rashid Hussain Date: Mon, 2 Mar 2026 23:18:43 -0800 Subject: [PATCH 4/4] re: feedback --- .../prompts/grade-level-appropriateness/index.ts | 4 +++- .../src/prompts/sentence-structure/analysis.ts | 3 ++- .../src/prompts/sentence-structure/complexity.ts | 3 ++- .../evaluators/grade-level-appropriateness.test.ts | 14 +++++++++++++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts b/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts index 0939993..4192b62 100644 --- a/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts +++ b/sdks/typescript/src/prompts/grade-level-appropriateness/index.ts @@ -15,5 +15,7 @@ export function getSystemPrompt(): string { * @returns The formatted user prompt */ export function getUserPrompt(text: string): string { - return USER_PROMPT_TEMPLATE.replace('{text}', text); + return USER_PROMPT_TEMPLATE + .replace('{text}', text) + .replace('{format_instructions}', ''); } diff --git a/sdks/typescript/src/prompts/sentence-structure/analysis.ts b/sdks/typescript/src/prompts/sentence-structure/analysis.ts index c1fb7b7..f5e8c7f 100644 --- a/sdks/typescript/src/prompts/sentence-structure/analysis.ts +++ b/sdks/typescript/src/prompts/sentence-structure/analysis.ts @@ -18,5 +18,6 @@ export function getSystemPromptAnalysis(): string { export function getUserPromptAnalysis(text: string, groundTruthCounts: string): string { return USER_PROMPT_ANALYSIS_TEMPLATE .replace('{text}', text) - .replace('{ground_truth_counts}', groundTruthCounts); + .replace('{ground_truth_counts}', groundTruthCounts) + .replace('{format_instructions}', ''); } diff --git a/sdks/typescript/src/prompts/sentence-structure/complexity.ts b/sdks/typescript/src/prompts/sentence-structure/complexity.ts index 361a69a..32189ea 100644 --- a/sdks/typescript/src/prompts/sentence-structure/complexity.ts +++ b/sdks/typescript/src/prompts/sentence-structure/complexity.ts @@ -48,5 +48,6 @@ export function getUserPromptComplexity( .replace('{sentence_features}', sentenceFeatures) .replace('{grade}', grade) .replace('{rubric}', rubric) - .replace('{excerpt}', excerpt); + .replace('{excerpt}', excerpt) + .replace('{format_instructions}', ''); } diff --git a/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts index 443f0a8..7e04d9b 100644 --- a/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts +++ b/sdks/typescript/tests/unit/evaluators/grade-level-appropriateness.test.ts @@ -1,6 +1,6 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; import { GradeLevelAppropriatenessEvaluator } from '../../../src/evaluators/grade-level-appropriateness.js'; -import { ConfigurationError } from '../../../src/errors.js'; +import { ConfigurationError, ValidationError } from '../../../src/errors.js'; import type { LLMProvider } from '../../../src/providers/base.js'; /** @@ -105,6 +105,18 @@ describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => { }); + describe('Input Validation', () => { + it('should throw ValidationError for empty text', async () => { + await expect(evaluator.evaluate('')) + .rejects.toThrow(ValidationError); + }); + + it('should throw ValidationError for text that is too short', async () => { + await expect(evaluator.evaluate('Hi')) + .rejects.toThrow(ValidationError); + }); + }); + describe('Error Handling', () => { it('should handle API failure', async () => { const testText = 'Test text here for API failure';