diff --git a/sdks/typescript/README.md b/sdks/typescript/README.md index 1fc51cb..3965e82 100644 --- a/sdks/typescript/README.md +++ b/sdks/typescript/README.md @@ -121,6 +121,52 @@ await evaluator.evaluate(text: string, grade: string) --- +### 3. Grade Level Appropriateness Evaluator + +Determines appropriate grade level for text. + +**No grade parameter required** - evaluates what grade the text is appropriate for. + +**Uses:** Google Gemini 2.5 Pro + +**Constructor:** +```typescript +const evaluator = new GradeLevelAppropriatenessEvaluator({ + googleApiKey: string; // Required - Google API key + maxRetries?: number; // Optional - Max retry attempts (default: 2) + telemetry?: boolean | TelemetryOptions; // Optional (default: true) + logger?: Logger; // Optional - Custom logger + logLevel?: LogLevel; // Optional - Logging verbosity (default: WARN) +}); +``` + +**API:** +```typescript +await evaluator.evaluate(text: string) +``` + +**Returns:** +```typescript +{ + score: string; // e.g., 'K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR' + reasoning: string; + metadata: { + promptVersion: string; + model: string; + timestamp: Date; + processingTimeMs: number; + }; + _internal: { + grade: string; + alternative_grade: string; + scaffolding_needed: string; + reasoning: string; + }; +} +``` + +--- + ## Error Handling The SDK provides specific error types to help you handle different scenarios: diff --git a/sdks/typescript/src/evaluators/grade-level-appropriateness.ts b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts new file mode 100644 index 0000000..525e162 --- /dev/null +++ b/sdks/typescript/src/evaluators/grade-level-appropriateness.ts @@ -0,0 +1,201 @@ +import type { LLMProvider } from '../providers/index.js'; +import { createProvider } from '../providers/index.js'; +import { + GradeLevelAppropriatenessSchema, + type GradeLevelAppropriateness, +} from '../schemas/grade-level-appropriateness.js'; +import { getSystemPrompt, getUserPrompt } from 
'../prompts/grade-level-appropriateness/index.js';
import type { EvaluationResult } from '../schemas/index.js';
import { BaseEvaluator, type BaseEvaluatorConfig } from './base.js';
import { ConfigurationError, ValidationError, wrapProviderError } from '../errors.js';

/** Identifier reported by getEvaluatorType() and attached to every log entry. */
const EVALUATOR_TYPE = 'grade-level-appropriateness';

/** Model name handed to the provider factory. */
const MODEL_NAME = 'gemini-2.5-pro';

/** Provider-qualified model label reported in result metadata and telemetry. */
const PROVIDER_MODEL = 'google:gemini-2.5-pro';

/** Value reported as `metadata.promptVersion`. */
const PROMPT_VERSION = '1.2.0';

/**
 * Result shape produced by this evaluator.
 *
 * NOTE(review): the generic arguments are reconstructed from how the result is
 * built below (`score` is the grade string, `_internal` is the full structured
 * response) — confirm they match the declaration of EvaluationResult.
 */
type GradeLevelResult = EvaluationResult<string, GradeLevelAppropriateness>;

/**
 * Configuration for GradeLevelAppropriatenessEvaluator
 */
export interface GradeLevelAppropriatenessEvaluatorConfig extends BaseEvaluatorConfig {
  /** Google API key for grade level evaluation (uses Gemini 2.5 Pro) */
  googleApiKey: string;
}

/**
 * Grade Level Appropriateness Evaluator
 *
 * Determines which grade band a text is appropriate for. No target grade is
 * supplied by the caller; the grade band is the output of the evaluation.
 *
 * The structured response contains:
 * - `grade`: the recommended grade band (K-1, 2-3, 4-5, 6-8, 9-10, 11-CCR)
 * - `alternative_grade`: a second grade band the text could serve with support
 * - `scaffolding_needed`: the support required for the alternative grade band
 * - `reasoning`: the model's explanation
 *
 * `score` mirrors `grade`; the complete response is exposed as `_internal`.
 *
 * @example
 * ```typescript
 * const evaluator = new GradeLevelAppropriatenessEvaluator({
 *   googleApiKey: process.env.GOOGLE_API_KEY
 * });
 *
 * const result = await evaluator.evaluate(text);
 * console.log(result.score); // "9-10"
 * console.log(result._internal.alternative_grade); // "6-8"
 * console.log(result._internal.scaffolding_needed);
 * ```
 */
export class GradeLevelAppropriatenessEvaluator extends BaseEvaluator {
  private provider: LLMProvider;

  /**
   * @param config - Evaluator configuration; `googleApiKey` is mandatory
   * @throws {ConfigurationError} If `googleApiKey` is missing or empty
   */
  constructor(config: GradeLevelAppropriatenessEvaluatorConfig) {
    // Base constructor performs the shared setup (telemetry, logger, config).
    super(config);

    if (!config.googleApiKey) {
      throw new ConfigurationError('Google API key is required. Pass googleApiKey in config.');
    }

    this.provider = createProvider({
      type: 'google',
      model: MODEL_NAME,
      apiKey: config.googleApiKey,
      maxRetries: this.config.maxRetries,
    });
  }

  /** Identifies this evaluator to the base class. */
  protected getEvaluatorType(): string {
    return EVALUATOR_TYPE;
  }

  /**
   * Evaluate grade level appropriateness for a given text
   *
   * @param text - The text to evaluate
   * @returns Evaluation result with the recommended grade band, an alternative
   *   grade band and the scaffolding it would need
   * @throws {ValidationError} If `validateText` rejects the text; re-thrown unchanged
   * @throws Any other failure is passed through `wrapProviderError` before being thrown
   */
  async evaluate(text: string): Promise<GradeLevelResult> {
    // Untyped callers can pass a non-string; reading `.length` on it here would
    // throw before validation and telemetry get a chance to run.
    const textLength = typeof text === 'string' ? text.length : 0;

    this.logger.info('Starting grade level appropriateness evaluation', {
      evaluator: EVALUATOR_TYPE,
      operation: 'evaluate',
      textLength,
    });

    const startTime = Date.now();

    try {
      // Validation happens inside the try so that rejected inputs are also
      // reported through the failure telemetry below.
      this.validateText(text);

      this.logger.debug('Evaluating grade level appropriateness', {
        evaluator: EVALUATOR_TYPE,
        operation: 'grade_evaluation',
      });

      const userPrompt = getUserPrompt(text);

      const response = await this.provider.generateStructured({
        messages: [
          { role: 'system', content: getSystemPrompt() },
          { role: 'user', content: userPrompt },
        ],
        schema: GradeLevelAppropriatenessSchema,
        temperature: 0.25,
      });

      const latencyMs = Date.now() - startTime;

      const tokenUsage = {
        input_tokens: response.usage.inputTokens,
        output_tokens: response.usage.outputTokens,
      };

      const result = {
        score: response.data.grade,
        reasoning: response.data.reasoning,
        metadata: {
          promptVersion: PROMPT_VERSION,
          model: PROVIDER_MODEL,
          timestamp: new Date(),
          processingTimeMs: latencyMs,
        },
        _internal: response.data,
      };

      // Fire-and-forget: a telemetry failure must never fail the evaluation.
      void this.sendTelemetry({
        status: 'success',
        latencyMs,
        textLength,
        provider: PROVIDER_MODEL,
        tokenUsage,
        // No metadata.stage_details for single-stage evaluator
        inputText: text,
      }).catch(() => {
        // Ignore telemetry errors
      });

      this.logger.info('Grade level appropriateness evaluation completed successfully', {
        evaluator: EVALUATOR_TYPE,
        operation: 'evaluate',
        grade: result.score,
        processingTimeMs: latencyMs,
      });

      return result;
    } catch (error) {
      const latencyMs = Date.now() - startTime;

      this.logger.error('Grade level appropriateness evaluation failed', {
        evaluator: EVALUATOR_TYPE,
        operation: 'evaluate',
        error: error instanceof Error ? error : undefined,
        processingTimeMs: latencyMs,
      });

      // Fire-and-forget: the original error is what the caller must see.
      void this.sendTelemetry({
        status: 'error',
        latencyMs,
        textLength,
        provider: PROVIDER_MODEL,
        errorCode: error instanceof Error ? error.name : 'UnknownError',
        inputText: text,
      }).catch(() => {
        // Ignore telemetry errors
      });

      // Validation errors already carry the right type; surface them untouched.
      if (error instanceof ValidationError) {
        throw error;
      }

      // Everything else is mapped onto the SDK's error types.
      throw wrapProviderError(error, 'Grade level appropriateness evaluation failed');
    }
  }
}

/**
 * Functional API for grade level appropriateness evaluation
 *
 * Builds a one-off GradeLevelAppropriatenessEvaluator from `config` and runs
 * a single evaluation with it.
 *
 * @param text - The text to evaluate
 * @param config - Evaluator configuration; `googleApiKey` is mandatory
 * @returns The same result as `GradeLevelAppropriatenessEvaluator.evaluate`
 *
 * @example
 * ```typescript
 * const result = await evaluateGradeLevelAppropriateness(
 *   "Tides are the rise and fall of sea levels...",
 *   {
 *     googleApiKey: process.env.GOOGLE_API_KEY
 *   }
 * );
 * ```
 */
export async function evaluateGradeLevelAppropriateness(
  text: string,
  config: GradeLevelAppropriatenessEvaluatorConfig
): Promise<GradeLevelResult> {
  const evaluator = new GradeLevelAppropriatenessEvaluator(config);
  return evaluator.evaluate(text);
}
import SYSTEM_PROMPT_TEMPLATE from '../../../../../evals/prompts/grade-level-appropriateness/system.txt';
import USER_PROMPT_TEMPLATE from '../../../../../evals/prompts/grade-level-appropriateness/user.txt';

/**
 * Get the system prompt for grade level appropriateness evaluation
 * @returns The system prompt, exactly as stored in the template file
 */
export function getSystemPrompt(): string {
  return SYSTEM_PROMPT_TEMPLATE;
}

/**
 * Get the user prompt with the text to evaluate
 *
 * The first `{text}` placeholder in the template is replaced with `text`, and
 * the first `{format_instructions}` placeholder is removed.
 *
 * @param text - The text to evaluate for grade level appropriateness
 * @returns The formatted user prompt
 */
export function getUserPrompt(text: string): string {
  return (
    USER_PROMPT_TEMPLATE
      // Strip the template's own placeholder before the user text is inserted,
      // so a literal "{format_instructions}" inside `text` is left untouched.
      .replace('{format_instructions}', '')
      // A replacer function inserts `text` verbatim. A plain string replacement
      // would expand `$&`, `$1`, `$$`, "$`" and "$'" found in the user text.
      .replace('{text}', () => text)
  );
}
import { z } from 'zod';

/**
 * Valid grade bands for grade level appropriateness evaluation
 */
export const GradeBand = z.enum(['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR']);

/** Union of the grade band literals accepted by the GradeBand enum schema. */
export type GradeBand = z.infer<typeof GradeBand>;

/**
 * Output schema for Grade Level Appropriateness evaluation
 * Matches Python OutputRanges model
 *
 * The `.describe()` strings are part of the schema, so they are kept verbatim.
 */
export const GradeLevelAppropriatenessSchema = z.object({
  reasoning: z
    .string()
    .describe(
      'Your reasoning for your answer in numbered bullet points for 4 steps with a 4th bullet point for synthesis.'
    ),
  grade: GradeBand.describe('The appropriate grade level for the text'),
  alternative_grade: GradeBand.describe('An alternative grade level for the text'),
  scaffolding_needed: z
    .string()
    .describe('Scaffolding needed for the text to be appropriate for the alternative grade'),
});

/** Parsed shape of a response validated by GradeLevelAppropriatenessSchema. */
export type GradeLevelAppropriateness = z.infer<typeof GradeLevelAppropriatenessSchema>;
// Stub the telemetry client so the suite never issues real HTTP requests.
vi.mock('../../../src/telemetry/client.js', () => ({
  TelemetryClient: class MockTelemetryClient {
    send = vi.fn().mockResolvedValue(undefined);
  },
}));

describe('GradeLevelAppropriatenessEvaluator - Constructor Validation', () => {
  it('should throw ConfigurationError when Google API key is missing', () => {
    const constructWithoutKey = () =>
      new GradeLevelAppropriatenessEvaluator({
        googleApiKey: '',
      });

    expect(constructWithoutKey).toThrow(ConfigurationError);
  });
});

describe('GradeLevelAppropriatenessEvaluator - Evaluation Flow', () => {
  let sut: GradeLevelAppropriatenessEvaluator;
  let stubbedProvider: LLMProvider;

  /**
   * Makes the stubbed provider resolve with `data`, wrapped in the fixed
   * model / usage / latency envelope shared by every success scenario.
   */
  const respondWith = (data: Record<string, string>) =>
    vi.mocked(stubbedProvider.generateStructured).mockResolvedValue({
      data,
      model: 'gemini-2.5-pro',
      usage: {
        inputTokens: 200,
        outputTokens: 150,
      },
      latencyMs: 800,
    });

  beforeEach(() => {
    vi.clearAllMocks();

    // The provider factory is mocked, so this instance holds a stub provider.
    sut = new GradeLevelAppropriatenessEvaluator({
      googleApiKey: 'test-google-key',
      telemetry: false,
    });

    // Grab the stub the evaluator created so each test can program it.
    // @ts-expect-error Accessing private property for testing
    stubbedProvider = sut.provider;
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  describe('Successful Evaluation Flow', () => {
    it('should successfully evaluate text', async () => {
      const passage =
        'Tides are the rise and fall of sea levels caused by the combined effects of the gravitational forces exerted by the Moon and the Sun.';

      respondWith({
        grade: '6-8',
        alternative_grade: '4-5',
        scaffolding_needed: 'Pre-teach gravitational forces; Use visual diagrams of moon-sun-earth system',
        reasoning:
          'The text discusses gravitational forces and celestial mechanics, which are appropriate for middle school science curriculum.',
      });

      // Only the text is passed; this evaluator takes no grade argument.
      const outcome = await sut.evaluate(passage);

      // Top-level score and the raw structured response
      expect(outcome.score).toBe('6-8');
      expect(outcome._internal).toBeDefined();
      expect(outcome._internal!.grade).toBe('6-8');
      expect(outcome._internal!.alternative_grade).toBe('4-5');
      expect(outcome._internal!.scaffolding_needed).toContain('gravitational forces');
      expect(outcome.reasoning).toContain('gravitational forces');

      // Metadata
      expect(outcome.metadata).toBeDefined();
      expect(outcome.metadata.model).toBe('google:gemini-2.5-pro');
      expect(outcome.metadata.processingTimeMs).toBeGreaterThanOrEqual(0);

      // Single-stage evaluator: exactly one provider call
      expect(stubbedProvider.generateStructured).toHaveBeenCalledTimes(1);
    });
  });

  describe('Input Validation', () => {
    it('should throw ValidationError for empty text', async () => {
      const pending = sut.evaluate('');
      await expect(pending).rejects.toThrow(ValidationError);
    });

    it('should throw ValidationError for text that is too short', async () => {
      const pending = sut.evaluate('Hi');
      await expect(pending).rejects.toThrow(ValidationError);
    });
  });

  describe('Error Handling', () => {
    it('should handle API failure', async () => {
      const passage = 'Test text here for API failure';

      // Simulate the provider call rejecting.
      vi.mocked(stubbedProvider.generateStructured).mockRejectedValue(new Error('API timeout'));

      // The rejection must reach the caller.
      await expect(sut.evaluate(passage)).rejects.toThrow('API timeout');
    });
  });

  describe('Response Structure', () => {
    it('should return correct result structure', async () => {
      respondWith({
        grade: '9-10',
        alternative_grade: '6-8',
        scaffolding_needed: 'Pre-teach advanced vocabulary; Provide background context',
        reasoning: 'Detailed reasoning about grade appropriateness',
      });

      const outcome = await sut.evaluate('Test text here');

      // Top-level keys
      for (const key of ['score', 'reasoning', 'metadata', '_internal']) {
        expect(outcome).toHaveProperty(key);
      }

      // The score is the grade string itself
      expect(outcome.score).toBe('9-10');

      // _internal carries the full structured response
      for (const key of ['grade', 'alternative_grade', 'scaffolding_needed', 'reasoning']) {
        expect(outcome._internal).toHaveProperty(key);
      }

      // Metadata keys
      for (const key of ['promptVersion', 'model', 'timestamp', 'processingTimeMs']) {
        expect(outcome.metadata).toHaveProperty(key);
      }

      // Metadata values
      const { promptVersion, model, timestamp, processingTimeMs } = outcome.metadata;
      expect(promptVersion).toBe('1.2.0');
      expect(model).toBe('google:gemini-2.5-pro');
      expect(timestamp).toBeInstanceOf(Date);
      expect(processingTimeMs).toBeGreaterThanOrEqual(0);

      // _internal values
      expect(outcome._internal!.grade).toBe('9-10');
      expect(outcome._internal!.alternative_grade).toBe('6-8');
      expect(outcome._internal!.scaffolding_needed).toBeTruthy();
    });
  });
});