From 1b61e94cc31a3093bf0b7bca32e79ec70edef257 Mon Sep 17 00:00:00 2001
From: Adnan Rashid Hussain <ahussain@chanzuckerberg.com>
Date: Fri, 20 Feb 2026 13:56:46 -0800
Subject: [PATCH 1/3] feat: add Subject Matter Knowledge (SMK) evaluator

---
 evals/prompts/README.md      |   4 +
 evals/prompts/smk_prompts.py |  75 +++++++++++++++++
 evals/smk_evaluator.ipynb    | 156 +++++++++++++++++++++++++++++++++++
 3 files changed, 235 insertions(+)
 create mode 100644 evals/prompts/smk_prompts.py
 create mode 100644 evals/smk_evaluator.ipynb

diff --git a/evals/prompts/README.md b/evals/prompts/README.md
index fb7ec24..76d61e8 100644
--- a/evals/prompts/README.md
+++ b/evals/prompts/README.md
@@ -14,3 +14,7 @@ In file `sent_str_prompts.py`, we provide system and user prompts used in the Se
 ### Vocabulary Evaluator
 
 In file `vocab_prompts.py`, we provide system and user prompts used in the Vocabulary Evaluator's code. These prompts can help assess the difficulty of vocabulary in texts and serve as a starting point for your own prompt development.
+
+### Subject Matter Knowledge Evaluator
+
+In file `smk_prompts.py`, we provide system and user prompts used in the Subject Matter Knowledge (SMK) Evaluator's code. These prompts can help assess the background knowledge demands of texts and serve as a starting point for your own prompt development.
diff --git a/evals/prompts/smk_prompts.py b/evals/prompts/smk_prompts.py
new file mode 100644
index 0000000..4cfa3d9
--- /dev/null
+++ b/evals/prompts/smk_prompts.py
@@ -0,0 +1,75 @@
+
+smk_system_prompt = """
+To perform the task of evaluating text complexity based on Subject Matter Knowledge (SMK), strictly adhere to the following instructions.
+Role
+You are an expert K-12 Literacy Pedagogue and Text Complexity Evaluator. Your specific focus is analyzing Subject Matter Knowledge (SMK) demands according to the Common Core Qualitative Text Complexity Rubric.
+Objective
+Analyze a provided text relative to a target grade_level. You must determine the extent of background knowledge required to comprehend the text. You must distinguish between Common/Standard knowledge (generally lower/moderate complexity) and Specialized/Theoretical knowledge (generally higher complexity).
+Input Data
+text: The passage to analyze.
+grade_level: The target student grade (integer).
+fk_score: Flesch-Kincaid Grade Level. Note: Use this only as a loose proxy for sentence structure. Do not let a high FK score artificially inflate the Subject Matter Knowledge score if the concepts remain simple.
+
+1. The Rubric: Subject Matter Knowledge (SMK)
+1. Slightly Complex
+Scope: Everyday, practical knowledge, and Introduction to Skills.
+Concept Type: Concrete, directly observable, and familiar.
+Key Indicator: "How-to" texts involving familiar objects (e.g., drawing a cupboard, playing a game, family life). Even if specific terms (like "scale" or "measure") are used, if the application is on a common object, it remains Slightly Complex.
+2. Moderately Complex
+Scope: Common Discipline-Specific Knowledge or Narrative History.
+Definition: Topics widely introduced in K-8 curricula (Basic American History, Geography, Earth Science, Biology).
+Key Characteristic: The text bridges concrete descriptions with abstract themes (e.g., using farming to discuss justice), OR narrates historical events via sensory details.
+Spatial Reasoning: Texts requiring mental manipulation of maps/routes are generally Moderate, unless the object is a familiar household item (see Slightly Complex).
+3. Very Complex
+Scope: Specialized Discipline-Specific, Engineering Mechanics, or Political Theory.
+Definition: Topics characteristic of High School (9-12) curricula requiring abstract mental models.
+Key Characteristic: Requires understanding mechanisms (how physics works/propulsion), chemical composition, or undefined political stakes (specific treaties, alliances, or secularization without context).
+4. Exceedingly Complex
+Scope: Professional or Academic knowledge.
+
+2. The Expert Mental Model (Decision Logic)
+Use these refined rules to categorize cases.
+Rule A: The "Layers of Meaning" Check
+Concrete -> Abstract (Moderate): The text describes concrete things (farming) to argue an abstract point (justice, rights).
+Concrete -> Concrete (Slightly): The text describes concrete things (lines, paper) to achieve a concrete result (drawing a cupboard). Do not over-rank practical instructions.
+Rule B: The Science & Engineering Boundary
+Observational (Moderate): Habitats, Water Cycle, observable traits, simple definitions.
+Mechanistic/Theoretical (Very): Engineering mechanics (how propulsion works via reaction), Instrumentation (using a spectroscope), or Chemical/Atomic theory.
+Test: Does the text explain how a machine functions using physical principles? If yes, it is Very Complex.
+Rule C: The History/Social Studies Boundary
+General/Narrative (Moderate):
+Sensory: Battle descriptions focusing on sights/sounds (flashes, smoke).
+Standard Topics: Immigration, Slavery, Government, Geography. Lists of nationalities or religions are "Common Knowledge" for Grades 6-8.
+Political/Contextual (Very):
+Implicit Context: Texts assuming knowledge of specific political factions, treaties, or the causes of events without explanation (e.g., "The Allies," "The Front," "The secularization of the clergy").
+Test: If the reader must know why two groups are fighting or the specific political history of a revolution to understand the text, it is Very Complex.
+Rule D: The "Technical vs. Practical" Trap
+Scenario: A text teaches a technical skill (e.g., Technical Drawing/Technology) but applies it to a familiar object (a cupboard).
+Decision: Slightly Complex.
+Reasoning: Do not confuse "Technical Vocabulary" (scale, thick lines) with "Theoretical Complexity." If the underlying concept is familiar (furniture), the SMK load is low.
+
+3. Critical Calibration Examples
+Text: "Make a rough sketch... How many shelves should the cupboard have?" (Grade 2) -> Slightly Complex.
+Reasoning: (Rule D/Rule A) Although it mentions "scale" and "technology," the task is concrete and relies on everyday knowledge.
+Text: "Hydraulic propulsion works by sucking water at the bow and forcing it sternward." (Grade 10) -> Very Complex.
+Reasoning: (Rule B) Explains a mechanism using physics principles.
+Text: "The Allies fight the enemy's cavalry; we remember the hospitality to priests during the Revolution." (Grade 6) -> Very Complex.
+Reasoning: (Rule C) Assumes undefined knowledge of WWI alliances and the specific political history of the French Revolution.
+Text: "Immigrants from Poland, Italy, and Russia arrived. Most were Catholic or Orthodox." (Grade 7) -> Moderately Complex.
+Reasoning: (Rule C) Standard K-8 topic. Lists of nationalities are content vocabulary, not specialized theory.
+
+4. Output Format
+Return your analysis in a valid JSON object. Do not include markdown formatting.
+Keys:
+- identified_topics: List[str] identifying the core subjects.
+- curriculum_check: String explaining if the topics are "Standard/General" (typical for K-8) or "Specialized/High School" (typical for 9-12).
+- assumptions_and_scaffolding: String analyzing what the author assumes the reader knows vs what is explained.
+- friction_analysis: String discussing the gap between Concrete description and Abstract meaning.
+- complexity_score: String (One of: slightly_complex, moderately_complex, very_complex, exceedingly_complex).
+- reasoning: String synthesizing the decision.
+"""
+
+smk_user_prompt = """Analyze:
+Text: {text}
+Grade: {grade}
+FK Score: {fk_score}"""
diff --git a/evals/smk_evaluator.ipynb b/evals/smk_evaluator.ipynb
new file mode 100644
index 0000000..765708a
--- /dev/null
+++ b/evals/smk_evaluator.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Subject Matter Knowledge (SMK) Evaluator\n",
+    "\n",
+    "**The Subject Matter Knowledge (SMK) Evaluator** assesses the background knowledge demands of informational text for students in grades 3–11. When you run a passage through the evaluator, it returns a structured output that includes:\n",
+    "\n",
+    "* **complexity_score**: The SMK complexity level (Slightly to Exceedingly Complex).\n",
+    "* **identified_topics**: The core subjects and concepts present in the text.\n",
+    "* **curriculum_check**: Whether topics are standard K-8 content or high school–level specialised knowledge.\n",
+    "* **assumptions_and_scaffolding**: What the author assumes the reader already knows vs. what is explained.\n",
+    "* **friction_analysis**: Whether reading difficulty stems from vocabulary/structure or actual knowledge demands.\n",
+    "* **reasoning**: A synthesis of why the text fits the chosen rubric level.\n",
+    "\n",
+    "This gives you a clear signal about the knowledge demands of a passage, helping ensure AI-generated content is appropriate for the target grade."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Install & Load necessary packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -qU langchain-google-genai langchain pydantic python-dotenv textstat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "from typing import List, Literal\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "from langchain_core.messages import SystemMessage\n",
+    "from langchain_core.output_parsers import JsonOutputParser\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "from langchain_core.prompts.chat import HumanMessagePromptTemplate\n",
+    "from langchain_google_genai import ChatGoogleGenerativeAI\n",
+    "from pydantic import BaseModel, Field\n",
+    "from textstat import textstat as ts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up the evaluator's model and prompts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from prompts import smk_prompts as prompts\n",
+    "\n",
+    "# Set your api key in your environment, .env file, or enter when prompted.\n",
+    "# os.environ['GOOGLE_API_KEY'] = 'YOUR API KEY'\n",
+    "load_dotenv()\n",
+    "\n",
+    "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
+    "    os.environ[\"GOOGLE_API_KEY\"] = getpass.getpass(\"Enter your Google API key: \")\n",
+    "\n",
+    "MODEL_NAME = \"gemini-3-flash-preview\"\n",
+    "TEMPERATURE = 0\n",
+    "model = ChatGoogleGenerativeAI(model=MODEL_NAME, temperature=TEMPERATURE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up the output structure"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "class SmkOutput(BaseModel):\n    identified_topics: List[str] = Field(\n        description=\"List of major subjects/concepts found in the text.\"\n    )\n    curriculum_check: str = Field(\n        description=\"Analysis of whether these topics are new or review based on the Grade Level and Reference list.\"\n    )\n    assumptions_and_scaffolding: str = Field(\n        description=\"Analysis of what the author assumes vs. what is explained (and if definitions are provided).\"\n    )\n    friction_analysis: str = Field(\n        description=\"Explicit statement distinguishing if difficulty comes from Vocabulary/Structure or actual Knowledge.\"\n    )\n    complexity_score: Literal[\n        \"slightly_complex\",\n        \"moderately_complex\",\n        \"very_complex\",\n        \"exceedingly_complex\"\n    ] = Field(description=\"The subject matter knowledge complexity level of the text\")\n    reasoning: str = Field(\n        description=\"A synthesis of why the text fits the chosen rubric level.\"\n    )\n\n\nprompt_vars = {\n    \"inputVars\": [\"text\", \"grade\", \"fk_score\"],\n    \"outputParser\": JsonOutputParser(pydantic_object=SmkOutput),\n}"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define text complexity evaluation function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "def calculate_fk_score(text) -> float:\n    \"\"\"\n    Calculate the Flesch-Kincaid Grade Level\n    \"\"\"\n    fk_score = round(ts.flesch_kincaid_grade(text), 2)\n\n    return fk_score\n\n\ndef predict_text_complexity_level(text, grade):\n    dataset = {\n        \"text\": text,\n        \"grade\": grade,\n        \"fk_score\": calculate_fk_score(text),\n    }\n\n    messages = [\n        SystemMessage(content=prompts.smk_system_prompt),\n        HumanMessagePromptTemplate.from_template(prompts.smk_user_prompt),\n    ]\n\n    prompt = ChatPromptTemplate(\n        messages,\n        input_variables=prompt_vars[\"inputVars\"],\n        partial_variables={\n            \"format_instructions\": prompt_vars[\"outputParser\"].get_format_instructions()\n        },\n    )\n\n    chain = prompt | model | JsonOutputParser()\n    return chain.invoke(dataset)"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Test out examples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add your text & the grade level you want to evaluate for SMK complexity\n",
+    "\n",
+    "sample_text = \"\"\"\n",
+    "\"Well, then,\" said the teacher, \"you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition.\" Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. \"Well, I know what that is,\" he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, \"Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip.\"\n",
+    "\"\"\"\n",
+    "\n",
+    "result = predict_text_complexity_level(sample_text, 4)\n",
+    "display(result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can copy or edit the above cell to test out different texts and grade levels."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file

From d75fa3549c9064d792328e99fb256845949f9d5b Mon Sep 17 00:00:00 2001
From: Adnan Rashid Hussain <ahussain@chanzuckerberg.com>
Date: Fri, 20 Feb 2026 15:07:54 -0800
Subject: [PATCH 2/3] Update Text Complexity Combo Eval notebook to include SMK

---
 evals/text_complexity_combo.ipynb | 105 ++++++++++++++++++++++++++++--
 1 file changed, 99 insertions(+), 6 deletions(-)

diff --git a/evals/text_complexity_combo.ipynb b/evals/text_complexity_combo.ipynb
index cc2ebcf..32e126c 100644
--- a/evals/text_complexity_combo.ipynb
+++ b/evals/text_complexity_combo.ipynb
@@ -6,11 +6,13 @@
    "source": [
     "# Text Complexity Combo Evaluator (Early Release)\n",
     "\n",
-    "This evaluator combines two distinct evaluators: the **Sentence Structure Evaluator** and the **Vocabulary Evaluator**. It is designed to assess the complexity of texts for educational purposes, particularly in relation to grade-level expectations.\n",
+    "This evaluator combines three distinct evaluators: the **Sentence Structure Evaluator**, the **Vocabulary Evaluator**, and the **Subject Matter Knowledge (SMK) Evaluator**. It is designed to assess the complexity of texts for educational purposes, particularly in relation to grade-level expectations.\n",
     "\n",
     "**The Sentence Structure Evaluator** analyzes the syntactic complexity of a passage, helping to identify whether sentence constructions are appropriate for the intended grade level.\n",
     "\n",
-    "**The Vocabulary Evaluator** gives developers the fine-grained insight they need but can’t get from traditional tools. It helps determine whether texts use words that align with grade-level expectations and support growth in academic language. This ensures students are consistently exposed to the kinds of vocabulary that build knowledge and enable them to fully engage with grade-level texts.\n"
+    "**The Vocabulary Evaluator** gives developers the fine-grained insight they need but can't get from traditional tools. It helps determine whether texts use words that align with grade-level expectations and support growth in academic language. This ensures students are consistently exposed to the kinds of vocabulary that build knowledge and enable them to fully engage with grade-level texts.\n",
+    "\n",
+    "**The Subject Matter Knowledge (SMK) Evaluator** assesses the background knowledge demands of informational text. It returns a structured output that identifies the core subjects and concepts present in the text, and determines the extent of background knowledge required to comprehend it."
    ]
   },
   {
@@ -74,7 +76,7 @@
     "import getpass\n",
     "import json\n",
     "import os\n",
-    "from typing import Any, List\n",
+    "from typing import Any, List, Literal\n",
     "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
@@ -123,7 +125,7 @@
    },
    "outputs": [],
    "source": [
-    "from prompts import vocab_prompts as v_prompts, sent_str_prompts as s_prompts\n",
+    "from prompts import vocab_prompts as v_prompts, sent_str_prompts as s_prompts, smk_prompts as smk_prompts\n",
     "\n",
     "# Set your api keys in your environment, .env file, or enter when prompted.\n",
     "# os.environ['GOOGLE_API_KEY'] = 'YOUR API KEY'\n",
@@ -146,7 +148,12 @@
     "# Define the model to be used for student background knowledge generation and sentence structure complexity\n",
     "COMBO_MODEL = \"gpt-4o-2024-11-20\"\n",
     "COMBO_TEMPERATURE = 0\n",
-    "model = ChatOpenAI(model=COMBO_MODEL, temperature=COMBO_TEMPERATURE)"
+    "model = ChatOpenAI(model=COMBO_MODEL, temperature=COMBO_TEMPERATURE)\n",
+    "\n",
+    "# Define the model to be used for SMK complexity\n",
+    "SMK_MODEL = \"gemini-3-flash-preview\"\n",
+    "SMK_TEMPERATURE = 0\n",
+    "smk_model = ChatGoogleGenerativeAI(model=SMK_MODEL, temperature=SMK_TEMPERATURE)"
    ]
   },
   {
@@ -225,6 +232,36 @@
     "        \"fk_level\",\n",
     "    ],\n",
     "    \"outputParser\": JsonOutputParser(pydantic_object=VocabOutput),\n",
+    "}\n",
+    "\n",
+    "\n",
+    "class SmkOutput(BaseModel):\n",
+    "    identified_topics: List[str] = Field(\n",
+    "        description=\"List of major subjects/concepts found in the text.\"\n",
+    "    )\n",
+    "    curriculum_check: str = Field(\n",
+    "        description=\"Analysis of whether these topics are new or review based on the Grade Level and Reference list.\"\n",
+    "    )\n",
+    "    assumptions_and_scaffolding: str = Field(\n",
+    "        description=\"Analysis of what the author assumes vs. what is explained (and if definitions are provided).\"\n",
+    "    )\n",
+    "    friction_analysis: str = Field(\n",
+    "        description=\"Explicit statement distinguishing if difficulty comes from Vocabulary/Structure or actual Knowledge.\"\n",
+    "    )\n",
+    "    complexity_score: Literal[\n",
+    "        \"slightly_complex\",\n",
+    "        \"moderately_complex\",\n",
+    "        \"very_complex\",\n",
+    "        \"exceedingly_complex\",\n",
+    "    ] = Field(description=\"The subject matter knowledge complexity level of the text\")\n",
+    "    reasoning: str = Field(\n",
+    "        description=\"A synthesis of why the text fits the chosen rubric level.\"\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "smk_prompt_vars = {\n",
+    "    \"inputVars\": [\"text\", \"grade\", \"fk_score\"],\n",
+    "    \"outputParser\": JsonOutputParser(pydantic_object=SmkOutput),\n",
     "}"
    ]
   },
@@ -949,6 +986,59 @@
     "    return processed_df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def smk_df(input_df: pd.DataFrame, concurrency: int = 5) -> pd.DataFrame:\n",
+    "    \"\"\"Run the SMK evaluator on each row (at most `concurrency` at once); failures land in `error`.\"\"\"\n",
+    "    sem = asyncio.Semaphore(concurrency)\n",
+    "    error_payload: dict = {key: None for key in SmkOutput.model_fields}\n",
+    "\n",
+    "    async def _smk_one_row(text: str, grade: int):\n",
+    "        async with sem:\n",
+    "            try:\n",
+    "                dataset = {\n",
+    "                    \"text\": text,\n",
+    "                    \"grade\": grade,\n",
+    "                    \"fk_score\": calculate_fk_score(text),\n",
+    "                }\n",
+    "                messages = [\n",
+    "                    # Template form (not a literal SystemMessage) so {format_instructions} is filled in.\n",
+    "                    (\"system\", smk_prompts.smk_system_prompt),\n",
+    "                    HumanMessagePromptTemplate.from_template(smk_prompts.smk_user_prompt),\n",
+    "                ]\n",
+    "                prompt = ChatPromptTemplate(\n",
+    "                    messages,\n",
+    "                    input_variables=smk_prompt_vars[\"inputVars\"],\n",
+    "                    partial_variables={\n",
+    "                        \"format_instructions\": smk_prompt_vars[\"outputParser\"].get_format_instructions()\n",
+    "                    },\n",
+    "                )\n",
+    "                chain = prompt | smk_model | smk_prompt_vars[\"outputParser\"]\n",
+    "                result = await chain.ainvoke(dataset)\n",
+    "                score = result.get(\"complexity_score\", \"\")\n",
+    "                result[\"complexity_score\"] = normalize_label(\n",
+    "                    score.replace(\"_\", \" \") if score else score\n",
+    "                )\n",
+    "                result[\"error\"] = None\n",
+    "                return result\n",
+    "            except Exception as e:\n",
+    "                return {**error_payload, \"error\": str(e)}\n",
+    "\n",
+    "    tasks = [\n",
+    "        _smk_one_row(text, grade)\n",
+    "        for text, grade in zip(input_df[\"text\"], input_df[\"grade\"])\n",
+    "    ]\n",
+    "    results = await asyncio.gather(*tasks)\n",
+    "    results_df = pd.DataFrame.from_records(results, index=input_df.index)\n",
+    "    results_df.rename(columns={\"complexity_score\": \"smk_score\", \"reasoning\": \"smk_reasoning\"}, inplace=True)\n",
+    "    processed_df = input_df.join(results_df, rsuffix=\"_smk\")\n",
+    "    return processed_df"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -960,7 +1050,8 @@
     "    # Convert to a dataframe for feature engineering and processing\n",
     "    input_df = pd.DataFrame.from_records([input])\n",
     "    vocab_df = await vocabulary_df(input_df)\n",
-    "    processed_df = await analyze_df(vocab_df)\n",
+    "    smk_result_df = await smk_df(vocab_df)\n",
+    "    processed_df = await analyze_df(smk_result_df)\n",
     "    final_features_df = add_engineered_features(processed_df)\n",
     "    predictions_df = await evaluate_df(final_features_df)\n",
     "    return predictions_df"
@@ -1015,8 +1106,10 @@
     "pretty = results.iloc[0][\n",
     "    [\n",
     "        \"vocabulary_score\",\n",
+    "        \"smk_score\",\n",
     "        \"complexity_answer\",\n",
     "        \"vocabulary_reasoning\",\n",
+    "        \"smk_reasoning\",\n",
     "        \"complexity_reasoning\",\n",
     "    ]\n",
     "].to_dict()\n",

From 5d9cbcd0bc597ef7b264adcd8748e263ee91c5a2 Mon Sep 17 00:00:00 2001
From: Adnan Rashid Hussain <ahussain@chanzuckerberg.com>
Date: Thu, 12 Mar 2026 11:51:58 -0700
Subject: [PATCH 3/3] Update reasoning and include formatting instructions

---
 evals/prompts/smk_prompts.py      |  2 +
 evals/smk_evaluator.ipynb         | 69 +++++++++++++++++++++++++++++--
 evals/text_complexity_combo.ipynb |  2 +-
 3 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/evals/prompts/smk_prompts.py b/evals/prompts/smk_prompts.py
index 4cfa3d9..6e1ad51 100644
--- a/evals/prompts/smk_prompts.py
+++ b/evals/prompts/smk_prompts.py
@@ -67,6 +67,8 @@
 - friction_analysis: String discussing the gap between Concrete description and Abstract meaning.
 - complexity_score: String (One of: slightly_complex, moderately_complex, very_complex, exceedingly_complex).
 - reasoning: String synthesizing the decision.
+
+{format_instructions}
 """
 
 smk_user_prompt = """Analyze:
diff --git a/evals/smk_evaluator.ipynb b/evals/smk_evaluator.ipynb
index 765708a..994da3a 100644
--- a/evals/smk_evaluator.ipynb
+++ b/evals/smk_evaluator.ipynb
@@ -13,7 +13,7 @@
     "* **curriculum_check**: Whether topics are standard K-8 content or high school–level specialised knowledge.\n",
     "* **assumptions_and_scaffolding**: What the author assumes the reader already knows vs. what is explained.\n",
     "* **friction_analysis**: Whether reading difficulty stems from vocabulary/structure or actual knowledge demands.\n",
-    "* **reasoning**: A synthesis of why the text fits the chosen rubric level.\n",
+    "* **reasoning**: A brief synthesis of why the text fits the chosen complexity level.\n",
     "\n",
     "This gives you a clear signal about the knowledge demands of a passage, helping ensure AI-generated content is appropriate for the target grade."
    ]
@@ -93,7 +93,36 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "class SmkOutput(BaseModel):\n    identified_topics: List[str] = Field(\n        description=\"List of major subjects/concepts found in the text.\"\n    )\n    curriculum_check: str = Field(\n        description=\"Analysis of whether these topics are new or review based on the Grade Level and Reference list.\"\n    )\n    assumptions_and_scaffolding: str = Field(\n        description=\"Analysis of what the author assumes vs. what is explained (and if definitions are provided).\"\n    )\n    friction_analysis: str = Field(\n        description=\"Explicit statement distinguishing if difficulty comes from Vocabulary/Structure or actual Knowledge.\"\n    )\n    complexity_score: Literal[\n        \"slightly_complex\",\n        \"moderately_complex\",\n        \"very_complex\",\n        \"exceedingly_complex\"\n    ] = Field(description=\"The subject matter knowledge complexity level of the text\")\n    reasoning: str = Field(\n        description=\"A synthesis of why the text fits the chosen rubric level.\"\n    )\n\n\nprompt_vars = {\n    \"inputVars\": [\"text\", \"grade\", \"fk_score\"],\n    \"outputParser\": JsonOutputParser(pydantic_object=SmkOutput),\n}"
+   "source": [
+    "class SmkOutput(BaseModel):\n",
+    "    identified_topics: List[str] = Field(\n",
+    "        description=\"List of major subjects/concepts found in the text.\"\n",
+    "    )\n",
+    "    curriculum_check: str = Field(\n",
+    "        description=\"Analysis of whether these topics are new or review based on the Grade Level and Reference list.\"\n",
+    "    )\n",
+    "    assumptions_and_scaffolding: str = Field(\n",
+    "        description=\"Analysis of what the author assumes vs. what is explained (and if definitions are provided).\"\n",
+    "    )\n",
+    "    friction_analysis: str = Field(\n",
+    "        description=\"Explicit statement distinguishing if difficulty comes from Vocabulary/Structure or actual Knowledge.\"\n",
+    "    )\n",
+    "    complexity_score: Literal[\n",
+    "        \"slightly_complex\",\n",
+    "        \"moderately_complex\",\n",
+    "        \"very_complex\",\n",
+    "        \"exceedingly_complex\"\n",
+    "    ] = Field(description=\"The subject matter knowledge complexity level of the text\")\n",
+    "    reasoning: str = Field(\n",
+    "        description=\"A brief synthesis of why the text fits the chosen complexity level.\"\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "prompt_vars = {\n",
+    "    \"inputVars\": [\"text\", \"grade\", \"fk_score\"],\n",
+    "    \"outputParser\": JsonOutputParser(pydantic_object=SmkOutput),\n",
+    "}"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -107,7 +136,39 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "def calculate_fk_score(text) -> float:\n    \"\"\"\n    Calculate the Flesch-Kincaid Grade Level\n    \"\"\"\n    fk_score = round(ts.flesch_kincaid_grade(text), 2)\n\n    return fk_score\n\n\ndef predict_text_complexity_level(text, grade):\n    dataset = {\n        \"text\": text,\n        \"grade\": grade,\n        \"fk_score\": calculate_fk_score(text),\n    }\n\n    messages = [\n        SystemMessage(content=prompts.smk_system_prompt),\n        HumanMessagePromptTemplate.from_template(prompts.smk_user_prompt),\n    ]\n\n    prompt = ChatPromptTemplate(\n        messages,\n        input_variables=prompt_vars[\"inputVars\"],\n        partial_variables={\n            \"format_instructions\": prompt_vars[\"outputParser\"].get_format_instructions()\n        },\n    )\n\n    chain = prompt | model | JsonOutputParser()\n    return chain.invoke(dataset)"
+   "source": [
+    "def calculate_fk_score(text) -> float:\n",
+    "    \"\"\"Return the Flesch-Kincaid Grade Level of `text`.\n",
+    "\n",
+    "    The textstat value is rounded to two decimal places.\n",
+    "    \"\"\"\n",
+    "    grade_level = ts.flesch_kincaid_grade(text)\n",
+    "    return round(grade_level, 2)\n",
+    "\n",
+    "\n",
+    "def predict_text_complexity_level(text, grade):\n",
+    "    \"\"\"Rate the SMK complexity of `text` for target `grade`; returns the parser's JSON output.\"\"\"\n",
+    "    parser = prompt_vars[\"outputParser\"]\n",
+    "    dataset = {\n",
+    "        \"text\": text,\n",
+    "        \"grade\": grade,\n",
+    "        \"fk_score\": calculate_fk_score(text),\n",
+    "    }\n",
+    "    messages = [\n",
+    "        # Template form (not a literal SystemMessage) so {format_instructions} is filled in.\n",
+    "        (\"system\", prompts.smk_system_prompt),\n",
+    "        HumanMessagePromptTemplate.from_template(prompts.smk_user_prompt),\n",
+    "    ]\n",
+    "    prompt = ChatPromptTemplate(\n",
+    "        messages,\n",
+    "        input_variables=prompt_vars[\"inputVars\"],\n",
+    "        partial_variables={\n",
+    "            \"format_instructions\": parser.get_format_instructions()\n",
+    "        },\n",
+    "    )\n",
+    "    chain = prompt | model | parser\n",
+    "    return chain.invoke(dataset)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -153,4 +214,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/evals/text_complexity_combo.ipynb b/evals/text_complexity_combo.ipynb
index 32e126c..c35ef0f 100644
--- a/evals/text_complexity_combo.ipynb
+++ b/evals/text_complexity_combo.ipynb
@@ -255,7 +255,7 @@
     "        \"exceedingly_complex\",\n",
     "    ] = Field(description=\"The subject matter knowledge complexity level of the text\")\n",
     "    reasoning: str = Field(\n",
-    "        description=\"A synthesis of why the text fits the chosen rubric level.\"\n",
+    "        description=\"A brief synthesis of why the text fits the chosen complexity level.\"\n",
     "    )\n",
     "\n",
     "\n",