From 1d326b1e368993377eba813d8907f06ae1d8f5e9 Mon Sep 17 00:00:00 2001
From: Bo Li <drluodian@gmail.com>
Date: Thu, 26 Mar 2026 17:04:14 +0100
Subject: [PATCH] feat: add physics reasoning benchmarks (PhysBench, ContPhy,
 PhysGame, PhysicsRW, PhysReason)

Add five physics reasoning benchmarks:
- PhysBench: multi-domain physics reasoning (ICLR 2025)
- ContPhy: continuum physics understanding from videos (ICML 2024)
- PhysGame: physics understanding from game environments
- PhysicsRW: real-world physics scenarios
- PhysReason: physics reasoning with mini split

Also adds shared MCQ answer extraction utility used by PhysBench.
---
 lmms_eval/tasks/_task_utils/mcq_extract.py    | 147 ++++
 lmms_eval/tasks/contphy/contphy.yaml          |  43 ++
 lmms_eval/tasks/contphy/generate_qa.py        | 658 ++++++++++++++++++
 lmms_eval/tasks/contphy/utils.py              | 265 +++++++
 lmms_eval/tasks/physbench/physbench.yaml      |  32 +
 lmms_eval/tasks/physbench/utils.py            | 192 +++++
 lmms_eval/tasks/physgame/physgame.yaml        |  32 +
 lmms_eval/tasks/physgame/utils.py             | 107 +++
 lmms_eval/tasks/physics_rw/physics_rw.yaml    |  30 +
 lmms_eval/tasks/physics_rw/utils.py           | 166 +++++
 lmms_eval/tasks/physreason/physreason.yaml    |  21 +
 .../physreason/physreason_data/.gitignore     |   2 +
 .../tasks/physreason/physreason_mini.yaml     |  21 +
 lmms_eval/tasks/physreason/utils.py           | 285 ++++++++
 14 files changed, 2001 insertions(+)
 create mode 100644 lmms_eval/tasks/_task_utils/mcq_extract.py
 create mode 100644 lmms_eval/tasks/contphy/contphy.yaml
 create mode 100644 lmms_eval/tasks/contphy/generate_qa.py
 create mode 100644 lmms_eval/tasks/contphy/utils.py
 create mode 100644 lmms_eval/tasks/physbench/physbench.yaml
 create mode 100644 lmms_eval/tasks/physbench/utils.py
 create mode 100644 lmms_eval/tasks/physgame/physgame.yaml
 create mode 100644 lmms_eval/tasks/physgame/utils.py
 create mode 100644 lmms_eval/tasks/physics_rw/physics_rw.yaml
 create mode 100644 lmms_eval/tasks/physics_rw/utils.py
 create mode 100644 lmms_eval/tasks/physreason/physreason.yaml
 create mode 100644 lmms_eval/tasks/physreason/physreason_data/.gitignore
 create mode 100644 lmms_eval/tasks/physreason/physreason_mini.yaml
 create mode 100644 lmms_eval/tasks/physreason/utils.py

diff --git a/lmms_eval/tasks/_task_utils/mcq_extract.py b/lmms_eval/tasks/_task_utils/mcq_extract.py
new file mode 100644
index 000000000..92bee43af
--- /dev/null
+++ b/lmms_eval/tasks/_task_utils/mcq_extract.py
@@ -0,0 +1,147 @@
+"""Robust multiple-choice answer extraction.
+
+Shared utility for benchmark tasks that need to extract a choice letter
+(A/B/C/D/...) from free-form model output.  Handles 10+ common answer
+formats and uses a priority ranking to pick the best candidate.
+
+Usage::
+
+    from lmms_eval.tasks._task_utils.mcq_extract import extract_mcq_answer
+
+    letter = extract_mcq_answer("The correct answer is (B).")  # -> "B"
+"""
+
+import re
+from typing import List, Optional
+
+_DEFAULT_CHOICES = ["A", "B", "C", "D", "E", "F", "G", "H"]  # used when the caller passes no ``choices``
+
+_ANSWER_PHRASES = [  # lead-ins located case-insensitively in the response; all lowercase
+    "the answer is",
+    "answer is",
+    "the correct answer is",
+    "correct answer is",
+    "the best answer is",
+    "best answer is",
+    "the correct option is",
+    "correct option is",
+    "the best option is",
+    "best option is",
+    "the choice is",
+    "choice is",
+    "the correct choice is",
+    "correct choice is",
+    "i choose",
+    "i select",
+    "i pick",
+    "my answer is",
+    "my choice is",
+]
+
+# Format name -> rank; higher = more confident that this is the intended answer.
+_FORMAT_PRIORITY = {
+    "start": 10,  # response starts with a standalone choice letter
+    "end": 9,  # response ends with a standalone choice letter
+    "phrase": 7,  # letter located after one of _ANSWER_PHRASES
+    "parentheses": 6,  # "(A)"
+    "period": 5,  # "A."
+    "colon": 4,  # "A:"
+    "right_paren": 3,  # "A)"
+    "space": 2,  # "A" followed by a space
+    "fallback": 0,  # any occurrence; only tried when nothing else matched
+}
+
+
+def extract_mcq_answer(response: str, choices: Optional[List[str]] = None) -> str:
+    """Extract a multiple-choice answer letter from model output.
+
+    Collects candidate letters in several common formats and returns the
+    one whose format ranks highest in ``_FORMAT_PRIORITY``; within that
+    format the occurrence closest to the end of the response wins, so a
+    final answer beats letters mentioned earlier while reasoning.
+
+    Letters are matched case-sensitively.  Outside the last-resort
+    fallback a letter only counts when it is not glued to a preceding
+    letter or digit, so the tail of a word such as ``"FDA:"`` is not read
+    as choice ``A``.  After an answer phrase only the first standalone
+    letter is taken: ``"the answer is B because A is wrong"`` gives ``"B"``.
+
+    Note that a letter opening the response outranks every other format,
+    so ``"A is wrong, the answer is B"`` still yields ``"A"``.
+
+    Args:
+        response: Model output (should already have ``<think>`` tags
+            stripped by the postprocessing pipeline).
+        choices: Valid choice letters.  Defaults to ``["A".."H"]``.
+
+    Returns:
+        The matched choice exactly as listed in ``choices``, or ``""`` if
+        none is found.
+    """
+    if not response or not response.strip():
+        return ""
+
+    all_choices = choices or _DEFAULT_CHOICES
+    letters = "|".join(re.escape(ch) for ch in all_choices)
+    # Zero-width guard: the letter must not continue a word or a number.
+    unglued = r"(?<![A-Za-z0-9])"
+
+    # Strip the whole punctuation set in one call; stripping one character
+    # at a time is order-dependent and turns 'B."' into 'B.' instead of 'B'.
+    text = response.strip().strip(",.!?;:'\"")
+    # Pad with spaces so a letter at the very end still has a trailing space.
+    text = " " + text + " "
+
+    candidates: list = []  # (letter, position, format_name)
+
+    # --- (A)  A.  A:  A)  and A followed by a space ---
+    suffix_formats = [
+        ("parentheses", r"\((" + letters + r")\)"),
+        ("period", unglued + "(" + letters + r")\."),
+        ("colon", unglued + "(" + letters + "):"),
+        ("right_paren", unglued + "(" + letters + r")\)"),
+        ("space", unglued + "(" + letters + ") "),
+    ]
+    for format_name, pattern in suffix_formats:
+        for match in re.finditer(pattern, text):
+            candidates.append((match.group(1), match.start(), format_name))
+
+    # --- Common answer phrases ("the answer is A", etc.) ---
+    # Every occurrence of a phrase contributes the first standalone letter
+    # that follows it.  Taking each choice's first occurrence anywhere after
+    # the phrase (as a plain substring) would let a letter mentioned later,
+    # or the capital at the start of a word, outrank the real answer.
+    standalone = re.compile(unglued + "(" + letters + ")(?![A-Za-z0-9])")
+    for phrase in _ANSWER_PHRASES:
+        # Match on ``text`` itself (not a lowercased copy) so offsets agree.
+        for hit in re.finditer(re.escape(phrase), text, re.IGNORECASE):
+            found = standalone.search(text, hit.end())
+            if found:
+                candidates.append((found.group(1), found.start(), "phrase"))
+
+    # --- Starts with standalone choice letter (not part of a word) ---
+    stripped = text.strip()
+    for ch in all_choices:
+        if stripped.startswith(ch) and (len(stripped) == 1 or not stripped[1].isalpha()):
+            candidates.append((ch, 0, "start"))
+
+    # --- Ends with standalone choice letter ---
+    for ch in all_choices:
+        if stripped.endswith(ch) and (len(stripped) == 1 or not stripped[-2].isalpha()):
+            candidates.append((ch, len(text) - 1, "end"))
+
+    # --- Fallback: any occurrence (lowest priority) ---
+    if not candidates:
+        for ch in all_choices:
+            if ch in text:
+                candidates.append((ch, text.rfind(ch), "fallback"))
+
+    if not candidates:
+        return ""
+
+    # Highest-priority format wins; within it, the later position wins.
+    best = max(
+        candidates,
+        key=lambda c: (_FORMAT_PRIORITY.get(c[2], 0), c[1]),
+    )
+    return best[0]
diff --git a/lmms_eval/tasks/contphy/contphy.yaml b/lmms_eval/tasks/contphy/contphy.yaml
new file mode 100644
index 000000000..f53ef0f92
--- /dev/null
+++ b/lmms_eval/tasks/contphy/contphy.yaml
@@ -0,0 +1,43 @@
+# ContPhy: Continuum Physical Concept Learning and Reasoning from Videos (ICML 2024)
+# Paper: https://arxiv.org/abs/2402.06119
+# Dataset: https://huggingface.co/datasets/zzcnewly/ContPhy_Dataset
+#
+# Setup:
+#   1. Generate the QA file from raw ContPhy zips:
+#      python -m lmms_eval.tasks.contphy.generate_qa \
+#        --mini --output $HF_HOME/contphy/contphy_qa.json
+#   2. Set CONTPHY_DATA_DIR to the extracted data root (for video files)
+#   3. Update data_files below to point to your generated QA JSON
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    test: contphy_qa.json
+  cache_dir: contphy
+  video: true
+task: contphy
+test_split: test
+output_type: generate_until
+process_docs: !function utils.contphy_process_docs
+doc_to_visual: !function utils.contphy_doc_to_visual
+doc_to_text: !function utils.contphy_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.contphy_process_results
+metric_list:
+  - metric: contphy_accuracy
+    aggregation: !function utils.contphy_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+  gpt4v:
+    pre_prompt: ""
+    post_prompt: "\nPlease answer with the correct choice letter (e.g. A, B, or C). Please do NOT add any other text in your response."
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/contphy/generate_qa.py b/lmms_eval/tasks/contphy/generate_qa.py
new file mode 100644
index 000000000..a2581b655
--- /dev/null
+++ b/lmms_eval/tasks/contphy/generate_qa.py
@@ -0,0 +1,658 @@
+"""Generate ContPhy QA JSON from raw dataset zips.
+
+Downloads (or reads from local cache) the ContPhy zip files from HuggingFace
+and produces a single JSON file (one array of multiple-choice QA dicts) suitable for
+lmms-eval ingestion.
+
+Usage:
+    python -m lmms_eval.tasks.contphy.generate_qa --output /path/to/contphy_qa.json
+
+Pass --data-dir to skip the download if the zips are already extracted (this script does not read CONTPHY_DATA_DIR).
+"""
+
+import argparse
+import json
+import os
+import random
+import zipfile
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Scenario -> zip name mapping
+# ---------------------------------------------------------------------------
+SCENARIO_ZIPS = {  # archive downloaded per scenario when the full dataset is requested
+    "fluid": "fluid_full.zip",
+    "rope": "rope_full.zip",
+    "cloth": "cloth_full.zip",
+    "ball": "ball_full.zip",
+}
+
+SCENARIO_DIRS = {  # per-scenario directory under the data root that holds the trial folders
+    "fluid": "fluid_slides",
+    "rope": "pulley_group",
+    "cloth": "cloth_collision",
+    "ball": "soft_body",
+}
+
+HF_BASE = "https://huggingface.co/datasets/zzcnewly/ContPhy_Dataset/resolve/main"  # zip URLs are f"{HF_BASE}/{zip_name}"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _color_name(name: str) -> str:
+    """Return the lowercased first word of a multi-word name, else the whole name lowercased."""
+    words = name.split()
+    # A single-word (or empty) name has no separate color token.
+    first = words[0] if len(words) >= 2 else name
+    return first.lower()
+
+
+def _compare_word(greater: bool) -> str:
+    return ("less", "greater")[bool(greater)]  # comparative word for question text
+
+
+# ---------------------------------------------------------------------------
+# Question generators per scenario
+# ---------------------------------------------------------------------------
+def generate_fluid_questions(data: dict, video_id: str) -> list[dict]:
+    """Generate MC questions for the fluid scenario.
+
+    Emits a density comparison per pair of named fluids, a stick-count
+    question, at most one predictive "which container" question and at
+    most one counterfactual "stick removed" question.
+
+    Args:
+        data: Parsed ``outputs.json`` of one trial.
+        video_id: Identifier copied into every question.
+
+    Returns:
+        Question dicts with the keys ``question``, ``options``, ``answer``,
+        ``question_type``, ``question_class``, ``scenario`` and ``video_id``.
+    """
+
+    def _question(text: str, options: list, answer: str, q_type: str, q_class: str) -> dict:
+        """Assemble one question record for this trial."""
+        return {
+            "question": text,
+            "options": options,
+            "answer": answer,
+            "question_type": q_type,
+            "question_class": q_class,
+            "scenario": "fluid",
+            "video_id": video_id,
+        }
+
+    def _receivers(stats: dict, fluid_name: str) -> set:
+        """Return the containers that end up holding some of ``fluid_name``."""
+        return {c for c, amounts in stats.items() if amounts.get(fluid_name, 0) > 0}
+
+    questions = []
+    meta = data.get("metaSamplingData", {})
+    sticks = meta.get("sticks", [])
+    receptor_stats = data.get("trackingData", {}).get("perReceptorFluidStat", {})
+    cf_annotations = data.get("CounterFactualAnnotations", {})
+
+    # Filter to fluids without 'Later Emitted' prefix for cleaner questions
+    named_fluids = [f for f in meta.get("fluids", []) if not f["name"].startswith("Later")]
+
+    # --- Property: Density comparison ---
+    for i, f1 in enumerate(named_fluids):
+        for f2 in named_fluids[i + 1 :]:
+            q = f"Is the density of the {f1['name'].lower()} greater than that of the {f2['name'].lower()}?"
+            answer = "Yes" if f1["density"] > f2["density"] else "No"
+            questions.append(_question(q, ["Yes", "No", "Cannot Judge"], answer, "property", "density"))
+
+    # --- Property: Stick count ---
+    if sticks:
+        n = len(sticks)
+        # Sort the candidate counts as integers; sorting their string forms
+        # would put "10" before "9".
+        options = [str(k) for k in sorted({n, max(1, n - 1), n + 1, n + 2})]
+        questions.append(_question("How many sticks are there in the video?", options, str(n), "property", "stick_number"))
+
+    # --- Predictive: Which container will fluid flow into ---
+    # One question per video: the first (container, fluid) pair with a
+    # positive amount for which at least one distractor exists.
+    predictive = None
+    for container_name, container_fluids in receptor_stats.items():
+        for fluid_name, amount in container_fluids.items():
+            if fluid_name.startswith("Later") or not amount > 0:
+                continue
+            # Only containers the fluid does NOT reach may be distractors;
+            # otherwise a listed option could be as correct as the answer.
+            reached = _receivers(receptor_stats, fluid_name)
+            distractors = [c for c in receptor_stats if c not in reached]
+            if distractors:
+                predictive = (container_name, fluid_name, distractors)
+                break
+        if predictive:
+            break
+    if predictive:
+        container_name, fluid_name, distractors = predictive
+        options = [container_name] + distractors[:2]
+        if len(options) < 3:
+            options.append("None of the above")
+        # Shuffled with the module-level ``random`` state.
+        random.shuffle(options)
+        q = f"Which container will {fluid_name.lower()} flow into?"
+        questions.append(_question(q, options, container_name, "predictive", "container"))
+
+    # --- Counterfactual: If stick removed, where would fluid go ---
+    for cf_data in list(cf_annotations.values())[:1]:
+        removed = cf_data.get("removedStickName", "")
+        if not removed:
+            continue
+        cf_receptors = cf_data.get("trackingData", {}).get("perReceptorFluidStat", {})
+        # Find a fluid that changed destination
+        for fluid in named_fluids:
+            fluid_name = fluid["name"]
+            cf_containers = _receivers(cf_receptors, fluid_name)
+            # Sorted, because set order of strings changes between runs and
+            # would make the output irreproducible despite the fixed seed.
+            new_containers = sorted(cf_containers - _receivers(receptor_stats, fluid_name))
+            if not new_containers:
+                continue
+            answer = new_containers[0]
+            all_containers = sorted(set(receptor_stats) | set(cf_receptors))
+            # Distractors are containers the fluid does not reach once the
+            # stick is gone, so exactly one option is correct.
+            options = [answer] + [c for c in all_containers if c not in cf_containers][:2]
+            if len(options) < 3:
+                options.append("None of the above")
+            random.shuffle(options)
+            q = f"If the {removed.lower()} were removed, which container would {fluid_name.lower()} flow into?"
+            questions.append(_question(q, options, answer, "counterfactual", "container"))
+            break
+
+    return questions
+
+
+def generate_rope_questions(data: dict, video_id: str) -> list[dict]:
+    """Build the multiple-choice questions for one rope/pulley trial.
+
+    Produces, in this order:
+
+    - a "greater than half" mass comparison for every pair of objects
+      in ``outputMass``;
+    - one tension comparison between the first two ropes;
+    - at most one counterfactual rotation question;
+    - one counting question per object type found in ``name2position``.
+
+    Args:
+        data: Parsed ``outputs.json`` of one trial.
+        video_id: Identifier copied into every question.
+
+    Returns:
+        The list of question dicts.
+    """
+
+    def _record(text: str, choices: list, correct: str, q_type: str, q_class: str) -> dict:
+        """Assemble one question record for this trial."""
+        return {
+            "question": text,
+            "options": choices,
+            "answer": correct,
+            "question_type": q_type,
+            "question_class": q_class,
+            "scenario": "rope",
+            "video_id": video_id,
+        }
+
+    out = []
+    mass_by_object = data.get("outputMass", {})
+    base_rotation = data.get("ResultRotation", {})
+    avg_tension = data.get("ResultTensionAvg", {})
+    sampling_meta = data.get("metaSamplingData", {})
+    counterfactuals = data.get("CounterFactualAnnotations", {})
+
+    weighted = list(mass_by_object.keys())
+
+    # --- Property: Mass comparison (every unordered pair, in key order) ---
+    for pos, first in enumerate(weighted):
+        for second in weighted[pos + 1 :]:
+            verdict = "Yes" if mass_by_object[first] > mass_by_object[second] * 0.5 else "No"
+            text = f"Is the mass of the {first.lower()} greater than half that of the {second.lower()}?"
+            out.append(
+                _record(text, ["Yes", "No", "Cannot Judge"], verdict, "property", "mass")
+            )
+
+    # --- Property: Tension comparison (first two ropes only) ---
+    ropes = list(avg_tension.keys())
+    if len(ropes) >= 2:
+        rope_a, rope_b = ropes[:2]
+        # Magnitudes are compared, so the sign of the stored tension is ignored.
+        stronger = abs(avg_tension[rope_a]) > abs(avg_tension[rope_b])
+        text = f"Is the tension of the {rope_a.lower()} greater than that of the {rope_b.lower()}?"
+        out.append(
+            _record(text, ["Yes", "No", "Cannot Judge"], "Yes" if stronger else "No", "property", "tension")
+        )
+
+    # --- Counterfactual: If object were heavier, which direction ---
+    for cf_entry in list(counterfactuals.values())[:1]:
+        cf_rotation = cf_entry.get("ResultRotation", {})
+        # Use the first pulley whose rotation value differs and is non-zero.
+        for pulley, before in base_rotation.items():
+            after = cf_rotation.get(pulley, before)
+            if after == before or after == 0:
+                continue
+            # A positive value is reported as clockwise.
+            turn = "clockwise" if after > 0 else "anti-clockwise"
+            heavier = cf_entry.get("changedMassObjectName", "")
+            # Fall back to the first weighted object when none is named.
+            if not heavier and weighted:
+                heavier = weighted[0]
+            text = (
+                f"If the {heavier.lower()} were far much heavier, "
+                f"which direction would the {pulley.lower()} rotate?"
+            )
+            out.append(
+                _record(text, ["Clockwise", "Anti-clockwise", "No rotation"], turn.capitalize(), "counterfactual", "rotation")
+            )
+            break
+
+    # --- Property: Object counting (shapes/colors) ---
+    positions = sampling_meta.get("name2position", {})
+    if positions:
+        tallies = {}
+        for label in positions:
+            # A name is counted once for each type word it contains.
+            for kind in ("Cube", "Sphere", "Pulley"):
+                if kind in label:
+                    key = kind.lower()
+                    tallies[key] = tallies.get(key, 0) + 1
+        for kind, total in tallies.items():
+            noun = kind if total == 1 else kind + "s"
+            # Options are the de-duplicated counts in string sort order.
+            choices = sorted(
+                {str(total), str(max(1, total - 1)), str(total + 1), str(total + 2)}
+            )
+            out.append(
+                _record(f"How many {noun} are there in the video?", choices, str(total), "property", "shape")
+            )
+
+    return out
+
+
+def generate_cloth_questions(data: dict, video_id: str) -> list[dict]:
+    """Build the multiple-choice questions for one cloth trial.
+
+    Always emits a stretch and a bend comparison between the two cloths
+    (a missing compliance value counts as 0).  Adds one "fall over"
+    question for the first object of the first side that has isolated
+    objects, and one final-pose question for the first isolated object
+    that carries a non-empty ``endPoseDescription``.
+
+    Args:
+        data: Parsed ``outputs.json`` of one trial.
+        video_id: Identifier copied into every question.
+
+    Returns:
+        The list of question dicts.
+    """
+
+    def _record(text: str, choices: list, correct: str, q_type: str, q_class: str) -> dict:
+        """Assemble one question record for this trial."""
+        return {
+            "question": text,
+            "options": choices,
+            "answer": correct,
+            "question_type": q_type,
+            "question_class": q_class,
+            "scenario": "cloth",
+            "video_id": video_id,
+        }
+
+    def _isolated(side_key: str) -> dict:
+        """Return the isolated-object annotations of one side (may be empty)."""
+        return annotation.get(side_key, {}).get("isolatedObjects", {})
+
+    out = []
+    left = data.get("clothLeft", {})
+    right = data.get("clothRight", {})
+    annotation = data.get("objectFullAnnotation", {})
+    sides = ("leftAll", "rightAll")  # the left side is always examined first
+
+    # --- Property: Elasticity (stretching compliance) ---
+    # Higher compliance = easier to stretch = more elastic
+    left_stretchier = left.get("stretchingCompliance", 0) > right.get("stretchingCompliance", 0)
+    out.append(
+        _record(
+            "Is the left cloth much easier to stretch than the other?",
+            ["Yes", "No"],
+            "Yes" if left_stretchier else "No",
+            "property",
+            "elasticity",
+        )
+    )
+
+    # --- Property: Bending ---
+    # Lower compliance = harder to bend
+    left_stiffer = left.get("bendingCompliance", 0) < right.get("bendingCompliance", 0)
+    out.append(
+        _record(
+            "Is the left cloth much harder to bend or have wrinkles than the other?",
+            ["Yes", "No"],
+            "Yes" if left_stiffer else "No",
+            "property",
+            "bending",
+        )
+    )
+
+    # --- Predictive: Object fall over ---
+    # One question in total, about the first object of the first side
+    # that has any isolated objects.
+    for side_key in sides:
+        first_object = next(iter(_isolated(side_key).items()), None)
+        if first_object is None:
+            continue
+        name, info = first_object
+        pose = info.get("endPoseDescription", "")
+        # Any non-empty description other than "Upright" counts as fallen.
+        fell = bool(pose) and pose != "Upright"
+        out.append(
+            _record(f"Does the {name.lower()} fall over?", ["Yes", "No"], "Yes" if fell else "No", "predictive", "fall_over")
+        )
+        break
+
+    # --- Predictive: Final pose ---
+    # Lazily walk every isolated object, left side first, and stop at the
+    # first one with a non-empty pose description.
+    described = (
+        (name, info.get("endPoseDescription", ""))
+        for side_key in sides
+        for name, info in _isolated(side_key).items()
+    )
+    posed = next(((name, pose) for name, pose in described if pose), None)
+    if posed is not None:
+        name, pose = posed
+        # Map the free-text description onto one of the three options.
+        if pose == "Upright":
+            final_pose = "Standing upright"
+        elif "lean" in pose.lower() or "tilt" in pose.lower():
+            final_pose = "Leaning"
+        else:
+            final_pose = "Lying horizontally"
+        text = f"Which phrase below can best describe the final pose of the {name.lower()}?"
+        out.append(
+            _record(text, ["Standing upright", "Leaning", "Lying horizontally"], final_pose, "predictive", "pose")
+        )
+
+    return out
+
+
+def generate_ball_questions(data: dict, video_id: str) -> list[dict]:
+    """Generate MC questions for the ball scenario.
+
+    Emits an elasticity comparison for every pair of balls whose
+    ``elasticityType`` ranks differ, one plasticity comparison for the
+    first such pair, one final-drop question for the first tracked ball
+    with a recognised pit result, and at most one counterfactual
+    "remove the wall" question.
+
+    The counterfactual question is skipped when the annotation holds no
+    ``pitResult`` for the ball, so a missing outcome is never labelled.
+
+    Args:
+        data: Parsed ``outputs.json`` of one trial.
+        video_id: Identifier copied into every question.
+
+    Returns:
+        The list of question dicts.
+    """
+    # Rank tables are loop-invariant, so they are built once.  Unknown
+    # types rank 0 in both.
+    # Elastic > Plastic > Rigid in terms of elasticity
+    elasticity_rank = {"Elastic": 3, "Plastic": 2, "Rigid": 1}
+    # Plastic has highest plasticity
+    plasticity_rank = {"Plastic": 3, "Rigid": 2, "Elastic": 1}
+
+    def _question(text: str, options: list, answer: str, q_type: str, q_class: str) -> dict:
+        """Assemble one question record for this trial."""
+        return {
+            "question": text,
+            "options": options,
+            "answer": answer,
+            "question_type": q_type,
+            "question_class": q_class,
+            "scenario": "ball",
+            "video_id": video_id,
+        }
+
+    def _is_greater(rank: dict, b1: dict, b2: dict):
+        """Return "Yes"/"No" for ``b1`` vs ``b2``, or None when they rank equal."""
+        r1 = rank.get(b1.get("elasticityType", ""), 0)
+        r2 = rank.get(b2.get("elasticityType", ""), 0)
+        if r1 == r2:
+            return None
+        return "Yes" if r1 > r2 else "No"
+
+    def _pit_answer(pit_result: str, default=None):
+        """Map a ``pitResult`` string to its answer option."""
+        if "Left" in pit_result:
+            return "The left pit"
+        if "Right" in pit_result:
+            return "The right pit"
+        if "No Pit" in pit_result:
+            return "None of the above"
+        return default
+
+    questions = []
+    tracking = data.get("trackingData", {})
+    balls = data.get("metaSamplingData", {}).get("balls", [])
+    cf_annotations = data.get("CounterFactualAnnotations", {})
+    pairs = [(b1, b2) for i, b1 in enumerate(balls) for b2 in balls[i + 1 :]]
+
+    # --- Property: Elasticity comparison (every differing pair) ---
+    for b1, b2 in pairs:
+        answer = _is_greater(elasticity_rank, b1, b2)
+        if answer is None:
+            continue
+        q = (
+            f"Is the elasticity (deformability) of the {b1['name'].lower()} "
+            f"much greater than the {b2['name'].lower()}?"
+        )
+        questions.append(
+            _question(q, ["Yes", "No"], answer, "property", "elasticity")
+        )
+
+    # --- Property: Plasticity comparison (first differing pair only) ---
+    for b1, b2 in pairs:
+        answer = _is_greater(plasticity_rank, b1, b2)
+        if answer is None:
+            continue
+        q = (
+            f"Is the plasticity of the {b1['name'].lower()} "
+            f"much greater than the {b2['name'].lower()}?"
+        )
+        questions.append(
+            _question(q, ["Yes", "No"], answer, "property", "plasticity")
+        )
+        break
+
+    # --- Predictive: Final drop (which pit) ---
+    for ball_name, ball_tracking in tracking.items():
+        answer = _pit_answer(ball_tracking.get("pitResult", ""))
+        if answer is None:
+            # Unrecognised result: try the next ball.
+            continue
+        q = f"Will the {ball_name.lower()} finally drop into the left pit or the right pit?"
+        options = ["The left pit", "The right pit", "None of the above"]
+        questions.append(
+            _question(q, options, answer, "predictive", "final_drop")
+        )
+        break  # One per video
+
+    # --- Counterfactual: Remove wall, which pit ---
+    for cf_data in list(cf_annotations.values())[:1]:
+        removed_wall = cf_data.get("removedWall", "")
+        ball_name = cf_data.get("ball", "")
+        if not removed_wall or not ball_name:
+            continue
+        pit_result = cf_data.get("trackingData", {}).get(ball_name, {}).get("pitResult", "")
+        if not pit_result:
+            # No outcome was recorded for this ball in the counterfactual
+            # run.  Emitting "None of the above" here would invent a label.
+            continue
+        # A recorded result naming neither side maps to "None of the above".
+        answer = _pit_answer(pit_result, default="None of the above")
+        q = (
+            f"If we removed the {removed_wall.lower()} and other balls, "
+            f"which pit would the {ball_name.lower()} drop into?"
+        )
+        options = ["The left pit", "The right pit", "None of the above"]
+        questions.append(
+            _question(q, options, answer, "counterfactual", "remove")
+        )
+
+    return questions
+
+
+# ---------------------------------------------------------------------------
+# Main generation pipeline
+# ---------------------------------------------------------------------------
+GENERATORS = {  # scenario name -> question generator, looked up by process_scenario
+    "fluid": generate_fluid_questions,
+    "rope": generate_rope_questions,
+    "cloth": generate_cloth_questions,
+    "ball": generate_ball_questions,
+}
+
+
+def process_scenario(scenario: str, data_dir: str) -> list[dict]:
+    """Process all trials in a scenario directory.
+
+    A trial contributes only if its directory holds an ``outputs.json``
+    with a truthy ``validity`` flag and an ``output_Full.mp4`` video.
+
+    Args:
+        scenario: Key of ``SCENARIO_DIRS`` / ``GENERATORS``.
+        data_dir: Root directory of the extracted ContPhy data.
+
+    Returns:
+        Questions of all usable trials in trial order; empty when the
+        scenario directory does not exist.
+    """
+    scenario_dir_name = SCENARIO_DIRS[scenario]
+    scenario_path = Path(data_dir) / scenario_dir_name
+    if not scenario_path.exists():
+        print(f"  Skipping {scenario}: {scenario_path} not found")
+        return []
+
+    generator = GENERATORS[scenario]
+    all_questions = []
+    # Numeric order; the name breaks ties among non-numeric directories (all keyed 0).
+    trial_dirs = sorted(
+        (d for d in scenario_path.iterdir() if d.is_dir()),
+        key=lambda d: (int(d.name) if d.name.isdigit() else 0, d.name),
+    )
+    for trial_dir in trial_dirs:
+        outputs_path = trial_dir / "outputs.json"
+        if not outputs_path.exists() or not (trial_dir / "output_Full.mp4").exists():
+            continue
+        # JSON is UTF-8 by specification; do not rely on the locale default.
+        with open(outputs_path, encoding="utf-8") as f:
+            data = json.load(f)
+        if data.get("validity", False):
+            all_questions.extend(generator(data, f"{scenario_dir_name}/{trial_dir.name}"))
+    return all_questions
+
+
+def download_and_extract(cache_dir: str, use_mini: bool = False) -> str:
+    """Download ContPhy zips from HuggingFace and extract them.
+
+    Args:
+        cache_dir: Directory for the zips and the ``contphy_data`` tree.
+        use_mini: Fetch ``contphy_mini.zip`` instead of the per-scenario zips.
+
+    Returns:
+        Path of the ``contphy_data`` directory inside ``cache_dir``.
+    """
+    import urllib.request
+
+    def _fetch(zip_name: str, indent: str) -> str:
+        """Return the local path of ``zip_name``, downloading it if absent."""
+        zip_path = os.path.join(cache_dir, zip_name)
+        if not os.path.exists(zip_path):
+            url = f"{HF_BASE}/{zip_name}"
+            print(f"{indent}Downloading {url} ...")
+            # Download under a temporary name and rename on success, so an
+            # interrupted transfer is never mistaken for a cached zip.
+            urllib.request.urlretrieve(url, zip_path + ".part")
+            os.replace(zip_path + ".part", zip_path)
+        return zip_path
+
+    def _unpack(zip_path: str, indent: str) -> None:
+        """Extract ``zip_path`` into the shared data directory."""
+        print(f"{indent}Extracting {zip_path} ...")
+        with zipfile.ZipFile(zip_path) as zf:
+            zf.extractall(extract_dir)
+
+    os.makedirs(cache_dir, exist_ok=True)
+    extract_dir = os.path.join(cache_dir, "contphy_data")
+    if use_mini:
+        zip_path = _fetch("contphy_mini.zip", "")
+        if not os.path.exists(extract_dir):
+            os.makedirs(extract_dir, exist_ok=True)
+            _unpack(zip_path, "")
+        return extract_dir
+    os.makedirs(extract_dir, exist_ok=True)
+    for scenario, zip_name in SCENARIO_ZIPS.items():
+        if os.path.exists(os.path.join(extract_dir, SCENARIO_DIRS[scenario])):
+            print(f"  {scenario}: already extracted")
+            continue
+        _unpack(_fetch(zip_name, "  "), "  ")
+    return extract_dir
+
+
+def main():
+    """Command-line entry point: generate all questions and write them as JSON.
+
+    Uses ``--data-dir`` when given, otherwise downloads into ``--cache-dir``
+    (default ``$HF_HOME/contphy``); ``random`` is seeded with ``--seed`` first.
+    """
+    cli = argparse.ArgumentParser(description="Generate ContPhy QA JSON")
+    cli.add_argument("--data-dir", type=str, default="", help="Path to extracted ContPhy data. If empty, downloads from HF.")
+    cli.add_argument("--output", type=str, required=True, help="Output JSON file path")
+    cli.add_argument("--mini", action="store_true", help="Use mini dataset (20 videos per scenario) instead of full")
+    cli.add_argument("--cache-dir", type=str, default="", help="Cache directory for downloads")
+    cli.add_argument("--seed", type=int, default=42, help="Random seed for option shuffling")
+    opts = cli.parse_args()
+
+    random.seed(opts.seed)
+
+    source_dir = opts.data_dir
+    if not source_dir:
+        hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
+        download_root = opts.cache_dir or os.path.join(hf_home, "contphy")
+        source_dir = download_and_extract(download_root, use_mini=opts.mini)
+
+    collected = []
+    for scenario in ("fluid", "rope", "cloth", "ball"):
+        print(f"Processing {scenario}...")
+        batch = process_scenario(scenario, source_dir)
+        print(f"  Generated {len(batch)} questions")
+        collected += batch
+
+    # Number the questions in generation order.
+    for position, item in enumerate(collected):
+        item["idx"] = position
+
+    # Create the destination directory if needed, then write one JSON array.
+    target_dir = os.path.dirname(os.path.abspath(opts.output))
+    os.makedirs(target_dir, exist_ok=True)
+    with open(opts.output, "w") as handle:
+        json.dump(collected, handle, indent=2)
+    print(f"\nTotal: {len(collected)} questions written to {opts.output}")
+    # Summary counts per scenario and per question type.
+    per_scenario = {}
+    per_type = {}
+    for item in collected:
+        per_scenario[item["scenario"]] = per_scenario.get(item["scenario"], 0) + 1
+        per_type[item["question_type"]] = per_type.get(item["question_type"], 0) + 1
+    print("\nBy scenario:", per_scenario)
+    print("By type:", per_type)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lmms_eval/tasks/contphy/utils.py b/lmms_eval/tasks/contphy/utils.py
new file mode 100644
index 000000000..9b29173ae
--- /dev/null
+++ b/lmms_eval/tasks/contphy/utils.py
@@ -0,0 +1,265 @@
+"""ContPhy benchmark utilities for lmms-eval.
+
+ContPhy: Continuum Physical Concept Learning and Reasoning from Videos (ICML 2024)
+https://arxiv.org/abs/2402.06119
+
+Handles multiple-choice QA over physics simulation videos covering four
+scenarios: fluid dynamics, rope/pulley systems, cloth manipulation, and
+soft-body ball interactions.
+"""
+
+import os
+import re
+from collections import defaultdict
+
+import datasets
+from loguru import logger as eval_logger
+
+# ---------------------------------------------------------------------------
+# Scenario categories for per-scenario reporting
+# ---------------------------------------------------------------------------
+SCENARIOS = ["fluid", "rope", "cloth", "ball"]
+
+QUESTION_TYPES = ["property", "predictive", "counterfactual", "goal_driven"]
+
+# Option letters
+OPTION_LETTERS = "ABCDEFGHIJ"
+
+
+# ---------------------------------------------------------------------------
+# Data directory resolution
+# ---------------------------------------------------------------------------
+def _get_data_dir() -> str:
+    """Return the directory that holds the extracted ContPhy videos.
+
+    A non-blank ``CONTPHY_DATA_DIR`` wins; otherwise the path is
+    ``<HF_HOME>/contphy/contphy_data`` with ``HF_HOME`` defaulting to
+    ``~/.cache/huggingface``.  A leading ``~`` is expanded in both cases.
+    """
+    override = os.environ.get("CONTPHY_DATA_DIR", "").strip()
+    if not override:
+        hub_root = os.getenv("HF_HOME", "~/.cache/huggingface")
+        return os.path.join(os.path.expanduser(hub_root), "contphy", "contphy_data")
+
+    return os.path.expanduser(override)
+
+
+# ---------------------------------------------------------------------------
+# process_docs: format options with letter prefixes
+# ---------------------------------------------------------------------------
+def contphy_process_docs(dataset):
+    """Return a new Dataset whose docs carry lettered options and the answer as a letter."""
+    processed = []
+    for doc in dataset:
+        doc = dict(doc)  # work on a shallow copy of the row
+        options = doc.get("options", [])
+        answer_text = doc.get("answer", "")
+
+        # Build lettered options: "A. Yes", "B. No", ... (indexing raises IndexError past the last letter)
+        lettered = []
+        answer_letter = ""  # stays empty when no option equals the answer text
+        for i, opt in enumerate(options):
+            letter = OPTION_LETTERS[i]
+            lettered.append(f"{letter}. {opt}")
+            if opt == answer_text:
+                answer_letter = letter  # a later duplicate option overrides an earlier match
+
+        doc["options_str"] = "\n".join(lettered)
+        doc["answer_letter"] = answer_letter
+        doc["answer"] = answer_letter  # override for doc_to_target
+        processed.append(doc)
+    return datasets.Dataset.from_list(processed)
+
+
+# ---------------------------------------------------------------------------
+# doc_to_visual: locate the video file
+# ---------------------------------------------------------------------------
+def contphy_doc_to_visual(doc):
+    """Locate the video for ``doc`` and return it as a one-element list.
+
+    Looks for ``<data_dir>/<video_id>/output_Full.mp4`` first and then for
+    ``<data_dir>/<video_id>.mp4``.  Returns ``[]`` (after logging a warning)
+    when the doc has no ``video_id`` or neither file exists.
+    """
+    vid = doc.get("video_id", "")
+    if not vid:
+        eval_logger.warning("ContPhy: no video_id in doc")
+        return []
+
+    root = _get_data_dir()
+    nested = os.path.join(root, vid, "output_Full.mp4")
+    flat = os.path.join(root, f"{vid}.mp4")
+    # The nested layout is preferred; the flat layout is only a fallback.
+    for candidate in (nested, flat):
+        if os.path.exists(candidate):
+            return [candidate]
+
+    eval_logger.warning(
+        "ContPhy: video not found for {} (tried {})", vid, nested
+    )
+    return []
+
+
+# ---------------------------------------------------------------------------
+# doc_to_text: build the MC prompt
+# ---------------------------------------------------------------------------
+def contphy_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the multiple-choice prompt for one doc.
+
+    Layout: ``pre_prompt`` + question + newline + ``options_str`` + ``post_prompt``;
+    missing doc fields and missing prompt kwargs default to empty strings.
+    """
+    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
+    head = f'{prompt_cfg.get("pre_prompt", "")}{doc.get("question", "")}'
+    tail = f'{doc.get("options_str", "")}{prompt_cfg.get("post_prompt", "")}'
+    return f"{head}\n{tail}"
+
+
+# ---------------------------------------------------------------------------
+# Answer extraction
+# ---------------------------------------------------------------------------
+def _extract_letter(text: str, num_options: int = 5) -> str:
+    """Extract a single option letter from model response ("" if none is found or there are no options)."""
+    text = text.strip()
+    valid_letters = OPTION_LETTERS[:num_options]
+    if not text or not valid_letters:
+        return ""  # with no letters the "[...]" classes below would be malformed
+
+    # Common answer prefixes to strip
+    answer_prefixes = [
+        "The best answer is",
+        "The correct answer is",
+        "The answer is",
+        "The answer",
+        "Best answer:",
+        "Answer:",
+    ]
+    text_lower = text.lower()
+    for prefix in answer_prefixes:
+        if text_lower.startswith(prefix.lower()):
+            text = text[len(prefix) :].strip(" :.-")
+            break
+
+    # Try to find a letter at the very start
+    if text and text[0].upper() in valid_letters:
+        # Make sure it's not part of a word (e.g. "A" vs "And")
+        if len(text) == 1 or not text[1].isalpha():
+            return text[0].upper()
+
+    # Standalone letter: try the original casing first so the article "a" is not read as option A.
+    pattern = rf"\b([{valid_letters}])(?:\b|[.\)\]:,])"
+    for candidate in (text, text.upper()):
+        match = re.search(pattern, candidate)
+        if match:
+            return match.group(1)
+
+    # Fallback: first upper-case option letter anywhere.  The original casing is kept
+    # so lower-case letters inside ordinary words ("yes" -> "E") are not picked up.
+    fallback = re.search(rf"[{valid_letters}]", text)
+    return fallback.group(0) if fallback else ""
+
+
+# ---------------------------------------------------------------------------
+# process_results
+# ---------------------------------------------------------------------------
+def contphy_process_results(doc, results):
+    """Package the extracted prediction letter and ground truth for aggregation.
+
+    No scoring happens here; the comparison happens in contphy_aggregate_results.
+    """
+    raw_response = results[0] if results else ""
+    record = {
+        "video_id": doc.get("video_id", ""),
+        "scenario": doc.get("scenario", "unknown"),
+        "question_type": doc.get("question_type", "unknown"),
+        "question_class": doc.get("question_class", "unknown"),
+    }
+    # The letter search is limited to as many letters as the doc has options.
+    record["pred_answer"] = _extract_letter(raw_response, len(doc.get("options", [])))
+    # Prefer the precomputed answer_letter; fall back to the raw answer field.
+    record["answer"] = doc.get("answer_letter", doc.get("answer", ""))
+
+    return {"contphy_accuracy": record}
+
+
+# ---------------------------------------------------------------------------
+# aggregate_results
+# ---------------------------------------------------------------------------
+def contphy_aggregate_results(results):
+    """Compute per-scenario, per-type, and overall accuracy (%); a doc with an empty answer never counts as correct."""
+    # Per-scenario stats
+    scenario_stats = defaultdict(lambda: {"correct": 0, "total": 0})
+    # Per-question-type stats
+    type_stats = defaultdict(lambda: {"correct": 0, "total": 0})
+    # Per scenario+type combo
+    combo_stats = defaultdict(lambda: {"correct": 0, "total": 0})
+
+    total_correct = 0
+    total = 0
+
+    for result in results:
+        scenario = result.get("scenario", "unknown")
+        q_type = result.get("question_type", "unknown")
+        correct = bool(result["answer"]) and result["pred_answer"] == result["answer"]
+
+        scenario_stats[scenario]["total"] += 1
+        type_stats[q_type]["total"] += 1
+        combo_stats[f"{scenario}/{q_type}"]["total"] += 1
+        total += 1
+
+        if correct:
+            scenario_stats[scenario]["correct"] += 1
+            type_stats[q_type]["correct"] += 1
+            combo_stats[f"{scenario}/{q_type}"]["correct"] += 1
+            total_correct += 1
+
+    # Log per-scenario results
+    for scenario in SCENARIOS:
+        stats = scenario_stats.get(scenario, {"correct": 0, "total": 0})
+        if stats["total"] > 0:
+            acc = 100 * stats["correct"] / stats["total"]
+            eval_logger.info(
+                "ContPhy [{}]: {:.1f}% ({}/{})",
+                scenario,
+                acc,
+                stats["correct"],
+                stats["total"],
+            )
+
+    # Log per-type results
+    for q_type in sorted(type_stats):
+        stats = type_stats[q_type]
+        if stats["total"] > 0:
+            acc = 100 * stats["correct"] / stats["total"]
+            eval_logger.info(
+                "ContPhy [{}]: {:.1f}% ({}/{})",
+                q_type,
+                acc,
+                stats["correct"],
+                stats["total"],
+            )
+
+    # Log combo results
+    for combo in sorted(combo_stats):
+        stats = combo_stats[combo]
+        if stats["total"] > 0:
+            acc = 100 * stats["correct"] / stats["total"]
+            eval_logger.info(
+                "ContPhy [{}]: {:.1f}% ({}/{})",
+                combo,
+                acc,
+                stats["correct"],
+                stats["total"],
+            )
+
+    if total == 0:
+        return 0.0
+
+    overall = 100 * total_correct / total
+    eval_logger.info(
+        "ContPhy overall: {:.1f}% ({}/{})",
+        overall,
+        total_correct,
+        total,
+    )
+    return overall
diff --git a/lmms_eval/tasks/physbench/physbench.yaml b/lmms_eval/tasks/physbench/physbench.yaml
new file mode 100644
index 000000000..62127522c
--- /dev/null
+++ b/lmms_eval/tasks/physbench/physbench.yaml
@@ -0,0 +1,32 @@
+# PhysBench: Multi-domain physics reasoning (ICLR 2025)
+# Paper: https://arxiv.org/abs/2501.16411
+# Original: USC-PSI-Lab/PhysBench (incompatible parquet schema)
+# Clean copy: lmms-lab-eval/PhysBench (val/test splits, answers merged)
+#
+# Media files (image.zip, video.zip) must be in the original repo or
+# copied to lmms-lab-eval/PhysBench. Set cache_dir to control extraction path.
+dataset_path: lmms-lab-eval/PhysBench
+dataset_kwargs:
+  token: True
+  cache_dir: physbench
+  video: True
+task: physbench
+test_split: val
+output_type: generate_until
+doc_to_visual: !function utils.physbench_doc_to_visual
+doc_to_text: !function utils.physbench_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  do_sample: false
+process_results: !function utils.physbench_process_results
+metric_list:
+  - metric: physbench_accuracy
+    aggregation: !function utils.physbench_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+metadata:
+  version: 0.2
diff --git a/lmms_eval/tasks/physbench/utils.py b/lmms_eval/tasks/physbench/utils.py
new file mode 100644
index 000000000..267ebd833
--- /dev/null
+++ b/lmms_eval/tasks/physbench/utils.py
@@ -0,0 +1,192 @@
+import json
+import os
+import urllib.request
+from collections import defaultdict
+from pathlib import Path
+
+import datasets
+import yaml
+from loguru import logger as eval_logger
+
+from lmms_eval import utils as lmms_utils
+from lmms_eval.tasks._task_utils.mcq_extract import extract_mcq_answer
+
+# PhysBench category breakdowns
+TASK_TYPES = ["property", "relationships", "scene", "dynamics"]
+ABILITY_TYPES = ["identify", "comparison", "static", "dynamic", "perception", "prediction", "judgment", "reasoning"]
+
+# URL for val split answers (answers not included in HF dataset)
+VAL_ANSWER_URL = "https://raw.githubusercontent.com/USC-GVL/PhysBench/main/eval/physbench/val_answer.json"
+
+# Cached answer map: idx -> {answer, task_type, sub_type, ability_type}
+_val_answers = None
+
+
+def _fetch_val_answers():
+    """Return the val answers keyed by ``idx``, downloading them at most once per process ({} on failure)."""
+    global _val_answers
+    if _val_answers is not None:
+        return _val_answers
+
+    eval_logger.info(f"Fetching PhysBench val answers from {VAL_ANSWER_URL}")
+    try:
+        with urllib.request.urlopen(VAL_ANSWER_URL, timeout=30) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+        _val_answers = {item["idx"]: item for item in data}  # each value is the full answer record
+        eval_logger.info(f"Loaded {len(_val_answers)} val answers")
+    except Exception as e:
+        eval_logger.warning(f"Failed to fetch val answers: {e}. Accuracy will not be computed.")
+        _val_answers = {}  # the failure is cached too, so the download is not retried
+    return _val_answers
+
+
+def _load_task_config():
+    """Parse the sibling physbench.yaml, dropping every line that contains ``!function``."""
+    with open(Path(__file__).parent / "physbench.yaml", "r") as handle:
+        kept_lines = (line for line in handle if "!function" not in line)
+        return yaml.safe_load("".join(kept_lines))
+
+
+def _get_cache_dir():
+    """Pass the YAML ``dataset_kwargs.cache_dir`` and the expanded ``HF_HOME`` to ``resolve_cache_dir``."""
+    cache_name = _load_task_config()["dataset_kwargs"]["cache_dir"]
+    return lmms_utils.resolve_cache_dir(cache_name, base_dir=os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/")))
+
+
+def physbench_process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Filter to val split only (test answers are hidden) and merge answers."""
+    # Filter to val entries only
+    dataset = dataset.filter(lambda x: x["split"] == "val")
+
+    # Fetch and merge answer metadata
+    answers = _fetch_val_answers()
+    if not answers:
+        return dataset
+
+    def _merge_answer(example):
+        ans_info = answers.get(example["idx"], {})
+        # Fall back to the value already on the row (if any) so that an idx missing
+        # from the answer file does not blank out fields the dataset itself provides.
+        for field in ("answer", "task_type", "sub_type", "ability_type"):
+            example[field] = ans_info.get(field, example.get(field) or "")
+        return example
+
+    # Rows without any source for a field end up with "" in that field.
+    dataset = dataset.map(_merge_answer)
+    return dataset
+
+
+def _resolve_media_path(cache_dir, fname):
+    """Resolve media file path, checking subdirectory and flat layouts.
+
+    The PhysBench README instructs users to ``unzip image.zip -d image`` which
+    produces ``{cache_dir}/image/foo.png``.  The lmms-eval framework's zip
+    extraction, however, extracts flat into ``cache_dir`` giving
+    ``{cache_dir}/foo.png``.  We check both locations.
+
+    ``.mp4`` names use the ``video`` subdirectory, everything else ``image``.
+    If the file is in neither place, a warning is logged and the subdirectory
+    path is still returned, so callers may receive a path that does not exist.
+    """
+    is_video = os.path.splitext(fname)[1].lower() == ".mp4"
+    subdir_path = os.path.join(cache_dir, "video" if is_video else "image", fname)
+    flat_path = os.path.join(cache_dir, fname)
+
+    # The subdirectory layout wins over the flat layout when both exist.
+    for candidate in (subdir_path, flat_path):
+        if os.path.exists(candidate):
+            return candidate
+
+    # Neither found; return the subdirectory path for the warning message
+    eval_logger.warning(f"PhysBench media file not found: {subdir_path} (also checked {flat_path})")
+    return subdir_path
+
+
+def physbench_doc_to_visual(doc):
+    """Return list of resolved media paths (images and videos) for a document; [] if it has none."""
+    # Clean dataset uses media_path (string), original uses file_name (list); a null/empty value means no media.
+    media = doc.get("file_name") or doc.get("media_path") or []
+    if isinstance(media, str):
+        media = [media]
+    cache_dir = _get_cache_dir()
+    return [_resolve_media_path(cache_dir, fname) for fname in media]
+
+
+def physbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Return the stripped question followed by the optional ``post_prompt``.
+
+    The question text itself is passed through untouched apart from trimming
+    surrounding whitespace, so any inline options or <video>/<image>
+    placeholders it contains are kept as-is.  ``pre_prompt`` is not used.
+    """
+    prompt_cfg = lmms_eval_specific_kwargs or {}
+    suffix = prompt_cfg.get("post_prompt", "")
+
+    # A KeyError here means the doc has no "question" field at all.
+    stem = doc["question"].strip()
+
+    return stem + suffix
+
+
+def physbench_process_results(doc, results):
+    """Extract predicted answer and compare to ground truth (score is 0.0 when either one is missing)."""
+    pred = results[0] if results else ""
+    # Normalise None (null answer column, or an extractor that returns None) to "" before .upper().
+    pred_ans = extract_mcq_answer(pred, choices=["A", "B", "C", "D"]) or ""
+    gt_ans = doc.get("answer") or ""
+    score = 1.0 if gt_ans and pred_ans.upper() == gt_ans.upper() else 0.0
+
+    data_dict = {
+        "idx": doc.get("idx", doc.get("index", "")),
+        "task_type": doc.get("task_type", ""),
+        "sub_type": doc.get("sub_type", ""),
+        "ability_type": doc.get("ability_type", ""),
+        "pred_answer": pred_ans,
+        "answer": gt_ans,
+        "score": score,
+    }
+
+    return {"physbench_accuracy": data_dict}
+
+
+def physbench_aggregate_results(results):
+    """Return overall accuracy in percent and log per-category breakdowns.
+
+    Every result counts towards the overall number.  A result is added to a
+    task_type / ability_type tally only when that field is non-empty, and only
+    the categories named in TASK_TYPES / ABILITY_TYPES are logged.
+
+    Each result is expected to be a dict with a ``score`` entry, as built by
+    physbench_process_results.
+    """
+    # result field -> (category names to log, in order; running tallies per category)
+    breakdowns = {
+        "task_type": (TASK_TYPES, defaultdict(lambda: {"correct": 0, "total": 0})),
+        "ability_type": (ABILITY_TYPES, defaultdict(lambda: {"correct": 0, "total": 0})),
+    }
+
+    total_correct = 0
+    total_answered = 0
+
+    for result in results:
+        total_answered += 1
+        correct = result["score"]
+        total_correct += correct
+
+        for field, (_, tallies) in breakdowns.items():
+            category = result.get(field, "")
+            if category:
+                tallies[category]["total"] += 1
+                tallies[category]["correct"] += correct
+
+    # Log per-category results
+    for field, (categories, tallies) in breakdowns.items():
+        for category in categories:
+            stats = tallies.get(category, {"correct": 0, "total": 0})
+            if stats["total"] > 0:
+                acc = 100 * stats["correct"] / stats["total"]
+                eval_logger.info(f"PhysBench {field}={category}: {acc:.1f}% ({int(stats['correct'])}/{stats['total']})")
+
+    overall_acc = 100 * total_correct / total_answered if total_answered > 0 else 0
+    eval_logger.info(f"PhysBench Overall: {overall_acc:.1f}% ({int(total_correct)}/{total_answered})")
+    return overall_acc
diff --git a/lmms_eval/tasks/physgame/physgame.yaml b/lmms_eval/tasks/physgame/physgame.yaml
new file mode 100644
index 000000000..81122396d
--- /dev/null
+++ b/lmms_eval/tasks/physgame/physgame.yaml
@@ -0,0 +1,32 @@
+# NOTE: PhysGame/PhysGame-Benchmark does not exist as a public HuggingFace dataset.
+# The PhysGame org (https://huggingface.co/PhysGame) only has training datasets
+# (PhysGame-40K, PhysDPO-10k). The 880-video benchmark referenced in the paper
+# (https://arxiv.org/abs/2412.01800) and GitHub repo was never published publicly.
+# This task cannot run until the authors release the benchmark dataset.
+# See: https://github.com/PhysGame/PhysGame
+dataset_path: PhysGame/PhysGame-Benchmark
+dataset_kwargs:
+  token: True
+  cache_dir: physgame
+  video: True
+task: physgame
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.physgame_doc_to_visual
+doc_to_text: !function utils.physgame_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  do_sample: false
+process_results: !function utils.physgame_process_results
+metric_list:
+  - metric: physgame_accuracy
+    aggregation: !function utils.physgame_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/physgame/utils.py b/lmms_eval/tasks/physgame/utils.py
new file mode 100644
index 000000000..cf930e9c7
--- /dev/null
+++ b/lmms_eval/tasks/physgame/utils.py
@@ -0,0 +1,107 @@
+import os
+import re
+import sys
+from pathlib import Path
+
+import yaml
+from loguru import logger as eval_logger
+
+# Physics categories from the PhysGame benchmark (4 domains, 12 fine-grained)
+PHYSICS_DOMAINS = ["Mechanics", "Kinematics", "Optics", "Material Properties"]
+
+# Resolve the cache location pieces once at import time: expanded HF_HOME and the YAML cache_dir name.
+hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/")
+base_cache_dir = os.path.expanduser(hf_home)
+# Lines containing "!function" are dropped before parsing because yaml.safe_load
+# has no constructor for that custom tag.
+with open(Path(__file__).parent / "physgame.yaml", "r") as f:
+    raw_data = f.readlines()
+safe_data = [line for line in raw_data if "!function" not in line]
+cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
+
+
+def physgame_doc_to_visual(doc):
+    """Return ``[path]`` for the doc's video, named ``<question_id>.<ext>`` inside the cache directory."""
+    cache_dir = os.path.join(base_cache_dir, cache_name)
+    video_id = str(doc["question_id"])
+    video_path = os.path.join(cache_dir, video_id + ".mp4")
+    if os.path.exists(video_path):
+        return [video_path]
+    # Try alternate extensions
+    for ext in ["MP4", "mkv", "webm", "avi"]:
+        alt = video_path.rsplit(".", 1)[0] + "." + ext  # swap the trailing ".mp4" for ext
+        if os.path.exists(alt):
+            return [alt]
+    sys.exit(f"video path:{video_path} does not exist, please check")  # raises SystemExit when no candidate exists
+
+
+def physgame_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Build the prompt: pre_prompt + question + newline + rendered options + post_prompt.
+
+    A dict of options is rendered as one ``"<key>. <value>"`` line per entry, a
+    list is joined with newlines as-is, and anything else goes through ``str``.
+    """
+    prompt_cfg = {} if lmms_eval_specific_kwargs is None else lmms_eval_specific_kwargs
+    stem = doc["question"]
+    choices = doc["options"]
+
+    if isinstance(choices, dict):
+        rendered = "\n".join(f"{label}. {body}" for label, body in choices.items())
+    elif isinstance(choices, list):
+        rendered = "\n".join(choices)
+    else:
+        rendered = str(choices)
+
+    prefix, suffix = prompt_cfg.get("pre_prompt", ""), prompt_cfg.get("post_prompt", "")
+    return f"{prefix}{stem}\n{rendered}{suffix}"
+
+
+def _extract_answer(s):
+    """Extract a single letter answer [A-D] from model output ("" if there is none)."""
+    s = s.strip()
+    # Remove common prefixes
+    for prefix in [
+        "The best answer is",
+        "The correct answer is",
+        "The answer is",
+        "The answer",
+        "Best answer:",
+        "Answer:",
+    ]:
+        s = s.replace(prefix, "")
+    s = s.strip().lstrip(":(").strip()
+
+    # Prefer a letter that stands alone ("B", "(B)", "B.") so that a capital inside
+    # a word ("Based on the video, C") is not mistaken for the chosen option.
+    match = re.search(r"\b[ABCD]\b", s)
+    if match is None:
+        # Otherwise fall back to the first capital A-D anywhere in the text.
+        match = re.search(r"[ABCD]", s)
+    return match[0] if match else ""
+
+
+def physgame_process_results(doc, results):
+    """Pair the letter extracted from the first response with the normalised ground truth.
+
+    The ground truth is ``str(doc["answer"])`` stripped and upper-cased; scoring
+    itself is left to ``physgame_aggregate_results``.
+    """
+    predicted = _extract_answer(results[0])
+    expected = str(doc["answer"]).strip().upper()
+
+    # The outer key is the metric name; the inner dict is one aggregation record.
+    record = {"question_id": doc["question_id"], "pred_answer": predicted, "answer": expected}
+    return {"physgame_accuracy": record}
+
+
+def physgame_aggregate_results(results):
+    """Return exact-match accuracy in percent (0 when empty) and log it; an empty answer never scores."""
+    correct = 0
+    total = 0
+    for result in results:
+        total += 1
+        if result["answer"] and result["pred_answer"] == result["answer"]:
+            correct += 1
+    accuracy = 100.0 * correct / total if total > 0 else 0
+    eval_logger.info(f"PhysGame Overall Accuracy: {accuracy:.1f}% ({correct}/{total})")
+    return accuracy
diff --git a/lmms_eval/tasks/physics_rw/physics_rw.yaml b/lmms_eval/tasks/physics_rw/physics_rw.yaml
new file mode 100644
index 000000000..2dd81f624
--- /dev/null
+++ b/lmms_eval/tasks/physics_rw/physics_rw.yaml
@@ -0,0 +1,30 @@
+dataset_path: json
+dataset_kwargs:
+  data_files:
+    test:
+      - https://huggingface.co/datasets/zhaopengyu/Physics-RW/resolve/main/Physics-RW/Electromagnetism/classification/classification_en.json
+      - https://huggingface.co/datasets/zhaopengyu/Physics-RW/resolve/main/Physics-RW/Mechanics/classification/classification_en.json
+      - https://huggingface.co/datasets/zhaopengyu/Physics-RW/resolve/main/Physics-RW/Optics/classification/classification_en.json
+      - https://huggingface.co/datasets/zhaopengyu/Physics-RW/resolve/main/Physics-RW/Thermodynamics/classification/classification_en.json
+task: physics_rw
+test_split: test
+process_docs: !function utils.physics_rw_process_docs
+output_type: generate_until
+doc_to_visual: !function utils.physics_rw_doc_to_visual
+doc_to_text: !function utils.physics_rw_doc_to_text
+doc_to_target: "label"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  do_sample: false
+process_results: !function utils.physics_rw_process_results
+metric_list:
+  - metric: physics_rw_accuracy
+    aggregation: !function utils.physics_rw_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/physics_rw/utils.py b/lmms_eval/tasks/physics_rw/utils.py
new file mode 100644
index 000000000..45498fda2
--- /dev/null
+++ b/lmms_eval/tasks/physics_rw/utils.py
@@ -0,0 +1,166 @@
+import os
+import re
+
+import datasets
+from loguru import logger as eval_logger
+
+# Maps video filename prefix to the HuggingFace subdirectory for that domain.
+_PREFIX_TO_DOMAIN = {
+    "elec": "Electromagnetism",
+    "elasticity": "Mechanics",
+    "gravity": "Mechanics",
+    "dominoes": "Mechanics",
+    "optics": "Optics",
+    "thermology": "Thermodynamics",
+    "buoyancy": "Mechanics",
+    "pressure": "Mechanics",
+}
+
+_HF_BASE = "https://huggingface.co/datasets/zhaopengyu/Physics-RW/resolve/main/Physics-RW"
+
+DOMAINS = ["Electromagnetism", "Mechanics", "Optics", "Thermodynamics"]
+
+
+def _get_cache_dir():
+    """Return ``<HF_HOME>/physics_rw`` (``HF_HOME`` defaults to ``~/.cache/huggingface/``; ``~`` is expanded)."""
+    return os.path.join(os.path.expanduser(os.environ.get("HF_HOME", "~/.cache/huggingface/")), "physics_rw")
+
+
+def _domain_from_video_path(video_path):
+    """Map a video path to its physics domain, or "Unknown" if nothing matches."""
+    stem = os.path.basename(video_path).replace(".mp4", "")
+    # First choice: the filename starts with one of the known prefixes.
+    by_prefix = next((dom for pre, dom in _PREFIX_TO_DOMAIN.items() if stem.startswith(pre)), None)
+    if by_prefix is not None:
+        return by_prefix
+
+    # Fallback: a domain name appearing anywhere in the path (case-insensitive).
+    lowered = video_path.lower()
+    return next((dom for dom in DOMAINS if dom.lower() in lowered), "Unknown")
+
+
+def _download_video(video_path, domain, cache_dir):
+    """Download a video from HuggingFace if not already cached; return its local path, or None on failure."""
+    # video_path is like "video/elec_ori_video_26_0.mp4"
+    filename = os.path.basename(video_path)
+    local_path = os.path.join(cache_dir, domain, filename)
+    if os.path.exists(local_path):
+        return local_path
+
+    url = f"{_HF_BASE}/{domain}/classification/{video_path}"
+    os.makedirs(os.path.dirname(local_path), exist_ok=True)
+    # Download under a per-process temp name and rename on success, so an interrupted transfer never passes exists().
+    tmp_path = f"{local_path}.{os.getpid()}.part"
+    try:
+        import urllib.request
+        eval_logger.info("Physics-RW: downloading {} -> {}", url, local_path)
+        urllib.request.urlretrieve(url, tmp_path)
+        os.replace(tmp_path, local_path)
+        return local_path
+    except Exception as e:
+        eval_logger.warning("Physics-RW: failed to download {}: {}", url, e)
+        return None
+
+
+def physics_rw_process_docs(dataset):
+    """Return a new Dataset whose rows carry a ``domain`` derived from ``video_path`` ("" if absent)."""
+    # Each row is copied into a fresh dict before the field is added.
+    rows = [
+        {**row, "domain": _domain_from_video_path(row.get("video_path", ""))}
+        for row in map(dict, dataset)
+    ]
+    return datasets.Dataset.from_list(rows)
+
+
+def physics_rw_doc_to_visual(doc):
+    """Return ``[local_path]`` for the doc's video, downloading it on a cache miss; ``[]`` on failure."""
+    video_path = doc.get("video_path", "")
+    if not video_path:
+        # Without a path the basename is "" and the domain directory itself would be returned.
+        eval_logger.warning("Physics-RW: no video_path in doc")
+        return []
+    cache_dir = _get_cache_dir()
+    domain = doc.get("domain", _domain_from_video_path(video_path))
+    # Try local cache first
+    local_path = os.path.join(cache_dir, domain, os.path.basename(video_path))
+    if not os.path.exists(local_path):
+        local_path = _download_video(video_path, domain, cache_dir)
+    if local_path and os.path.exists(local_path):
+        return [local_path]
+    eval_logger.warning("Physics-RW: video not found for {}", video_path)
+    return []
+
+
+def physics_rw_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Return the doc's ``instruction`` wrapped in the optional ``pre_prompt`` / ``post_prompt``."""
+    prompt_cfg = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs else {}
+    prefix, suffix = (prompt_cfg.get(key, "") for key in ("pre_prompt", "post_prompt"))
+    body = doc.get("instruction", "")
+    return f"{prefix}{body}{suffix}"
+
+
+def _extract_yes_no(text):
+    """Extract yes/no answer from model response ("" when neither word is present)."""
+    text = text.strip().lower()
+    # Direct match at start.  The word boundary keeps "note", "none" or
+    # "yesterday" from being read as a bare "no" / "yes".
+    leading = re.match(r"(yes|no)\b", text)
+    if leading:
+        return leading.group(1)
+    # Search for yes/no in the response
+    yes_match = re.search(r"\byes\b", text)
+    no_match = re.search(r"\bno\b", text)
+    if yes_match and not no_match:
+        return "yes"
+    if no_match and not yes_match:
+        return "no"
+    # Both present: take whichever appears first
+    if yes_match and no_match:
+        return "yes" if yes_match.start() < no_match.start() else "no"
+    return ""
+
+
+def physics_rw_process_results(doc, results):
+    """Build the per-doc record consumed by ``physics_rw_aggregate_results``.
+
+    The prediction is the yes/no extracted from the first response; the ground
+    truth is the doc's ``label`` stripped and lower-cased.
+    """
+    predicted = _extract_yes_no(results[0])
+    expected = doc.get("label", "").strip().lower()
+    record = {"idx": doc.get("idx", -1), "domain": doc.get("domain", "Unknown")}
+    record["pred_answer"] = predicted
+    record["answer"] = expected
+
+    # The outer key is the metric name.
+    return {"physics_rw_accuracy": record}
+
+
+def physics_rw_aggregate_results(results):
+    """Return overall yes/no accuracy in percent (0.0 when empty) and log the per-domain breakdown."""
+    domain_stats = {domain: {"correct": 0, "total": 0} for domain in DOMAINS}
+
+    for result in results:
+        domain = result["domain"]
+        stats = domain_stats.setdefault(domain, {"correct": 0, "total": 0})
+        stats["total"] += 1
+        # An empty ground truth never counts as correct, even when the
+        # extraction also came back empty ("" == "" used to score a hit).
+        if result["answer"] and result["pred_answer"] == result["answer"]:
+            stats["correct"] += 1
+
+    for domain in DOMAINS:
+        stats = domain_stats.get(domain, {"correct": 0, "total": 0})
+        if stats["total"] > 0:
+            acc = 100 * stats["correct"] / stats["total"]
+            eval_logger.info("Physics-RW [{}]: {:.1f}% ({}/{})", domain, acc, stats["correct"], stats["total"])
+
+    total_correct = sum(s["correct"] for s in domain_stats.values())
+    total = sum(s["total"] for s in domain_stats.values())
+
+    if total == 0:
+        return 0.0
+
+    overall = 100 * total_correct / total
+    eval_logger.info("Physics-RW overall: {:.1f}% ({}/{})", overall, total_correct, total)
+    return overall
diff --git a/lmms_eval/tasks/physreason/physreason.yaml b/lmms_eval/tasks/physreason/physreason.yaml
new file mode 100644
index 000000000..7679cd339
--- /dev/null
+++ b/lmms_eval/tasks/physreason/physreason.yaml
@@ -0,0 +1,21 @@
+dataset_path: lmms_eval/tasks/physreason/physreason_data/full
+task: "physreason"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.physreason_doc_to_visual
+doc_to_text: !function utils.physreason_doc_to_text
+doc_to_target: "answers"
+process_results: !function utils.physreason_process_results
+
+generation_kwargs:
+  max_new_tokens: 4096
+  temperature: 0
+  do_sample: false
+
+metric_list:
+  - metric: physreason_accuracy
+    aggregation: !function utils.physreason_aggregate_results
+    higher_is_better: true
+
+metadata:
+  version: 0.0
diff --git a/lmms_eval/tasks/physreason/physreason_data/.gitignore b/lmms_eval/tasks/physreason/physreason_data/.gitignore
new file mode 100644
index 000000000..bfaf9c296
--- /dev/null
+++ b/lmms_eval/tasks/physreason/physreason_data/.gitignore
@@ -0,0 +1,2 @@
+# Generated JSONL files -- rebuilt from HF zip on first use
+*/test/
diff --git a/lmms_eval/tasks/physreason/physreason_mini.yaml b/lmms_eval/tasks/physreason/physreason_mini.yaml
new file mode 100644
index 000000000..3e46fb4a7
--- /dev/null
+++ b/lmms_eval/tasks/physreason/physreason_mini.yaml
@@ -0,0 +1,21 @@
+dataset_path: lmms_eval/tasks/physreason/physreason_data/mini
+task: "physreason_mini"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.physreason_doc_to_visual
+doc_to_text: !function utils.physreason_doc_to_text
+doc_to_target: "answers"
+process_results: !function utils.physreason_process_results
+
+generation_kwargs:
+  max_new_tokens: 4096
+  temperature: 0
+  do_sample: false
+
+metric_list:
+  - metric: physreason_accuracy
+    aggregation: !function utils.physreason_aggregate_results
+    higher_is_better: true
+
+metadata:
+  version: 0.0
diff --git a/lmms_eval/tasks/physreason/utils.py b/lmms_eval/tasks/physreason/utils.py
new file mode 100644
index 000000000..e61114c17
--- /dev/null
+++ b/lmms_eval/tasks/physreason/utils.py
@@ -0,0 +1,285 @@
+"""Utility functions for PhysReason benchmark evaluation.
+
+Handles data preparation (zip download + JSONL generation), prompt
+construction, image extraction, and scoring for open-ended physics
+problem solving.
+
+Dataset: https://huggingface.co/datasets/zhibei1204/PhysReason
+Paper:   https://arxiv.org/abs/2502.12054
+"""
+
+import json
+import os
+import re
+import zipfile
+
+import numpy as np
+from loguru import logger as eval_logger
+from PIL import Image
+
+# ---------------------------------------------------------------------------
+# Data preparation: download zip from HF, extract, build JSONL for datasets
+# ---------------------------------------------------------------------------
+
+# Hugging Face dataset repository that hosts the zipped benchmark.
+_HF_REPO = "zhibei1204/PhysReason"
+
+# Archive file name inside the repository, keyed by config name.
+_ZIP_NAMES = dict(full="PhysReason-full.zip", mini="PhysReason-mini.zip")
+
+# Top-level folder expected inside each archive, keyed by config name.
+_ZIP_ROOTS = dict(full="PhysReason_full", mini="PhysReason-mini")
+
+# The generated JSONL files live next to this module.
+_TASK_DIR = os.path.dirname(os.path.abspath(__file__))
+_DATA_DIR = os.path.join(_TASK_DIR, "physreason_data")
+
+# Extracted archive contents (the images live here) are cached under
+# $HF_HOME, falling back to ~/.cache/huggingface when HF_HOME is unset.
+_CACHE_BASE = os.path.join(
+    os.path.expanduser(os.environ.get("HF_HOME", "~/.cache/huggingface")),
+    "physreason",
+)
+
+
+def _problem_to_row(problem_dir_name, problem_path, data):
+    """Flatten one parsed ``problem.json`` into a JSON-serialisable row.
+
+    Args:
+        problem_dir_name: Name of the problem folder; stored as ``problem_id``.
+        problem_path: Path of the problem folder; the image path is resolved
+            against it.
+        data: Parsed content of the folder's ``problem.json``.
+
+    Returns:
+        A dict with the fields read by the ``physreason_*`` task hooks.
+    """
+    # ``or`` fallbacks also cover keys that are present but null.
+    qs = data.get("question_structure") or {}
+    context = qs.get("context") or ""
+
+    # Sub-questions are stored under consecutive keys sub_question_1, _2, ...
+    sub_questions = []
+    i = 1
+    while f"sub_question_{i}" in qs:
+        sub_questions.append(qs[f"sub_question_{i}"])
+        i += 1
+
+    answers = data.get("answer", [])
+    if answers is None:
+        answers = []
+    elif isinstance(answers, str):
+        answers = [answers]
+
+    difficulty = data.get("difficulty", "unknown")
+
+    # Only the first listed image is used; it is stored as a path inside the
+    # extraction cache so doc_to_visual can open it later.
+    image_list = data.get("question_image_list") or []
+    image_rel = image_list[0] if image_list else ""
+    image_abs = os.path.join(problem_path, image_rel) if image_rel else ""
+
+    image_caption = data.get("image_captions", "")
+    if isinstance(image_caption, list):
+        image_caption = " ".join(image_caption)
+
+    # Count the steps of every sub-question whose steps are given as a dict.
+    explanation_steps = data.get("explanation_steps") or {}
+    num_steps = 0
+    if isinstance(explanation_steps, dict):
+        num_steps = sum(len(steps) for steps in explanation_steps.values() if isinstance(steps, dict))
+
+    return {
+        "problem_id": problem_dir_name,
+        "context": context,
+        "sub_questions": sub_questions,
+        "answers": answers,
+        "difficulty": difficulty,
+        "image_path": image_abs,
+        "image_caption": image_caption or "",
+        "num_sub_questions": len(sub_questions),
+        "num_steps": num_steps,
+    }
+
+
+def _ensure_data_prepared(config_name: str) -> None:
+    """Download the PhysReason zip, extract it, and build a JSONL file.
+
+    The JSONL is placed at ``physreason_data/<config_name>/test/data.jsonl``
+    so that ``datasets.load_dataset`` auto-discovers it.  Images are extracted
+    to ``$HF_HOME/physreason/<config_name>/`` and referenced by path in the
+    ``image_path`` field.  The function returns immediately when the JSONL
+    already exists.
+
+    Args:
+        config_name: Key of ``_ZIP_NAMES`` / ``_ZIP_ROOTS`` ("full" or "mini").
+
+    Raises:
+        KeyError: If ``config_name`` is not a known config.
+        RuntimeError: If the extracted archive contains no ``problem.json``.
+    """
+    split_dir = os.path.join(_DATA_DIR, config_name, "test")
+    jsonl_path = os.path.join(split_dir, "data.jsonl")
+
+    if os.path.exists(jsonl_path):
+        return  # already prepared
+
+    eval_logger.info(f"[physreason] Preparing {config_name} data ...")
+
+    from huggingface_hub import hf_hub_download
+
+    zip_file = hf_hub_download(
+        repo_id=_HF_REPO,
+        repo_type="dataset",
+        filename=_ZIP_NAMES[config_name],
+    )
+
+    extract_dir = os.path.join(_CACHE_BASE, config_name)
+    os.makedirs(extract_dir, exist_ok=True)
+
+    zip_root = _ZIP_ROOTS[config_name]
+    # The marker is written only after a complete extraction, so an
+    # interrupted extraction is redone on the next call.
+    marker = os.path.join(extract_dir, ".extracted")
+    if not os.path.exists(marker):
+        eval_logger.info(f"[physreason] Extracting {zip_file} -> {extract_dir}")
+        with zipfile.ZipFile(zip_file, "r") as zf:
+            zf.extractall(extract_dir)
+        with open(marker, "w", encoding="utf-8") as f:
+            f.write("done\n")
+
+    # Fall back to the extraction directory itself when the archive does not
+    # contain the expected top-level folder.
+    problems_root = os.path.join(extract_dir, zip_root)
+    if not os.path.isdir(problems_root):
+        problems_root = extract_dir
+
+    rows = []
+    for problem_dir_name in sorted(os.listdir(problems_root)):
+        problem_path = os.path.join(problems_root, problem_dir_name)
+        if not os.path.isdir(problem_path):
+            continue
+        json_path = os.path.join(problem_path, "problem.json")
+        if not os.path.exists(json_path):
+            continue
+
+        with open(json_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+
+        rows.append(_problem_to_row(problem_dir_name, problem_path, data))
+
+    if not rows:
+        # Writing an empty JSONL would be cached as "prepared" forever.
+        raise RuntimeError(f"[physreason] No problem.json found under {problems_root}; " f"remove {extract_dir} to force a fresh extraction.")
+
+    os.makedirs(split_dir, exist_ok=True)
+    # Write to a hidden per-process temp file and rename it into place, so an
+    # interrupted or concurrent run never leaves a truncated data.jsonl that
+    # the early-return check above would accept.
+    tmp_path = os.path.join(split_dir, f".data.jsonl.{os.getpid()}.tmp")
+    with open(tmp_path, "w", encoding="utf-8") as f:
+        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
+    os.replace(tmp_path, jsonl_path)
+
+    eval_logger.info(f"[physreason] Wrote {len(rows)} rows to {jsonl_path}")
+
+
+# Prepare both configs at import time so the JSONL files exist before
+# datasets.load_dataset is called.  This is deliberately best-effort: a
+# failure for one config (e.g. offline) must not break importing the module
+# for the other one, so the error is logged instead of raised.  The actual
+# dataset load will fail later if the needed config is missing.
+for _cfg in ("full", "mini"):
+    try:
+        _ensure_data_prepared(_cfg)
+    except Exception as _exc:
+        eval_logger.warning(f"[physreason] Could not prepare '{_cfg}' data: {_exc!r}")
+
+
+# ---------------------------------------------------------------------------
+# doc_to_visual / doc_to_text / scoring
+# ---------------------------------------------------------------------------
+
+
+def physreason_doc_to_visual(doc):
+    """Load the problem image from the cached extraction directory.
+
+    Args:
+        doc: Dataset row; its ``image_path`` field is read.
+
+    Returns:
+        A one-element list with the image converted to RGB, or an empty list
+        when the row has no image, the file is missing, or it cannot be read.
+    """
+    image_path = doc.get("image_path", "")
+    if not image_path:
+        return []
+    if not os.path.exists(image_path):
+        # The row references an image that is no longer on disk; say so
+        # instead of silently evaluating the problem without its figure.
+        eval_logger.warning(f"[physreason] Image not found: {image_path}")
+        return []
+    try:
+        # convert() returns a fully loaded copy, so the file handle opened by
+        # Image.open can be closed as soon as the with-block exits.
+        with Image.open(image_path) as img:
+            return [img.convert("RGB")]
+    except Exception:
+        eval_logger.warning(f"[physreason] Failed to load image: {image_path}")
+        return []
+
+
+def physreason_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+    """Render the prompt for one problem.
+
+    The prompt is the stripped context, then (if any) a blank line followed
+    by the sub-questions numbered ``(1)``, ``(2)``, ..., then a blank line
+    and a fixed instruction asking for step-by-step reasoning with answers
+    labelled ``Answer (N): ...``.  ``lmms_eval_specific_kwargs`` is accepted
+    but not used.
+    """
+    instruction = "Solve each sub-question step by step. " "For each sub-question, show your reasoning and then give the final answer. " "Format each final answer as: Answer (N): <your answer>"
+
+    lines = [doc.get("context", "").strip()]
+
+    questions = doc.get("sub_questions", [])
+    if questions:
+        lines.append("")
+        lines.extend(f"({number}) {question.strip()}" for number, question in enumerate(questions, 1))
+
+    lines += ["", instruction]
+    return "\n".join(lines)
+
+
+def _normalize_answer(text):
+    """Normalize a LaTeX/math answer string for comparison.
+
+    Strips surrounding whitespace and ``$`` delimiters, unwraps
+    ``\\text{}``/``\\mathrm{}``/``\\mathsf{}``/``\\mathit{}``, drops LaTeX
+    spacing commands and ``\\left``/``\\right`` sizing prefixes, and
+    collapses runs of whitespace.
+
+    Args:
+        text: The answer to normalize.  Non-string values are converted with
+            ``str``; ``None`` is treated as an empty answer.
+
+    Returns:
+        The normalized string.
+    """
+    if text is None:
+        return ""
+    s = str(text).strip()
+    s = s.strip("$")
+    s = re.sub(r"\\(?:text|mathrm|mathsf|mathit)\{([^}]*)\}", r"\1", s)
+    s = re.sub(r"\\[,;:!\s]", "", s)
+    # Remove only the sizing commands themselves.  A plain substring replace
+    # would also eat the prefix of \leftarrow / \rightarrow /
+    # \leftrightarrow and make opposite arrows compare equal.
+    s = re.sub(r"\\(?:left|right)(?![a-zA-Z])", "", s)
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+
+
+def _extract_boxed_contents(text):
+    """Return the contents of every ``\\boxed{...}`` in *text*, in order.
+
+    Braces are matched by depth so nested groups such as
+    ``\\boxed{\\frac{1}{2}}`` are returned whole.  Empty and unterminated
+    boxes are skipped.
+    """
+    marker = "\\boxed{"
+    contents = []
+    pos = text.find(marker)
+    while pos != -1:
+        start = pos + len(marker)
+        depth = 1
+        end = start
+        while end < len(text) and depth:
+            if text[end] == "{":
+                depth += 1
+            elif text[end] == "}":
+                depth -= 1
+            end += 1
+        if depth:
+            break  # unterminated box: nothing more can be matched
+        inner = text[start : end - 1]
+        if inner:
+            contents.append(inner)
+        pos = text.find(marker, end)
+    return contents
+
+
+def _extract_answers_from_response(response, num_expected):
+    """Try to extract numbered answers from model response.
+
+    Answers labelled ``Answer (N): ...`` are placed at position ``N - 1`` so
+    that a skipped or repeated label cannot shift later answers onto the
+    wrong sub-question; when a label is repeated the last occurrence wins.
+    Positions without a labelled answer are filled from ``\\boxed{...}``
+    contents when the response contains at least ``num_expected`` of them.
+
+    Args:
+        response: Raw model output.
+        num_expected: Number of sub-questions of the problem.
+
+    Returns:
+        A list of answer strings aligned with the sub-questions.  Positions
+        that could not be filled hold ``""``; trailing unfilled positions are
+        dropped, so the list is empty when nothing was found.
+    """
+    pattern = r"Answer\s*\(?(\d+)\)?[:\s]+(.+?)(?=Answer\s*\(?\d|$)"
+
+    by_index = {}
+    for number, raw in re.findall(pattern, response, re.IGNORECASE | re.DOTALL):
+        # Keep the first line only, without the final period or the
+        # markdown bold markers left over from "**Answer (1):** ...".
+        cleaned = raw.strip().split("\n")[0].strip().rstrip(".").strip("* ")
+        by_index[int(number)] = cleaned
+
+    if num_expected <= 0:
+        # Nothing to align against: return the labelled answers in order.
+        return [by_index[number] for number in sorted(by_index)]
+
+    answers = [by_index.get(number) for number in range(1, num_expected + 1)]
+
+    if any(ans is None for ans in answers):
+        boxed = _extract_boxed_contents(response)
+        if len(boxed) >= num_expected:
+            answers = [boxed[i] if ans is None else ans for i, ans in enumerate(answers)]
+
+    while answers and answers[-1] is None:
+        answers.pop()
+    return ["" if ans is None else ans for ans in answers]
+
+
+def physreason_process_results(doc, results):
+    """Score one model response against the problem's ground-truth answers.
+
+    The first entry of ``results`` is the prediction.  Answers are extracted
+    from it and compared position by position with ``doc["answers"]`` after
+    normalization; the per-problem accuracy is the number of exact matches
+    divided by the number of ground-truth answers (at least 1).
+
+    Returns:
+        ``{"physreason_accuracy": {...}}`` where the inner dict carries
+        ``problem_id``, ``difficulty``, ``num_sub_questions``, ``correct``
+        and ``accuracy``.
+    """
+    prediction = results[0].strip() if results else ""
+    ground_truth = doc.get("answers", [])
+    total = len(ground_truth)
+
+    predicted = _extract_answers_from_response(prediction, total)
+
+    # zip() stops at the shorter list, so sub-questions without an extracted
+    # answer simply never count as correct.
+    expected = [_normalize_answer(gt) for gt in ground_truth]
+    n_correct = sum(1 for want, got in zip(expected, predicted) if want == _normalize_answer(got))
+
+    return {
+        "physreason_accuracy": {
+            "problem_id": doc.get("problem_id", ""),
+            "difficulty": doc.get("difficulty", "unknown"),
+            "num_sub_questions": total,
+            "correct": n_correct,
+            "accuracy": n_correct / max(total, 1),
+        }
+    }
+
+
+def physreason_aggregate_results(results):
+    """Average the per-problem accuracies into a single score.
+
+    Also logs the mean accuracy and problem count per ``difficulty`` value
+    (in sorted order) and for the whole set.
+
+    Returns:
+        The mean of the ``accuracy`` field over ``results`` as a float, or
+        ``0.0`` (with a warning) when ``results`` is empty.
+    """
+    if not results:
+        eval_logger.warning("Empty results list for PhysReason. Returning 0.0")
+        return 0.0
+
+    overall = float(np.mean([entry["accuracy"] for entry in results]))
+
+    # Group accuracies by difficulty label.
+    grouped = {}
+    for entry in results:
+        grouped.setdefault(entry["difficulty"], []).append(entry["accuracy"])
+
+    for level in sorted(grouped):
+        scores = grouped[level]
+        acc = float(np.mean(scores))
+        eval_logger.info(f"PhysReason [{level}]: {acc:.4f} ({len(scores)} problems)")
+
+    eval_logger.info(f"PhysReason [overall]: {overall:.4f} ({len(results)} problems)")
+    return overall