From 1d326b1e368993377eba813d8907f06ae1d8f5e9 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Thu, 26 Mar 2026 17:04:14 +0100 Subject: [PATCH] feat: add physics reasoning benchmarks (PhysBench, ContPhy, PhysGame, PhysicsRW, PhysReason) Add five physics reasoning benchmarks: - PhysBench: multi-domain physics reasoning (ICLR 2025) - ContPhy: continuum physics understanding from videos (ICML 2024) - PhysGame: physics understanding from game environments - PhysicsRW: real-world physics scenarios - PhysReason: physics reasoning with mini split Also adds shared MCQ answer extraction utility used by PhysBench. --- lmms_eval/tasks/_task_utils/mcq_extract.py | 147 ++++ lmms_eval/tasks/contphy/contphy.yaml | 43 ++ lmms_eval/tasks/contphy/generate_qa.py | 658 ++++++++++++++++++ lmms_eval/tasks/contphy/utils.py | 265 +++++++ lmms_eval/tasks/physbench/physbench.yaml | 32 + lmms_eval/tasks/physbench/utils.py | 192 +++++ lmms_eval/tasks/physgame/physgame.yaml | 32 + lmms_eval/tasks/physgame/utils.py | 107 +++ lmms_eval/tasks/physics_rw/physics_rw.yaml | 30 + lmms_eval/tasks/physics_rw/utils.py | 166 +++++ lmms_eval/tasks/physreason/physreason.yaml | 21 + .../physreason/physreason_data/.gitignore | 2 + .../tasks/physreason/physreason_mini.yaml | 21 + lmms_eval/tasks/physreason/utils.py | 285 ++++++++ 14 files changed, 2001 insertions(+) create mode 100644 lmms_eval/tasks/_task_utils/mcq_extract.py create mode 100644 lmms_eval/tasks/contphy/contphy.yaml create mode 100644 lmms_eval/tasks/contphy/generate_qa.py create mode 100644 lmms_eval/tasks/contphy/utils.py create mode 100644 lmms_eval/tasks/physbench/physbench.yaml create mode 100644 lmms_eval/tasks/physbench/utils.py create mode 100644 lmms_eval/tasks/physgame/physgame.yaml create mode 100644 lmms_eval/tasks/physgame/utils.py create mode 100644 lmms_eval/tasks/physics_rw/physics_rw.yaml create mode 100644 lmms_eval/tasks/physics_rw/utils.py create mode 100644 lmms_eval/tasks/physreason/physreason.yaml create mode 100644 
_DEFAULT_CHOICES = ["A", "B", "C", "D", "E", "F", "G", "H"]

_ANSWER_PHRASES = [
    "the answer is",
    "answer is",
    "the correct answer is",
    "correct answer is",
    "the best answer is",
    "best answer is",
    "the correct option is",
    "correct option is",
    "the best option is",
    "best option is",
    "the choice is",
    "choice is",
    "the correct choice is",
    "correct choice is",
    "i choose",
    "i select",
    "i pick",
    "my answer is",
    "my choice is",
]

# Higher = more confident that this is the intended answer.
_FORMAT_PRIORITY = {
    "start": 10,
    "end": 9,
    "phrase": 7,
    "parentheses": 6,
    "period": 5,
    "colon": 4,
    "right_paren": 3,
    "space": 2,
    "fallback": 0,
}

# Characters allowed between an answer phrase and the letter itself,
# e.g. "the answer is: **B**" or "the answer is (B)".
_PHRASE_FILLER = r"""[\s:*_\-(\["'`]*"""

# A choice letter only counts when it is not glued to other word characters,
# so the "A." at the end of "NASA." is not mistaken for option A.
_NOT_AFTER_WORD = r"(?<![A-Za-z0-9])"
_NOT_BEFORE_WORD = r"(?![A-Za-z0-9])"


def _last_match_start(pattern: str, text: str) -> int:
    """Return the start offset of the last match of *pattern* in *text*, or -1."""
    last = -1
    for match in re.finditer(pattern, text):
        last = match.start()
    return last


def extract_mcq_answer(response: str, choices: Optional[List[str]] = None) -> str:
    """Extract a multiple-choice answer letter from model output.

    Collects candidate letters in several common formats ("(A)", "A.",
    "A:", "A)", "A ", "the answer is A", a leading letter, a trailing
    letter) and ranks them with ``_FORMAT_PRIORITY``. The highest-priority
    format wins; within one format the occurrence closest to the end wins,
    which favours the final answer of a reasoning-style response.

    Args:
        response: Model output. Callers are expected to have removed any
            reasoning-tag markup beforehand.
        choices: Valid choice letters. Defaults to ``["A".."H"]``.

    Returns:
        The matching entry of ``choices`` (an uppercase letter for the
        default choices), or ``""`` if none is found.
    """
    if not response or not response.strip():
        return ""

    # Empty strings would match everywhere, so drop them up front.
    all_choices = [ch for ch in (choices or _DEFAULT_CHOICES) if ch]
    if not all_choices:
        return ""

    text = response.strip()
    for char in [",", ".", "!", "?", ";", ":", "'", '"']:
        text = text.strip(char)
    stripped = text.strip()
    # Pad with spaces so the "letter followed by space" format can match
    # a letter at the very end.
    text = " " + text + " "

    candidates: list = []  # (letter, position, format_name)

    # --- (A) / A. / A: / A) / "A " --------------------------------------
    for ch in all_choices:
        token = re.escape(ch)
        bounded = _NOT_AFTER_WORD + token
        formats = (
            ("parentheses", rf"\({token}\)"),
            ("period", bounded + r"\."),
            ("colon", bounded + r":"),
            ("right_paren", bounded + r"\)"),
            ("space", bounded + r" "),
        )
        for format_name, pattern in formats:
            position = _last_match_start(pattern, text)
            if position != -1:
                candidates.append((ch, position, format_name))

    # --- Common answer phrases ("the answer is A", etc.) -----------------
    # Only the letter that directly follows the phrase counts; a letter that
    # merely appears somewhere later in the text is not the stated answer.
    # The phrase is matched case-insensitively on the original text so the
    # reported offsets always refer to ``text`` itself.
    phrase_alt = "|".join(re.escape(p) for p in sorted(_ANSWER_PHRASES, key=len, reverse=True))
    choice_alt = "|".join(re.escape(c) for c in sorted(all_choices, key=len, reverse=True))
    phrase_re = re.compile(
        rf"(?<![A-Za-z])(?i:{phrase_alt}){_PHRASE_FILLER}"
        rf"(?:(?i:option|choice){_PHRASE_FILLER})?"
        rf"({choice_alt}){_NOT_BEFORE_WORD}"
    )
    for match in phrase_re.finditer(text):
        candidates.append((match.group(1), match.start(1), "phrase"))

    # --- Starts / ends with a standalone choice (not part of a word) -----
    for ch in all_choices:
        width = len(ch)
        if stripped.startswith(ch) and (len(stripped) == width or not stripped[width].isalpha()):
            candidates.append((ch, 0, "start"))
        if stripped.endswith(ch) and (len(stripped) == width or not stripped[-width - 1].isalpha()):
            candidates.append((ch, len(text) - 1, "end"))

    # --- Whole response is one choice in another case, e.g. "b" or "(b)" -
    if not candidates:
        by_upper = {ch.upper(): ch for ch in all_choices}
        core = stripped.strip("()[]{}*`\"' ")
        if core.upper() in by_upper:
            candidates.append((by_upper[core.upper()], 0, "fallback"))

    # --- Fallback: any occurrence (lowest priority) ----------------------
    if not candidates:
        for ch in all_choices:
            if ch in text:
                candidates.append((ch, text.rfind(ch), "fallback"))

    if not candidates:
        return ""

    # Sort by (priority DESC, position DESC): the highest-priority format
    # wins; within the same format, the later position wins.
    candidates.sort(
        key=lambda x: (_FORMAT_PRIORITY.get(x[2], 0), x[1]),
        reverse=True,
    )
    return candidates[0][0]
Update data_files below to point to your generated QA JSON +dataset_path: json +dataset_kwargs: + data_files: + test: contphy_qa.json + cache_dir: contphy + video: true +task: contphy +test_split: test +output_type: generate_until +process_docs: !function utils.contphy_process_docs +doc_to_visual: !function utils.contphy_doc_to_visual +doc_to_text: !function utils.contphy_doc_to_text +doc_to_target: "answer" +generation_kwargs: + max_new_tokens: 16 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false +process_results: !function utils.contphy_process_results +metric_list: + - metric: contphy_accuracy + aggregation: !function utils.contphy_aggregate_results + higher_is_better: true +lmms_eval_specific_kwargs: + default: + pre_prompt: "" + post_prompt: "\nAnswer with the option's letter from the given choices directly." + gpt4v: + pre_prompt: "" + post_prompt: "\nPlease answer with the correct choice letter (e.g. A, B, or C). Please do NOT add any other text in your response." +metadata: + - version: 0.0 diff --git a/lmms_eval/tasks/contphy/generate_qa.py b/lmms_eval/tasks/contphy/generate_qa.py new file mode 100644 index 000000000..a2581b655 --- /dev/null +++ b/lmms_eval/tasks/contphy/generate_qa.py @@ -0,0 +1,658 @@ +"""Generate ContPhy QA JSON from raw dataset zips. + +Downloads (or reads from local cache) the ContPhy zip files from HuggingFace +and produces a single JSON file (one array of question objects) with multiple-choice QA pairs suitable for +lmms-eval ingestion. + +Usage: + python -m lmms_eval.tasks.contphy.generate_qa --output /path/to/contphy_qa.json + +Pass --data-dir to skip the download if you already have the zips extracted. 
# ---------------------------------------------------------------------------
# Scenario -> zip name mapping
# ---------------------------------------------------------------------------
SCENARIO_ZIPS = {
    "fluid": "fluid_full.zip",
    "rope": "rope_full.zip",
    "cloth": "cloth_full.zip",
    "ball": "ball_full.zip",
}

SCENARIO_DIRS = {
    "fluid": "fluid_slides",
    "rope": "pulley_group",
    "cloth": "cloth_collision",
    "ball": "soft_body",
}

HF_BASE = "https://huggingface.co/datasets/zzcnewly/ContPhy_Dataset/resolve/main"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _color_name(name: str) -> str:
    """Extract color from object name like 'Blue Fluid' -> 'blue'."""
    parts = name.split()
    if len(parts) >= 2:
        return parts[0].lower()
    return name.lower()


def _compare_word(greater: bool) -> str:
    """Return the comparison word used in question text."""
    return "greater" if greater else "less"


def _containers_receiving(receptor_stats: dict, fluid_name: str) -> set:
    """Return the containers in *receptor_stats* holding a positive amount of *fluid_name*."""
    return {container for container, per_fluid in receptor_stats.items() if per_fluid.get(fluid_name, 0) > 0}


def _container_options(answer: str, distractors: list) -> list:
    """Build a shuffled option list from the answer and up to two distractors."""
    options = [answer] + distractors[:2]
    if len(options) < 3:
        options.append("None of the above")
    random.shuffle(options)
    return options


# ---------------------------------------------------------------------------
# Question generators per scenario
# ---------------------------------------------------------------------------
def generate_fluid_questions(data: dict, video_id: str) -> list[dict]:
    """Generate MC questions for the fluid scenario.

    Args:
        data: Parsed per-trial annotation dict. The keys read here are
            ``metaSamplingData`` (``fluids``, ``sticks``), ``trackingData``
            (``perReceptorFluidStat``) and ``CounterFactualAnnotations``.
        video_id: Identifier stored on every generated question.

    Returns:
        A list of question dicts with ``question``, ``options``, ``answer``,
        ``question_type``, ``question_class``, ``scenario`` and ``video_id``.
        Option order is shuffled with the module-level ``random`` state.
    """
    questions = []
    meta = data.get("metaSamplingData", {})
    fluids = meta.get("fluids", [])
    sticks = meta.get("sticks", [])
    receptor_stats = data.get("trackingData", {}).get("perReceptorFluidStat", {})
    cf_annotations = data.get("CounterFactualAnnotations", {})

    def add(question, options, answer, question_type, question_class):
        questions.append(
            {
                "question": question,
                "options": options,
                "answer": answer,
                "question_type": question_type,
                "question_class": question_class,
                "scenario": "fluid",
                "video_id": video_id,
            }
        )

    # Filter to fluids without 'Later Emitted' prefix for cleaner questions
    named_fluids = [f for f in fluids if not f["name"].startswith("Later")]

    # --- Property: Density comparison (one question per unordered pair) ---
    for i, f1 in enumerate(named_fluids):
        for f2 in named_fluids[i + 1 :]:
            q = f"Is the density of the {f1['name'].lower()} {_compare_word(True)} than that of the {f2['name'].lower()}?"
            answer = "Yes" if f1["density"] > f2["density"] else "No"
            add(q, ["Yes", "No", "Cannot Judge"], answer, "property", "density")

    # --- Property: Stick count ---
    if sticks:
        n = len(sticks)
        # Sort numerically: sorting the strings would put "10" before "9".
        counts = sorted({n, max(1, n - 1), n + 1, n + 2})
        add(
            "How many sticks are there in the video?",
            [str(c) for c in counts],
            str(n),
            "property",
            "stick_number",
        )

    # --- Predictive: Which container will fluid flow into (one per video) ---
    all_receptors = list(receptor_stats)
    for container_name, container_fluids in receptor_stats.items():
        target = None
        for fluid_name, amount in container_fluids.items():
            if fluid_name.startswith("Later") or not amount > 0:
                continue
            # A fluid may reach several containers; only containers it does
            # NOT reach are valid wrong answers.
            reached = _containers_receiving(receptor_stats, fluid_name)
            distractors = [c for c in all_receptors if c not in reached]
            if distractors:
                target = (fluid_name, distractors)
                break
        if target is None:
            continue
        fluid_name, distractors = target
        add(
            f"Which container will {fluid_name.lower()} flow into?",
            _container_options(container_name, distractors),
            container_name,
            "predictive",
            "container",
        )
        break  # One total

    # --- Counterfactual: If stick removed, where would fluid go ---
    # Only the first counterfactual annotation is used.
    for cf_data in list(cf_annotations.values())[:1]:
        removed = cf_data.get("removedStickName", "")
        if not removed:
            continue
        cf_receptors = cf_data.get("trackingData", {}).get("perReceptorFluidStat", {})
        # Find a fluid that changed destination
        for fluid_name in [f["name"] for f in named_fluids]:
            orig_containers = _containers_receiving(receptor_stats, fluid_name)
            cf_containers = _containers_receiving(cf_receptors, fluid_name)
            # Sorted so the result does not depend on set iteration order,
            # which varies between interpreter runs for strings.
            new_containers = sorted(cf_containers - orig_containers)
            if not new_containers:
                continue
            answer = new_containers[0]
            all_containers = sorted(set(receptor_stats) | set(cf_receptors))
            # Exclude every container the fluid reaches in the counterfactual
            # run, otherwise a distractor could also be a correct answer.
            distractors = [c for c in all_containers if c not in cf_containers]
            add(
                f"If the {removed.lower()} were removed, which container would {fluid_name.lower()} flow into?",
                _container_options(answer, distractors),
                answer,
                "counterfactual",
                "container",
            )
            break

    return questions
+ answer = "Yes" if gt_greater else "No" + questions.append( + { + "question": q, + "options": ["Yes", "No", "Cannot Judge"], + "answer": answer, + "question_type": "property", + "question_class": "tension", + "scenario": "rope", + "video_id": video_id, + } + ) + + # --- Counterfactual: If object were heavier, which direction --- + for cf_key, cf_data in list(cf_annotations.items())[:1]: + cf_rotations = cf_data.get("ResultRotation", {}) + # Find a pulley that changed rotation direction + for pulley_name, orig_rot in rotations.items(): + cf_rot = cf_rotations.get(pulley_name, orig_rot) + if cf_rot != orig_rot and cf_rot != 0: + direction = "clockwise" if cf_rot > 0 else "anti-clockwise" + # Pick a changed mass object + changed_obj = cf_data.get("changedMassObjectName", "") + if not changed_obj and objects_with_mass: + changed_obj = objects_with_mass[0] + q = f"If the {changed_obj.lower()} were far much heavier, " f"which direction would the {pulley_name.lower()} rotate?" + questions.append( + { + "question": q, + "options": ["Clockwise", "Anti-clockwise", "No rotation"], + "answer": direction.capitalize(), + "question_type": "counterfactual", + "question_class": "rotation", + "scenario": "rope", + "video_id": video_id, + } + ) + break + + # --- Property: Object counting (shapes/colors) --- + name2pos = meta.get("name2position", {}) + if name2pos: + # Count objects by type + type_counts = {} + for obj_name in name2pos: + for obj_type in ["Cube", "Sphere", "Pulley"]: + if obj_type in obj_name: + type_counts[obj_type.lower()] = type_counts.get(obj_type.lower(), 0) + 1 + for obj_type, count in type_counts.items(): + plural = obj_type + "s" if count != 1 else obj_type + q = f"How many {plural} are there in the video?" 
def _first_posed_object(object_annotation: dict):
    """Return ``(name, pose)`` of the first isolated object with a pose description, else ``None``."""
    for side_key in ("leftAll", "rightAll"):
        isolated = object_annotation.get(side_key, {}).get("isolatedObjects", {})
        for obj_name, obj_data in isolated.items():
            pose = obj_data.get("endPoseDescription", "")
            if pose:
                return obj_name, pose
    return None


def generate_cloth_questions(data: dict, video_id: str) -> list[dict]:
    """Generate MC questions for the cloth scenario.

    A question is only emitted when the annotation it is derived from is
    actually present: a missing compliance value or pose description yields
    no question rather than a ground truth built from a default value.

    Args:
        data: Parsed per-trial annotation dict. The keys read here are
            ``clothLeft``, ``clothRight`` and ``objectFullAnnotation``.
        video_id: Identifier stored on every generated question.

    Returns:
        A list of question dicts with ``question``, ``options``, ``answer``,
        ``question_type``, ``question_class``, ``scenario`` and ``video_id``.
    """
    questions = []
    cloth_left = data.get("clothLeft", {})
    cloth_right = data.get("clothRight", {})
    ofa = data.get("objectFullAnnotation", {})

    def add(question, options, answer, question_type, question_class):
        questions.append(
            {
                "question": question,
                "options": options,
                "answer": answer,
                "question_type": question_type,
                "question_class": question_class,
                "scenario": "cloth",
                "video_id": video_id,
            }
        )

    # --- Property: Elasticity (stretching compliance) ---
    sc_l = cloth_left.get("stretchingCompliance")
    sc_r = cloth_right.get("stretchingCompliance")
    if sc_l is not None and sc_r is not None:
        # Higher compliance = easier to stretch = more elastic
        add(
            "Is the left cloth much easier to stretch than the other?",
            ["Yes", "No"],
            "Yes" if sc_l > sc_r else "No",
            "property",
            "elasticity",
        )

    # --- Property: Bending ---
    bc_l = cloth_left.get("bendingCompliance")
    bc_r = cloth_right.get("bendingCompliance")
    if bc_l is not None and bc_r is not None:
        # Lower compliance = harder to bend
        add(
            "Is the left cloth much harder to bend or have wrinkles than the other?",
            ["Yes", "No"],
            "Yes" if bc_l < bc_r else "No",
            "property",
            "bending",
        )

    # Both predictive questions are asked once per video, about the first
    # isolated object (left side first) that has a recorded end pose.
    posed = _first_posed_object(ofa)
    if posed is not None:
        obj_name, pose = posed

        # --- Predictive: Object fall over ---
        add(
            f"Does the {obj_name.lower()} fall over?",
            ["Yes", "No"],
            "Yes" if pose != "Upright" else "No",
            "predictive",
            "fall_over",
        )

        # --- Predictive: Final pose ---
        if pose == "Upright":
            pose_answer = "Standing upright"
        elif "lean" in pose.lower() or "tilt" in pose.lower():
            pose_answer = "Leaning"
        else:
            pose_answer = "Lying horizontally"
        add(
            f"Which phrase below can best describe the final pose of the {obj_name.lower()}?",
            ["Standing upright", "Leaning", "Lying horizontally"],
            pose_answer,
            "predictive",
            "pose",
        )

    return questions
+ answer = "Yes" if gt_greater else "No" + questions.append( + { + "question": q, + "options": ["Yes", "No"], + "answer": answer, + "question_type": "property", + "question_class": "elasticity", + "scenario": "ball", + "video_id": video_id, + } + ) + + # --- Property: Plasticity comparison --- + if len(balls) >= 2: + for i in range(len(balls)): + for j in range(i + 1, len(balls)): + b1, b2 = balls[i], balls[j] + e1 = b1.get("elasticityType", "") + e2 = b2.get("elasticityType", "") + # Plastic has highest plasticity + plasticity_rank = {"Plastic": 3, "Rigid": 2, "Elastic": 1} + r1 = plasticity_rank.get(e1, 0) + r2 = plasticity_rank.get(e2, 0) + if r1 != r2: + gt_greater = r1 > r2 + q = f"Is the plasticity of the " f"{b1['name'].lower()} much greater than " f"the {b2['name'].lower()}?" + answer = "Yes" if gt_greater else "No" + questions.append( + { + "question": q, + "options": ["Yes", "No"], + "answer": answer, + "question_type": "property", + "question_class": "plasticity", + "scenario": "ball", + "video_id": video_id, + } + ) + break + else: + continue + break + + # --- Predictive: Final drop (which pit) --- + for ball_name, ball_tracking in tracking.items(): + pit_result = ball_tracking.get("pitResult", "") + if "Left" in pit_result: + answer = "The left pit" + elif "Right" in pit_result: + answer = "The right pit" + elif "No Pit" in pit_result: + answer = "None of the above" + else: + continue + + q = f"Will the {ball_name.lower()} finally drop into the left pit or the right pit?" 
# ---------------------------------------------------------------------------
# Main generation pipeline
# ---------------------------------------------------------------------------
GENERATORS = {
    "fluid": generate_fluid_questions,
    "rope": generate_rope_questions,
    "cloth": generate_cloth_questions,
    "ball": generate_ball_questions,
}


def process_scenario(scenario: str, data_dir: str) -> list[dict]:
    """Process all trials in a scenario directory.

    Args:
        scenario: Key into ``SCENARIO_DIRS`` / ``GENERATORS``.
        data_dir: Root directory containing the extracted scenario folders.

    Returns:
        The questions generated for every trial that has an ``outputs.json``
        marked valid and an ``output_Full.mp4`` next to it. Returns an empty
        list when the scenario directory does not exist.
    """
    scenario_dir_name = SCENARIO_DIRS[scenario]
    scenario_path = Path(data_dir) / scenario_dir_name

    if not scenario_path.exists():
        print(f"  Skipping {scenario}: {scenario_path} not found")
        return []

    generator = GENERATORS[scenario]
    all_questions = []

    # Numeric trial names are ordered by value; any other name sorts as 0.
    trial_dirs = sorted(
        [d for d in scenario_path.iterdir() if d.is_dir()],
        key=lambda d: int(d.name) if d.name.isdigit() else 0,
    )

    for trial_dir in trial_dirs:
        outputs_path = trial_dir / "outputs.json"
        if not outputs_path.exists():
            continue

        with open(outputs_path, encoding="utf-8") as f:
            data = json.load(f)

        if not data.get("validity", False):
            continue

        video_file = trial_dir / "output_Full.mp4"
        if not video_file.exists():
            continue

        video_id = f"{scenario_dir_name}/{trial_dir.name}"
        all_questions.extend(generator(data, video_id))

    return all_questions


def _download(url: str, dest: str) -> None:
    """Download *url* to *dest* without ever leaving a truncated file at *dest*."""
    import urllib.request

    # Fetch into a sibling temp file and rename only on success: callers
    # treat an existing zip as complete, so a partial one must never appear.
    tmp_path = dest + ".part"
    try:
        urllib.request.urlretrieve(url, tmp_path)
        os.replace(tmp_path, dest)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def download_and_extract(cache_dir: str, use_mini: bool = False) -> str:
    """Download ContPhy zips from HuggingFace and extract them.

    Args:
        cache_dir: Directory holding the downloaded zips and the extracted
            ``contphy_data`` folder. Created if missing.
        use_mini: Fetch the single ``contphy_mini.zip`` instead of the
            per-scenario archives listed in ``SCENARIO_ZIPS``.

    Returns:
        Path of the extraction directory (``<cache_dir>/contphy_data``).
    """
    os.makedirs(cache_dir, exist_ok=True)
    # NOTE(review): mini and full mode share this directory, so data already
    # extracted by one mode is reused by the other — confirm this is intended.
    extract_dir = os.path.join(cache_dir, "contphy_data")

    if use_mini:
        zip_name = "contphy_mini.zip"
        url = f"{HF_BASE}/{zip_name}"
        zip_path = os.path.join(cache_dir, zip_name)

        if not os.path.exists(zip_path):
            print(f"Downloading {url} ...")
            _download(url, zip_path)

        if not os.path.exists(extract_dir):
            print(f"Extracting {zip_path} ...")
            os.makedirs(extract_dir, exist_ok=True)
            with zipfile.ZipFile(zip_path) as zf:
                zf.extractall(extract_dir)

        return extract_dir

    os.makedirs(extract_dir, exist_ok=True)

    for scenario, zip_name in SCENARIO_ZIPS.items():
        zip_path = os.path.join(cache_dir, zip_name)
        scenario_dir = os.path.join(extract_dir, SCENARIO_DIRS[scenario])

        if os.path.exists(scenario_dir):
            print(f"  {scenario}: already extracted")
            continue

        if not os.path.exists(zip_path):
            url = f"{HF_BASE}/{zip_name}"
            print(f"  Downloading {url} ...")
            _download(url, zip_path)

        print(f"  Extracting {zip_path} ...")
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(extract_dir)

    return extract_dir


def main():
    """Parse CLI arguments, generate questions for every scenario and write them as one JSON array."""
    parser = argparse.ArgumentParser(description="Generate ContPhy QA JSON")
    parser.add_argument("--data-dir", type=str, default="", help="Path to extracted ContPhy data. If empty, downloads from HF.")
    parser.add_argument("--output", type=str, required=True, help="Output JSON file path")
    parser.add_argument("--mini", action="store_true", help="Use mini dataset (20 videos per scenario) instead of full")
    parser.add_argument("--cache-dir", type=str, default="", help="Cache directory for downloads")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for option shuffling")
    args = parser.parse_args()

    random.seed(args.seed)

    data_dir = args.data_dir
    if not data_dir:
        cache_dir = args.cache_dir or os.path.join(
            os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface")),
            "contphy",
        )
        data_dir = download_and_extract(cache_dir, use_mini=args.mini)

    all_questions = []
    for scenario in ["fluid", "rope", "cloth", "ball"]:
        print(f"Processing {scenario}...")
        questions = process_scenario(scenario, data_dir)
        print(f"  Generated {len(questions)} questions")
        all_questions.extend(questions)

    # Add index
    for i, q in enumerate(all_questions):
        q["idx"] = i

    # Write output
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(all_questions, f, indent=2)

    print(f"\nTotal: {len(all_questions)} questions written to {args.output}")

    # Stats
    by_scenario = {}
    by_type = {}
    for q in all_questions:
        s = q["scenario"]
        t = q["question_type"]
        by_scenario[s] = by_scenario.get(s, 0) + 1
        by_type[t] = by_type.get(t, 0) + 1
    print("\nBy scenario:", by_scenario)
    print("By type:", by_type)


if __name__ == "__main__":
    main()
# ---------------------------------------------------------------------------
# Scenario categories for per-scenario reporting
# ---------------------------------------------------------------------------
SCENARIOS = ["fluid", "rope", "cloth", "ball"]

QUESTION_TYPES = ["property", "predictive", "counterfactual", "goal_driven"]

# Option letters
OPTION_LETTERS = "ABCDEFGHIJ"


# ---------------------------------------------------------------------------
# Data directory resolution
# ---------------------------------------------------------------------------
def _get_data_dir() -> str:
    """Resolve the ContPhy data directory containing extracted video files.

    Checks in order:
    1. CONTPHY_DATA_DIR env var
    2. $HF_HOME/contphy/contphy_data/
    """
    explicit = os.getenv("CONTPHY_DATA_DIR", "").strip()
    if explicit:
        return os.path.expanduser(explicit)

    hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))
    return os.path.join(hf_home, "contphy", "contphy_data")


# ---------------------------------------------------------------------------
# process_docs: format options with letter prefixes
# ---------------------------------------------------------------------------
def contphy_process_docs(dataset):
    """Add letter-prefixed option strings and the answer letter to each doc.

    Each kept doc gains ``options_str`` ("A. Yes\\nB. No", ...) and
    ``answer_letter``; ``answer`` is overwritten with that letter so it can
    serve as the target. Docs that cannot be scored are dropped with a
    warning: those with more options than ``OPTION_LETTERS`` has letters,
    and those whose answer text matches none of their options (their target
    would otherwise be an empty string).

    Returns:
        A ``datasets.Dataset`` built from the processed docs.
    """
    processed = []
    for doc in dataset:
        doc = dict(doc)
        options = doc.get("options") or []
        answer_text = doc.get("answer", "")

        if len(options) > len(OPTION_LETTERS):
            eval_logger.warning(
                "ContPhy: skipping doc for {} with {} options (max {})",
                doc.get("video_id", ""),
                len(options),
                len(OPTION_LETTERS),
            )
            continue

        # Build lettered options: "A. Yes", "B. No", ...
        lettered = []
        answer_letter = ""
        for letter, opt in zip(OPTION_LETTERS, options):
            lettered.append(f"{letter}. {opt}")
            if opt == answer_text:
                answer_letter = letter

        if not answer_letter:
            eval_logger.warning(
                "ContPhy: skipping doc for {}: answer {!r} is not among its options",
                doc.get("video_id", ""),
                answer_text,
            )
            continue

        doc["options_str"] = "\n".join(lettered)
        doc["answer_letter"] = answer_letter
        doc["answer"] = answer_letter  # override for doc_to_target
        processed.append(doc)
    return datasets.Dataset.from_list(processed)


# ---------------------------------------------------------------------------
# doc_to_visual: locate the video file
# ---------------------------------------------------------------------------
def contphy_doc_to_visual(doc):
    """Return the video file path for this question as a one-element list.

    Looks for ``<data_dir>/<video_id>/output_Full.mp4`` first and then for
    ``<data_dir>/<video_id>.mp4``. Returns ``[]`` (after logging a warning)
    when the doc has no ``video_id`` or neither file exists.
    """
    video_id = doc.get("video_id", "")
    if not video_id:
        eval_logger.warning("ContPhy: no video_id in doc")
        return []

    data_dir = _get_data_dir()
    video_path = os.path.join(data_dir, video_id, "output_Full.mp4")
    # Second candidate: the video stored flat, without a per-trial folder.
    alt_path = os.path.join(data_dir, f"{video_id}.mp4")

    for candidate in (video_path, alt_path):
        if os.path.exists(candidate):
            return [candidate]

    eval_logger.warning(
        "ContPhy: video not found for {} (tried {})",
        video_id,
        video_path,
    )
    return []


# ---------------------------------------------------------------------------
# doc_to_text: build the MC prompt
# ---------------------------------------------------------------------------
def contphy_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """Format the question with lettered options.

    The prompt is ``pre_prompt + question + "\\n" + options_str + post_prompt``,
    where both prompts come from ``lmms_eval_specific_kwargs`` and default to
    empty strings.
    """
    kwargs = lmms_eval_specific_kwargs or {}
    pre_prompt = kwargs.get("pre_prompt", "")
    post_prompt = kwargs.get("post_prompt", "")

    question = doc.get("question", "")
    options_str = doc.get("options_str", "")

    return f"{pre_prompt}{question}\n{options_str}{post_prompt}"
def _extract_letter(text: str, num_options: int = 5) -> str:
    """Extract a single option letter from a free-form model response.

    Candidates are tried from most to least reliable; the first hit wins:

    1. A valid letter at the very start of the response (after removing one
       leading phrase such as "The answer is"), provided it is not the first
       character of a longer word.
    2. An explicit "answer is X" / "answer: X" statement anywhere in the text.
    3. A standalone *capital* letter in the original text.  Looking at the
       original case first keeps the article "a" or the pronoun "i" from
       shadowing a real choice letter that appears later.
    4. A standalone letter in any case.
    5. Last resort: the first valid letter found anywhere in the response.

    Args:
        text: Raw model output.  ``None`` is treated as an empty response.
        num_options: Number of options in the question; only the first
            ``num_options`` letters of ``OPTION_LETTERS`` are accepted.

    Returns:
        The upper-case option letter, or ``""`` when the response is empty or
        ``num_options`` leaves no valid letters.
    """
    if text is None:
        return ""
    text = str(text).strip()

    # A non-positive option count leaves no valid letters.  Bail out early:
    # an empty character class ("[]") is not a valid regular expression.
    valid_letters = OPTION_LETTERS[: max(num_options, 0)]
    if not text or not valid_letters:
        return ""

    # Common answer prefixes to strip (longest / most specific first).
    answer_prefixes = (
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "Best answer:",
        "Answer:",
    )
    text_lower = text.lower()
    for prefix in answer_prefixes:
        if text_lower.startswith(prefix.lower()):
            text = text[len(prefix) :].strip(" :.-")
            break

    # 1. Letter at the very start, not part of a word ("A" vs "And").
    if text and text[0].upper() in valid_letters:
        if len(text) == 1 or not text[1].isalpha():
            return text[0].upper()

    letter_class = f"[{re.escape(valid_letters)}]"

    # 2. Explicit statement anywhere, e.g. "... so the answer is (B)".
    stated = re.search(
        rf"answer\s*(?:is|:)\s*[\(\[]?({letter_class})(?![A-Za-z])",
        text,
        re.IGNORECASE,
    )
    if stated:
        return stated.group(1).upper()

    # 3. Standalone capital letter in the original text.
    standalone = rf"\b({letter_class})\b"
    match = re.search(standalone, text)
    if match:
        return match.group(1)

    # 4. Standalone letter regardless of case ("(b)", "c.").
    match = re.search(standalone, text.upper())
    if match:
        return match.group(1)

    # 5. Fallback: any valid letter in the response.
    match = re.search(letter_class, text.upper())
    return match.group(0) if match else ""
def _log_contphy_breakdown(stats_by_key, ordered_keys):
    """Log one accuracy line for every key in *ordered_keys* that has samples."""
    for key in ordered_keys:
        stats = stats_by_key.get(key)
        if not stats or stats["total"] <= 0:
            continue
        eval_logger.info(
            "ContPhy [{}]: {:.1f}% ({}/{})",
            key,
            100 * stats["correct"] / stats["total"],
            stats["correct"],
            stats["total"],
        )


def contphy_aggregate_results(results):
    """Compute per-scenario, per-type, and overall accuracy.

    Args:
        results: Iterable of per-sample dicts.  Each must provide
            ``"pred_answer"`` and ``"answer"``; ``"scenario"`` and
            ``"question_type"`` default to ``"unknown"`` when absent.
            (Presumably these are the ``contphy_accuracy`` payloads built by
            ``contphy_process_results`` - confirm against the task YAML.)

    Returns:
        Overall accuracy as a percentage (0-100), or ``0.0`` when *results*
        is empty.  Per-scenario, per-question-type and per-combination
        accuracies are only logged, not returned.
    """
    scenario_stats = defaultdict(lambda: {"correct": 0, "total": 0})
    type_stats = defaultdict(lambda: {"correct": 0, "total": 0})
    combo_stats = defaultdict(lambda: {"correct": 0, "total": 0})

    total_correct = 0
    total = 0

    for result in results:
        scenario = result.get("scenario", "unknown")
        q_type = result.get("question_type", "unknown")
        ground_truth = result["answer"]
        # A sample without a ground-truth letter can never be answered
        # correctly; without this guard an empty prediction would compare
        # equal to the empty ground truth and be scored as a hit.
        hit = int(bool(ground_truth) and result["pred_answer"] == ground_truth)

        buckets = (
            scenario_stats[scenario],
            type_stats[q_type],
            combo_stats[f"{scenario}/{q_type}"],
        )
        for bucket in buckets:
            bucket["total"] += 1
            bucket["correct"] += hit
        total += 1
        total_correct += hit

    # Known scenarios keep their canonical order; anything else (for example
    # "unknown") is appended so that it still shows up in the log.
    extra_scenarios = sorted(
        (name for name in scenario_stats if name not in SCENARIOS), key=str
    )
    _log_contphy_breakdown(scenario_stats, [*SCENARIOS, *extra_scenarios])
    _log_contphy_breakdown(type_stats, sorted(type_stats, key=str))
    _log_contphy_breakdown(combo_stats, sorted(combo_stats))

    if total == 0:
        return 0.0

    overall = 100 * total_correct / total
    eval_logger.info(
        "ContPhy overall: {:.1f}% ({}/{})",
        overall,
        total_correct,
        total,
    )
    return overall
+metadata: + version: 0.2 diff --git a/lmms_eval/tasks/physbench/utils.py b/lmms_eval/tasks/physbench/utils.py new file mode 100644 index 000000000..267ebd833 --- /dev/null +++ b/lmms_eval/tasks/physbench/utils.py @@ -0,0 +1,192 @@ +import json +import os +import urllib.request +from collections import defaultdict +from pathlib import Path + +import datasets +import yaml +from loguru import logger as eval_logger + +from lmms_eval import utils as lmms_utils +from lmms_eval.tasks._task_utils.mcq_extract import extract_mcq_answer + +# PhysBench category breakdowns +TASK_TYPES = ["property", "relationships", "scene", "dynamics"] +ABILITY_TYPES = ["identify", "comparison", "static", "dynamic", "perception", "prediction", "judgment", "reasoning"] + +# URL for val split answers (answers not included in HF dataset) +VAL_ANSWER_URL = "https://raw.githubusercontent.com/USC-GVL/PhysBench/main/eval/physbench/val_answer.json" + +# Cached answer map: idx -> {answer, task_type, sub_type, ability_type} +_val_answers = None + + +def _fetch_val_answers(): + """Download and cache the val answer file from the PhysBench GitHub repo.""" + global _val_answers + if _val_answers is not None: + return _val_answers + + eval_logger.info(f"Fetching PhysBench val answers from {VAL_ANSWER_URL}") + try: + with urllib.request.urlopen(VAL_ANSWER_URL, timeout=30) as resp: + data = json.loads(resp.read().decode("utf-8")) + _val_answers = {item["idx"]: item for item in data} + eval_logger.info(f"Loaded {len(_val_answers)} val answers") + except Exception as e: + eval_logger.warning(f"Failed to fetch val answers: {e}. 
# Parsed contents of physbench.yaml; filled on first use so the file is read
# and parsed once instead of once per document.
_task_config = None


def _load_task_config():
    """Load ``physbench.yaml`` (located next to this module) as a dict.

    Lines containing ``!function`` are dropped before parsing because
    ``yaml.safe_load`` has no constructor for that custom tag.  The parsed
    result is cached at module level: ``_get_cache_dir`` calls this helper
    every time a document's media paths are resolved.

    Returns:
        The parsed task configuration.  The same object is returned on every
        call, so callers should treat it as read-only.
    """
    global _task_config
    if _task_config is None:
        config_path = Path(__file__).parent / "physbench.yaml"
        with open(config_path, "r", encoding="utf-8") as f:
            safe_lines = [line for line in f if "!function" not in line]
        _task_config = yaml.safe_load("".join(safe_lines))
    return _task_config
def physbench_doc_to_visual(doc):
    """Return the list of media paths (images and videos) for a document.

    The clean dataset stores a single ``media_path`` string, while the
    original dataset stores a ``file_name`` list; both layouts are accepted.
    Missing, ``None`` or empty media yields an empty list without touching
    the task configuration, and empty entries inside a list are skipped
    instead of being resolved to a bare directory path.
    """
    # Clean dataset uses media_path (string), original uses file_name (list)
    media = doc.get("file_name") or doc.get("media_path") or []
    if isinstance(media, str):
        media = [media]
    if not media:
        return []

    cache_dir = _get_cache_dir()
    return [_resolve_media_path(cache_dir, fname) for fname in media if fname]