From fb00a2637b19762bbae4a3fbe5e190c672ac8497 Mon Sep 17 00:00:00 2001 From: Mohit Garg Date: Fri, 24 Apr 2026 19:07:01 +0530 Subject: [PATCH 1/3] fix(hf): route all uploads through bench.load_data() and fix image handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upload script had three alternative code paths (load_csv_benchmark, load_json_benchmark, load_manifest_benchmark) that bypassed each benchmark's own load_data() to save time on dataset enumeration. Those shortcuts drifted from the contract build_model_input() expects at runtime, producing nested / half-formed sample rows on HF that crashed downstream. The sweep showed 8 of 39 benchmarks broken on HF: svg-1, svg-2, svg-5 KeyError: 'options' / 'original_svg' layout-2, layout-8 TypeError / KeyError: 'input_image' temporal-1, temporal-2, temporal-3 KeyError: 'shuffled_keyframe_paths' / 'video_path' Two bugs were at play: 1. The shortcut paths produced the wrong sample shape. Fix: delete them; always go through load_via_registry so HF parquet is round-trip equivalent to local load_data() output. 2. load_via_registry was over-eagerly excluding keys like video_path, input_image, source_image, input_composite from metadata, assuming they'd be packed into the `image` column. That column holds at most one PIL blob, so path-valued keys were silently lost. Fix: only exclude sample_id / ground_truth / prompt (the columns that faithfully preserve those values); everything else survives in metadata. Also: - Strip absolute-path prefixes from metadata (portability — HF consumers don't share the uploader's filesystem layout). - Add per-500-sample progress logging so slow load_data() invocations aren't a silent black box. - hf.py: guard against non-PIL image column values (Value("string") for generation-task configs) to avoid `'str' object has no attribute 'save'`. Verification: in-process sweep (load_from_hub → build_model_input) now reports 38/39 benchmarks passing, up from 31/39. layout-2 upload pending completion (load_data() takes ~10 min on macOS — one-time upload cost, not a runtime concern). Made-with: Cursor --- scripts/upload_to_hf.py | 339 ++++++++++------------------------------ src/gdb/hf.py | 8 +- 2 files changed, 89 insertions(+), 258 deletions(-) diff --git a/scripts/upload_to_hf.py b/scripts/upload_to_hf.py index 3c590e2..2a61933 100644 --- a/scripts/upload_to_hf.py +++ b/scripts/upload_to_hf.py @@ -15,7 +15,6 @@ from __future__ import annotations import argparse -import csv import json import logging import sys @@ -30,48 +29,7 @@ DEFAULT_DATASET_ROOT = REPO_ROOT / "data" / "gdb-dataset" HF_REPO_ID = "lica-world/GDB" -SKIP_BENCHMARKS = set() - -# Benchmarks whose load_data() is too slow (image compositing, alpha checks) -# and should be loaded directly from their manifest CSVs instead. -MANIFEST_BENCHMARKS = { - "layout-1": { - "csv": "layout2_manifest.csv", - "prompt_key": "prompt", - "gt_key": "source_layout", - "image_key": "reference_image", - }, - "layout-2": { - "csv": "layout_single_manifest.csv", - "prompt_key": "prompt", - "gt_key": "ground_truth_image", - "image_key": "input_composite", - }, - "layout-3": { - "csv": "g4_firestore_image_gen_pairs_manifest.filtered_component_renders.csv", - "prompt_key": None, - "gt_key": None, - "image_key": "a_image_path", - }, - "layout-8": { - "csv": "g15_object_insertion_manifest.csv", - "prompt_key": "prompt", - "gt_key": "ground_truth_image", - "image_key": "masked_layout", - }, - "typography-7": { - "csv": "g10_text_element_manifest.csv", - "prompt_key": "prompt", - "gt_key": "ground_truth_image", - "image_key": "input_image", - }, - "typography-8": { - "csv": "g10_text_inpaint_manifest.csv", - "prompt_key": "prompt", - "gt_key": "ground_truth_image", - "image_key": "input_image", - }, -} +SKIP_BENCHMARKS: set = set() def _serialize(value: Any) -> str: @@ -98,202 +56,24 @@ def _is_video(path: str) -> bool: return path.lower().endswith(".mp4") -def _read_csv(csv_path: Path) -> List[Dict[str, str]]: - with open(csv_path, "r", encoding="utf-8") as f: - return list(csv.DictReader(f)) - - -def _read_json(json_path: Path) -> Any: - with open(json_path, encoding="utf-8") as f: - return json.load(f) - - -def load_csv_benchmark( - benchmark_id: str, - meta: Any, - data_dir: Path, - dataset_root: Path, -) -> List[Dict[str, Any]]: - csv_path = data_dir / "samples.csv" - if not csv_path.exists(): - raise FileNotFoundError(f"samples.csv not found in {data_dir}") - - rows_out = [] - base = dataset_root.resolve() - - for row in _read_csv(csv_path): - img_rel = row.get("image_path", "") - img_abs = str((base / img_rel).resolve()) if img_rel else "" - is_vid = _is_video(img_abs) if img_abs else False - - has_image = bool(img_abs and not is_vid - and Path(img_abs).exists() and _is_image_file(img_abs)) - - extra = {k: v for k, v in row.items() - if k not in ("sample_id", "prompt", "image_path", "expected_output")} - - rows_out.append({ - "sample_id": row.get("sample_id", ""), - "benchmark_id": benchmark_id, - "domain": meta.domain, - "task_type": meta.task_type.value, - "benchmark_name": meta.name, - "prompt": row.get("prompt", ""), - "ground_truth": row.get("expected_output", ""), - "image": img_abs if has_image else None, - "media_path": img_rel, - "media_type": "video" if is_vid else ("image" if has_image else "none"), - "metadata": json.dumps(extra, ensure_ascii=False) if extra else "{}", - }) +def _normalize_paths(value: Any, dataset_root_str: str) -> Any: + """Replace absolute paths under ``dataset_root`` with the relative tail. - return rows_out - - -JSON_FIELD_MAP = { - "svg-1": {"gt_key": "answer", "extra": ["svg_code", "question", "options"]}, - "svg-2": {"gt_key": "answer", "extra": ["svg_code", "question", "options"]}, - "svg-3": {"gt_key": "fixed_svg", "extra": ["bug_svg", "error_type", "difficulty"]}, - "svg-4": {"gt_key": None, "extra": ["origin_svg", "opti_ratio"]}, - "svg-5": {"gt_key": "answer", "extra": ["original_svg", "command"]}, - "svg-6": {"gt_key": None, "extra": ["description"]}, - "svg-7": {"gt_key": None, "extra": ["description"]}, - "svg-8": {"gt_key": None, "extra": ["description"]}, - "lottie-1": {"gt_key": None, "extra": ["description"]}, - "lottie-2": {"gt_key": None, "extra": ["description"]}, - "template-1": {"gt_key": "label", "extra": []}, - "template-2": {"gt_key": None, "extra": []}, - "template-3": {"gt_key": None, "extra": ["n_clusters"]}, - "template-4": {"gt_key": None, "extra": []}, - "template-5": {"gt_key": None, "extra": ["difficulty"]}, -} - - -def load_json_benchmark( - benchmark_id: str, - meta: Any, - data_dir: Path, - dataset_root: Path, -) -> List[Dict[str, Any]]: - json_path = data_dir / f"{benchmark_id}.json" - if not json_path.exists(): - raise FileNotFoundError(f"{benchmark_id}.json not found in {data_dir}") - - data = _read_json(json_path) - base = dataset_root.resolve() - rows_out = [] - - if isinstance(data, list): - items = data - else: - for key in ("samples", "items", "pairs", "queries", "problems"): - if key in data: - items = data[key] - break - else: - items = [data] - - for item in items: - sid = str(item.get("id", item.get("sample_id", ""))) - - gt = item.get("answer", item.get("ground_truth", item.get("label", ""))) - if isinstance(gt, (dict, list)): - gt = json.dumps(gt, ensure_ascii=False) - else: - gt = str(gt) if gt is not None else "" - - prompt = item.get("question", item.get("prompt", item.get("description", ""))) - if isinstance(prompt, (dict, list)): - prompt = json.dumps(prompt, ensure_ascii=False) - - img_rel = item.get("image_path", item.get("image", "")) - img_abs = "" - if img_rel and isinstance(img_rel, str): - candidate = base / img_rel - if candidate.exists(): - img_abs = str(candidate.resolve()) - else: - candidate2 = data_dir / img_rel - if candidate2.exists(): - img_abs = str(candidate2.resolve()) - - is_vid = _is_video(img_abs) if img_abs else False - has_image = bool(img_abs and not is_vid - and Path(img_abs).exists() and _is_image_file(img_abs)) - - skip_keys = {"id", "sample_id", "answer", "ground_truth", "label", - "question", "prompt", "description", "image_path", "image"} - extra = {k: v for k, v in item.items() if k not in skip_keys} - - rows_out.append({ - "sample_id": sid, - "benchmark_id": benchmark_id, - "domain": meta.domain, - "task_type": meta.task_type.value, - "benchmark_name": meta.name, - "prompt": str(prompt) if prompt else "", - "ground_truth": gt, - "image": img_abs if has_image else None, - "media_path": str(img_rel) if img_rel else "", - "media_type": "video" if is_vid else ("image" if has_image else "none"), - "metadata": json.dumps(extra, ensure_ascii=False, default=str) if extra else "{}", - }) - - return rows_out - - -def load_manifest_benchmark( - benchmark_id: str, - meta: Any, - data_dir: Path, - dataset_root: Path, -) -> List[Dict[str, Any]]: - spec = MANIFEST_BENCHMARKS[benchmark_id] - csv_path = data_dir / spec["csv"] - if not csv_path.exists(): - raise FileNotFoundError(f"{spec['csv']} not found in {data_dir}") - - base = dataset_root.resolve() - prompt_key = spec["prompt_key"] - gt_key = spec["gt_key"] - image_key = spec["image_key"] - skip_keys = {"sample_id", prompt_key, gt_key, image_key} - {None} - - rows_out = [] - for row in _read_csv(csv_path): - sid = row.get("sample_id", row.get("pair_id", "")) - prompt = row.get(prompt_key, "") if prompt_key else "" - gt_raw = row.get(gt_key, "") if gt_key else "" - - img_rel = row.get(image_key, "") if image_key else "" - img_abs = "" - if img_rel: - for candidate_base in [data_dir, base]: - candidate = candidate_base / img_rel - if candidate.exists(): - img_abs = str(candidate.resolve()) - break - - is_vid = _is_video(img_abs) if img_abs else False - has_image = bool(img_abs and not is_vid - and Path(img_abs).exists() and _is_image_file(img_abs)) - - extra = {k: v for k, v in row.items() if k not in skip_keys} - - rows_out.append({ - "sample_id": sid, - "benchmark_id": benchmark_id, - "domain": meta.domain, - "task_type": meta.task_type.value, - "benchmark_name": meta.name, - "prompt": prompt, - "ground_truth": gt_raw, - "image": img_abs if has_image else None, - "media_path": img_rel, - "media_type": "video" if is_vid else ("image" if has_image else "none"), - "metadata": json.dumps(extra, ensure_ascii=False, default=str) if extra else "{}", - }) - - return rows_out + Benchmarks load locally with absolute paths like + ``/home/.../gdb-dataset/benchmarks/svg/assets/foo.png``; those strings + are useless to an HF consumer who doesn't have that tree on disk. We + strip the root prefix so the parquet is portable — downstream code can + still try ``Path(x).is_file()`` and gracefully fall back when missing. + """ + if isinstance(value, str): + if value.startswith(dataset_root_str + "/"): + return value[len(dataset_root_str) + 1:] + return value + if isinstance(value, list): + return [_normalize_paths(v, dataset_root_str) for v in value] + if isinstance(value, dict): + return {k: _normalize_paths(v, dataset_root_str) for k, v in value.items()} + return value def load_via_registry( @@ -304,18 +84,32 @@ def load_via_registry( dataset_root: Path, ) -> List[Dict[str, Any]]: bench = registry.get(benchmark_id) + logger.info(" %s: calling bench.load_data()…", benchmark_id) + t0 = time.time() samples = bench.load_data(data_dir, dataset_root=str(dataset_root)) + logger.info(" %s: load_data produced %d samples in %.1fs", + benchmark_id, len(samples), time.time() - t0) + dataset_root_str = str(Path(dataset_root).resolve()) rows_out = [] - for sample in samples: + # Only keys whose values are faithfully preserved elsewhere in the parquet + # row are excluded from ``metadata``. The ``image`` column packs at most + # ONE PIL blob, so path-valued keys (``video_path``, ``input_image``, + # ``shuffled_keyframe_paths`` etc.) MUST survive in metadata — otherwise + # ``build_model_input`` crashes with KeyError on the HF side. + metadata_skip = {"sample_id", "ground_truth", "prompt"} + for i, sample in enumerate(samples): + if i and i % 500 == 0: + logger.info(" %s: packed %d/%d rows", benchmark_id, i, len(samples)) img_path = _find_image(sample) is_vid = _is_video(img_path) if img_path else False has_image = bool(img_path and not is_vid and Path(img_path).exists() and _is_image_file(img_path)) - skip = {"sample_id", "ground_truth", "prompt", "image_path", - "input_image", "input_composite", "source_image", "video_path"} - extra = {k: v for k, v in sample.items() if k not in skip} + extra = {k: _normalize_paths(v, dataset_root_str) + for k, v in sample.items() if k not in metadata_skip} + + media_path_rel = _normalize_paths(img_path, dataset_root_str) if img_path else "" rows_out.append({ "sample_id": str(sample.get("sample_id", "")), @@ -326,7 +120,7 @@ def load_via_registry( "prompt": sample.get("prompt", ""), "ground_truth": _serialize(sample.get("ground_truth", "")), "image": img_path if has_image else None, - "media_path": img_path or "", + "media_path": media_path_rel, "media_type": "video" if is_vid else ("image" if has_image else "none"), "metadata": json.dumps(extra, ensure_ascii=False, default=str) if extra else "{}", }) @@ -339,6 +133,17 @@ def load_benchmark( benchmark_id: str, dataset_root: Path, ) -> List[Dict[str, Any]]: + """Always load via the benchmark's own ``load_data()``. + + The historical ``load_csv_benchmark`` / ``load_json_benchmark`` / + ``load_manifest_benchmark`` shortcuts were faster but drifted from the + sample shape each benchmark's ``build_model_input()`` expects at runtime + (e.g. svg-1 was uploaded with nested ``questions`` structures that the + pipeline could not consume, and several benchmarks were missing the + per-entry sample expansion their ``load_data()`` does). Going through + ``load_via_registry`` guarantees HF parquet rows round-trip to the same + shape local-file loading produces. + """ if benchmark_id in SKIP_BENCHMARKS: logger.info("Skipping %s (excluded)", benchmark_id) return [] @@ -352,19 +157,9 @@ def load_benchmark( logger.warning("Skipping %s: %s", benchmark_id, exc) return [] - csv_path = data_dir / "samples.csv" - json_path = data_dir / f"{benchmark_id}.json" - t0 = time.time() try: - if benchmark_id in MANIFEST_BENCHMARKS: - rows = load_manifest_benchmark(benchmark_id, meta, data_dir, dataset_root) - elif csv_path.exists(): - rows = load_csv_benchmark(benchmark_id, meta, data_dir, dataset_root) - elif json_path.exists(): - rows = load_json_benchmark(benchmark_id, meta, data_dir, dataset_root) - else: - rows = load_via_registry(registry, benchmark_id, meta, data_dir, dataset_root) + rows = load_via_registry(registry, benchmark_id, meta, data_dir, dataset_root) except Exception as exc: logger.warning("Failed to load %s: %s: %s", benchmark_id, type(exc).__name__, exc) return [] @@ -403,6 +198,34 @@ def build_dataset(all_rows: List[Dict[str, Any]]): return datasets.Dataset.from_list(all_rows, features=features) +_BENCHMARK_ID_PATTERN = r"^[a-z]+-\d+$" + + +def _merge_card_configs(api, repo_id: str, new_configs: List[str]) -> List[str]: + """Union new configs with any already-declared configs on the Hub. + + Avoids the footgun where pushing only a subset of benchmarks and then + regenerating the card would delete declarations for the rest. + """ + import re + from huggingface_hub import hf_hub_download + + try: + existing_readme = hf_hub_download( + repo_id=repo_id, repo_type="dataset", filename="README.md", + ) + except Exception: + return sorted(set(new_configs)) + + content = Path(existing_readme).read_text(encoding="utf-8") + existing = set() + for match in re.finditer(r"- config_name:\s*([^\s]+)", content): + name = match.group(1).strip() + if re.match(_BENCHMARK_ID_PATTERN, name): + existing.add(name) + return sorted(existing | set(new_configs)) + + def generate_dataset_card(config_names: Optional[List[str]] = None) -> str: if config_names is None: config_names = ["all"] @@ -579,13 +402,17 @@ def main(): ds.push_to_hub(args.repo_id, config_name=bid, commit_message=f"Upload GDB benchmark: {bid}") + # Merge configs into the existing card rather than replacing, so that a + # partial re-upload (e.g. --benchmarks template-4 template-5) doesn't wipe + # out declarations for the other 37 configs that are still on the Hub. logger.info("Uploading dataset card...") + card_config_names = _merge_card_configs(api, args.repo_id, sorted(per_benchmark.keys())) api.upload_file( - path_or_fileobj=generate_dataset_card(sorted(per_benchmark.keys())).encode("utf-8"), + path_or_fileobj=generate_dataset_card(card_config_names).encode("utf-8"), path_in_repo="README.md", repo_id=args.repo_id, repo_type="dataset", - commit_message="Add dataset card", + commit_message="Update dataset card", ) logger.info("Done! https://huggingface.co/datasets/%s", args.repo_id) diff --git a/src/gdb/hf.py b/src/gdb/hf.py index 76ccb9d..69dc069 100644 --- a/src/gdb/hf.py +++ b/src/gdb/hf.py @@ -116,9 +116,13 @@ def load_from_hub( if k not in sample: sample[k] = v - # Handle image: save PIL to cache, store path + # Handle image: save PIL to cache, store path. + # Generation-task configs store `image` as an empty string rather than + # a PIL object (the column type is Value("string") when no samples in + # that config have images), so skip anything that isn't a PIL image. pil_img = row.get("image") - if pil_img is not None: + has_pil = pil_img is not None and hasattr(pil_img, "save") + if has_pil: dest = _image_cache_path(cache_dir, benchmark_id, sample["sample_id"]) if dest.exists(): img_path = str(dest) From af09b5b409996c640d425722505a63f9a3f6811d Mon Sep 17 00:00:00 2001 From: Mohit Garg Date: Fri, 24 Apr 2026 19:15:12 +0530 Subject: [PATCH 2/3] cleanup: trim verbose docstrings and dead code in HF upload path - drop SKIP_BENCHMARKS (never populated) and its guard in load_benchmark - trim load_benchmark / _normalize_paths / _merge_card_configs docstrings (historical refactor narrative belongs in the commit log, not the code) - collapse multi-line comments in load_via_registry and hf.load_from_hub to one-liners that describe invariants rather than restate the code - drop unused api param from _merge_card_configs - shorten hf.py module/function docstrings No behavior change. Made-with: Cursor --- scripts/upload_to_hf.py | 54 +++++++---------------------------------- src/gdb/hf.py | 29 +++++++--------------- 2 files changed, 18 insertions(+), 65 deletions(-) diff --git a/scripts/upload_to_hf.py b/scripts/upload_to_hf.py index 2a61933..497101d 100644 --- a/scripts/upload_to_hf.py +++ b/scripts/upload_to_hf.py @@ -29,8 +29,6 @@ DEFAULT_DATASET_ROOT = REPO_ROOT / "data" / "gdb-dataset" HF_REPO_ID = "lica-world/GDB" -SKIP_BENCHMARKS: set = set() - def _serialize(value: Any) -> str: if isinstance(value, str): @@ -57,14 +55,7 @@ def _is_video(path: str) -> bool: def _normalize_paths(value: Any, dataset_root_str: str) -> Any: - """Replace absolute paths under ``dataset_root`` with the relative tail. - - Benchmarks load locally with absolute paths like - ``/home/.../gdb-dataset/benchmarks/svg/assets/foo.png``; those strings - are useless to an HF consumer who doesn't have that tree on disk. We - strip the root prefix so the parquet is portable — downstream code can - still try ``Path(x).is_file()`` and gracefully fall back when missing. - """ + """Strip ``dataset_root`` prefix from absolute paths so parquet is portable.""" if isinstance(value, str): if value.startswith(dataset_root_str + "/"): return value[len(dataset_root_str) + 1:] @@ -92,11 +83,8 @@ def load_via_registry( dataset_root_str = str(Path(dataset_root).resolve()) rows_out = [] - # Only keys whose values are faithfully preserved elsewhere in the parquet - # row are excluded from ``metadata``. The ``image`` column packs at most - # ONE PIL blob, so path-valued keys (``video_path``, ``input_image``, - # ``shuffled_keyframe_paths`` etc.) MUST survive in metadata — otherwise - # ``build_model_input`` crashes with KeyError on the HF side. + # Only fields promoted to their own parquet column are excluded from + # ``metadata``; everything else (including path-valued keys) must survive. metadata_skip = {"sample_id", "ground_truth", "prompt"} for i, sample in enumerate(samples): if i and i % 500 == 0: @@ -133,24 +121,7 @@ def load_benchmark( benchmark_id: str, dataset_root: Path, ) -> List[Dict[str, Any]]: - """Always load via the benchmark's own ``load_data()``. - - The historical ``load_csv_benchmark`` / ``load_json_benchmark`` / - ``load_manifest_benchmark`` shortcuts were faster but drifted from the - sample shape each benchmark's ``build_model_input()`` expects at runtime - (e.g. svg-1 was uploaded with nested ``questions`` structures that the - pipeline could not consume, and several benchmarks were missing the - per-entry sample expansion their ``load_data()`` does). Going through - ``load_via_registry`` guarantees HF parquet rows round-trip to the same - shape local-file loading produces. - """ - if benchmark_id in SKIP_BENCHMARKS: - logger.info("Skipping %s (excluded)", benchmark_id) - return [] - bench = registry.get(benchmark_id) - meta = bench.meta - try: data_dir = bench.resolve_data_dir(dataset_root) except FileNotFoundError as exc: @@ -159,13 +130,12 @@ def load_benchmark( t0 = time.time() try: - rows = load_via_registry(registry, benchmark_id, meta, data_dir, dataset_root) + rows = load_via_registry(registry, benchmark_id, bench.meta, data_dir, dataset_root) except Exception as exc: logger.warning("Failed to load %s: %s: %s", benchmark_id, type(exc).__name__, exc) return [] - dt = time.time() - t0 - logger.info("Loaded %s: %d samples (%.1fs)", benchmark_id, len(rows), dt) + logger.info("Loaded %s: %d samples (%.1fs)", benchmark_id, len(rows), time.time() - t0) return rows @@ -201,12 +171,9 @@ def build_dataset(all_rows: List[Dict[str, Any]]): _BENCHMARK_ID_PATTERN = r"^[a-z]+-\d+$" -def _merge_card_configs(api, repo_id: str, new_configs: List[str]) -> List[str]: - """Union new configs with any already-declared configs on the Hub. - - Avoids the footgun where pushing only a subset of benchmarks and then - regenerating the card would delete declarations for the rest. - """ +def _merge_card_configs(repo_id: str, new_configs: List[str]) -> List[str]: + """Union new configs with any already on the Hub, so partial uploads don't + drop existing declarations.""" import re from huggingface_hub import hf_hub_download @@ -402,11 +369,8 @@ def main(): ds.push_to_hub(args.repo_id, config_name=bid, commit_message=f"Upload GDB benchmark: {bid}") - # Merge configs into the existing card rather than replacing, so that a - # partial re-upload (e.g. --benchmarks template-4 template-5) doesn't wipe - # out declarations for the other 37 configs that are still on the Hub. logger.info("Uploading dataset card...") - card_config_names = _merge_card_configs(api, args.repo_id, sorted(per_benchmark.keys())) + card_config_names = _merge_card_configs(args.repo_id, sorted(per_benchmark.keys())) api.upload_file( path_or_fileobj=generate_dataset_card(card_config_names).encode("utf-8"), path_in_repo="README.md", diff --git a/src/gdb/hf.py b/src/gdb/hf.py index 69dc069..be38dfb 100644 --- a/src/gdb/hf.py +++ b/src/gdb/hf.py @@ -1,11 +1,7 @@ """Load benchmark samples from the HuggingFace Hub dataset (lica-world/GDB). -This module provides a drop-in alternative to the local file-based -``load_data()`` path. When ``--dataset-root`` is not provided, the runner -can call ``load_from_hub()`` to fetch data directly from HuggingFace. - -Images are cached to disk so that ``build_model_input()`` gets file paths -it can pass to model APIs, matching the local-file contract. +Used when the runner has no ``--dataset-root``. Images are cached to disk +so ``build_model_input()`` receives file paths, matching the local contract. """ from __future__ import annotations @@ -69,15 +65,12 @@ def load_from_hub( repo_id: str = HF_REPO_ID, cache_dir: Optional[Path] = None, ) -> List[Dict[str, Any]]: - """Load samples for *benchmark_id* from the HuggingFace Hub dataset. - - Returns a list of dicts matching the contract of - ``BaseBenchmark.load_data()`` — at minimum ``sample_id`` and - ``ground_truth``, plus task-specific fields unpacked from the - ``metadata`` column. + """Load samples for *benchmark_id* from the HuggingFace Hub. - Images are saved to *cache_dir* (default ``~/.cache/gdb/images/``) - so downstream code receives file path strings, not PIL objects. + Matches ``BaseBenchmark.load_data()``: returns dicts with ``sample_id``, + ``ground_truth``, plus any fields unpacked from the ``metadata`` column. + Images are cached under *cache_dir* (default ``~/.cache/gdb/images/``) + and surfaced as ``image_path`` strings. """ try: from datasets import load_dataset @@ -101,11 +94,9 @@ def load_from_hub( "prompt": row.get("prompt", ""), } - # Alias prompt into keys that some benchmarks expect sample["question"] = sample["prompt"] sample["description"] = sample["prompt"] - # Unpack task-specific fields from metadata JSON meta_raw = row.get("metadata", "{}") try: extra = json.loads(meta_raw) if meta_raw else {} @@ -116,10 +107,8 @@ def load_from_hub( if k not in sample: sample[k] = v - # Handle image: save PIL to cache, store path. - # Generation-task configs store `image` as an empty string rather than - # a PIL object (the column type is Value("string") when no samples in - # that config have images), so skip anything that isn't a PIL image. + # Generation-only configs type ``image`` as Value("string") and store "", + # so we check for a PIL-like object rather than truthiness. pil_img = row.get("image") has_pil = pil_img is not None and hasattr(pil_img, "save") if has_pil: From 796df1acd24508f35b2f93f0862268b02fd8371b Mon Sep 17 00:00:00 2001 From: Mohit Garg Date: Fri, 24 Apr 2026 19:16:52 +0530 Subject: [PATCH 3/3] lint: separate stdlib from third-party imports in _merge_card_configs Made-with: Cursor --- scripts/upload_to_hf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/upload_to_hf.py b/scripts/upload_to_hf.py index 497101d..d255411 100644 --- a/scripts/upload_to_hf.py +++ b/scripts/upload_to_hf.py @@ -175,6 +175,7 @@ def _merge_card_configs(repo_id: str, new_configs: List[str]) -> List[str]: """Union new configs with any already on the Hub, so partial uploads don't drop existing declarations.""" import re + from huggingface_hub import hf_hub_download try: