chore(fail): removing misplaced file

2026-06-26 12:47:18 +00:00 · 2026-06-24 21:35:39 +02:00
parent 71f5fc0bf8
commit 82b8e56011
1 changed files with 0 additions and 981 deletions
@@ -1,981 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Validate the integrity of a LeRobot v3.0 dataset, given only its repo id.
-
-This is a single-file, read-only health check. It loads the dataset metadata
-(downloading only ``meta/`` from the Hub when needed) and then runs a series of
-independent checks, grouped into clearly delimited sections:
-
-    1. Folder architecture & expected files
-    2. info.json aggregate counts, splits & path templates
-    3. Feature schema & "missing features" (columns present in data)
-    4. Episode-metadata indexing continuity (episodes / frames)
-    5. Per-data-file scan: episode membership, frame counts, frame_index /
-       timestamp monotonicity & continuity, global index uniqueness
-    6. tasks.parquet referential integrity
-    7. stats.json validity
-    8. Video integrity (presence, fps, resolution, timestamp bounds, contiguity)
-    9. End-to-end loadability smoke test (LeRobotDataset[0] / [-1])
-   10. Hub metadata: repo presence, codebase-version revision, discoverability
-       tags, license & README
-
-Each section returns a list of *failures* (hard inconsistencies) and a list of
-*warnings* (suspicious but non-fatal). The script prints a per-section report
-and exits with code 1 if any failure was detected, 0 otherwise.
-
-Sections 1-4, 6, 7 only need ``meta/`` (cheap). Sections 5, 8, 9 read the data
-parquet / video payloads; for files missing locally they are fetched from the
-Hub on demand. Section 10 queries the Hub API for repo metadata. Use the flags
-below to skip the expensive parts.
-
-Usage:
-    # Full check of a Hub dataset (downloads data/videos as needed):
-    python examples/dataset/check_dataset_integrity.py --repo-id lerobot/pusht
-
-    # Local dataset, metadata-only (fast):
-    python examples/dataset/check_dataset_integrity.py \
-        --repo-id lerobot/pusht --root /path/to/pusht --metadata-only
-
-    # Skip the video, smoke-test and Hub sections:
-    python examples/dataset/check_dataset_integrity.py \
-        --repo-id lerobot/pusht --no-videos --no-smoke-test --no-hub
-"""
-
-from __future__ import annotations
-
-import argparse
-import math
-import sys
-from collections import defaultdict
-from dataclasses import dataclass, field
-from pathlib import Path
-
-import numpy as np
-import pyarrow.parquet as pq
-from huggingface_hub import HfFileSystem, hf_hub_download
-
-from lerobot.datasets.dataset_metadata import CODEBASE_VERSION, LeRobotDatasetMetadata
-from lerobot.datasets.utils import (
-    DATA_DIR,
-    EPISODES_DIR,
-    INFO_PATH,
-    STATS_PATH,
-    VIDEO_DIR,
-)
-from lerobot.utils.constants import DEFAULT_FEATURES
-
-# Bookkeeping columns every data parquet file must carry (stored as scalar
-# features alongside the user-defined ones). They double as the columns the
-# frame-level checks rely on.
-BOOKKEEPING_COLUMNS = set(DEFAULT_FEATURES)  # timestamp, frame_index, episode_index, index, task_index
-
-# Default value types considered "numeric" for NaN/Inf and stats sanity checks.
-_FLOAT_DTYPES = {"float16", "float32", "float64"}
-_INT_DTYPES = {"int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "bool"}
-
-
-@dataclass
-class SectionResult:
-    """Outcome of a single check section."""
-
-    name: str
-    failures: list[str] = field(default_factory=list)
-    warnings: list[str] = field(default_factory=list)
-    skipped: bool = False
-    skip_reason: str = ""
-
-
-# ----------------------------------------------------------------------------
-# Small shared helpers
-# ----------------------------------------------------------------------------
-def _episodes_dataframe(meta: LeRobotDatasetMetadata):
-    """Return the per-episode metadata as a pandas DataFrame, sorted by episode_index.
-
-    ``meta.episodes`` is a HuggingFace ``Dataset`` with the ``stats/*`` columns
-    already dropped, so this is metadata-only and cheap.
-    """
-    df = meta.episodes.to_pandas()
-    return df.sort_values("episode_index").reset_index(drop=True)
-
-
-def _read_parquet_columns(meta, rel_path, columns, fs):
-    """Read selected columns of a (possibly remote) parquet file.
-
-    Local files are read directly; otherwise the columns are fetched from the
-    Hub via ``HfFileSystem`` range requests, so only the requested columns are
-    transferred (never the bulk image payload). Returns a pyarrow Table, or
-    ``None`` if the file exists nowhere.
-    """
-    local_path = meta.root / rel_path
-    if local_path.is_file():
-        return pq.read_table(local_path, columns=columns)
-    hf_path = f"datasets/{meta.repo_id}/{rel_path}"
-    if not fs.exists(hf_path, revision=meta.revision):
-        return None
-    with fs.open(hf_path, "rb", revision=meta.revision) as f:
-        return pq.read_table(f, columns=columns)
-
-
-def _read_parquet_schema(meta, rel_path, fs):
-    """Read just the schema (footer) of a (possibly remote) parquet file."""
-    local_path = meta.root / rel_path
-    if local_path.is_file():
-        return pq.read_schema(local_path)
-    hf_path = f"datasets/{meta.repo_id}/{rel_path}"
-    if not fs.exists(hf_path, revision=meta.revision):
-        return None
-    with fs.open(hf_path, "rb", revision=meta.revision) as f:
-        return pq.read_schema(f)
-
-
-def _ensure_local_file(meta, rel_path):
-    """Return a local path to a dataset file, downloading from the Hub if absent.
-
-    Used for videos (which must be a real local file to probe with PyAV).
-    Returns ``None`` if the file cannot be located or downloaded.
-    """
-    local_path = meta.root / rel_path
-    if local_path.is_file():
-        return local_path
-    try:
-        downloaded = hf_hub_download(
-            repo_id=meta.repo_id,
-            repo_type="dataset",
-            filename=rel_path,
-            revision=meta.revision,
-        )
-        return Path(downloaded)
-    except Exception:
-        return None
-
-
-def _feature_height_width(ft):
-    """Best-effort (height, width) extraction from an image/video feature spec."""
-    shape = tuple(ft["shape"])
-    names = ft.get("names")
-    if names and len(names) == len(shape):
-        idx = {n: i for i, n in enumerate(names)}
-        if "height" in idx and "width" in idx:
-            return shape[idx["height"]], shape[idx["width"]]
-    if len(shape) == 3:
-        # Heuristic: channel-first (C, H, W) when first dim is small, else (H, W, C).
-        if shape[0] <= 4:
-            return shape[1], shape[2]
-        return shape[0], shape[1]
-    return None
-
-
-def _parse_splits(splits):
-    """Turn a ``{"train": "0:100", ...}`` dict into the set of covered episodes."""
-    covered = []
-    for spec in splits.values():
-        if isinstance(spec, str) and ":" in spec:
-            start, end = spec.split(":")
-            covered.extend(range(int(start), int(end)))
-    return covered
-
-
-def _group_episodes_by_data_file(df):
-    """Bucket episode rows by the (chunk, file) data parquet they live in."""
-    buckets = defaultdict(list)
-    for _, ep in df.iterrows():
-        key = (int(ep["data/chunk_index"]), int(ep["data/file_index"]))
-        buckets[key].append(ep)
-    return buckets
-
-
-# ============================================================================
-# SECTION 1 - Folder architecture & expected files
-# ----------------------------------------------------------------------------
-# Verify the canonical v3.0 layout exists: the required meta/ files, and the
-# data/ (and videos/ when the dataset has video features) directories with at
-# least one chunk/file. Only checks local presence; remote-only datasets that
-# were just metadata-downloaded will legitimately have no local data/ yet, so
-# missing data/video dirs are warnings rather than failures here (Section 5/8
-# resolve them against the Hub).
-# ============================================================================
-def check_folder_architecture(meta) -> SectionResult:
-    res = SectionResult("1. Folder architecture & expected files")
-    root = meta.root
-
-    # Required metadata files (always pulled with meta/).
-    for rel in (INFO_PATH, STATS_PATH, "meta/tasks.parquet"):
-        if not (root / rel).is_file():
-            res.failures.append(f"missing required metadata file: {rel}")
-
-    # At least one episode-metadata parquet under meta/episodes/.
-    episode_meta_files = list((root / EPISODES_DIR).glob("**/*.parquet"))
-    if not episode_meta_files:
-        res.failures.append(f"no episode metadata parquet found under {EPISODES_DIR}/")
-
-    # data/ directory: warn (not fail) when absent locally, since metadata-only
-    # snapshots are valid and Section 5 resolves data against the Hub.
-    data_dir = root / DATA_DIR
-    if not data_dir.is_dir():
-        res.warnings.append(f"no local '{DATA_DIR}/' directory (will resolve files from the Hub)")
-    elif not list(data_dir.glob("chunk-*/file-*.parquet")):
-        res.warnings.append(f"'{DATA_DIR}/' present but contains no chunk-*/file-*.parquet locally")
-
-    # videos/ only expected when the dataset declares video features.
-    if meta.video_keys:
-        video_dir = root / VIDEO_DIR
-        if not video_dir.is_dir():
-            res.warnings.append(f"no local '{VIDEO_DIR}/' directory but dataset has video keys")
-        else:
-            for key in meta.video_keys:
-                if not (video_dir / key).is_dir():
-                    res.warnings.append(f"no local video directory for video key {key!r}")
-
-    return res
-
-
-# ============================================================================
-# SECTION 2 - info.json aggregate counts, splits & path templates
-# ----------------------------------------------------------------------------
-# Cross-check the global counters every other consumer trusts: total_episodes /
-# total_frames / total_tasks against the actual metadata, that the splits cover
-# exactly [0, total_episodes), and that the path templates carry the expected
-# placeholders. fps/chunk sizes are validated by DatasetInfo on load, so we only
-# surface the codebase version here.
-# ============================================================================
-def check_info_consistency(meta, df) -> SectionResult:
-    res = SectionResult("2. info.json counts, splits & templates")
-
-    # Codebase version (load already raised on hard-incompatible versions).
-    if meta.info.codebase_version != CODEBASE_VERSION:
-        res.warnings.append(
-            f"info.codebase_version={meta.info.codebase_version!r} != script target {CODEBASE_VERSION!r}"
-        )
-
-    # total_episodes vs number of episode rows and max(episode_index)+1.
-    n_rows = len(df)
-    if meta.total_episodes != n_rows:
-        res.failures.append(f"info.total_episodes={meta.total_episodes} but found {n_rows} episode rows")
-    if n_rows > 0:
-        max_idx = int(df["episode_index"].max())
-        if max_idx + 1 != n_rows:
-            res.failures.append(f"episode_index range [0, {max_idx}] inconsistent with {n_rows} episode rows")
-
-    # total_frames vs sum of per-episode lengths.
-    sum_len = int(df["length"].sum()) if n_rows > 0 else 0
-    if meta.total_frames != sum_len:
-        res.failures.append(f"info.total_frames={meta.total_frames} but sum(length)={sum_len}")
-
-    # total_tasks vs tasks.parquet row count.
-    n_tasks = len(meta.tasks)
-    if meta.total_tasks != n_tasks:
-        res.failures.append(f"info.total_tasks={meta.total_tasks} but tasks.parquet has {n_tasks} rows")
-
-    # Splits must cover exactly [0, total_episodes) with no gaps/overlaps.
-    if meta.info.splits:
-        covered = sorted(_parse_splits(meta.info.splits))
-        expected = list(range(meta.total_episodes))
-        if covered != expected:
-            res.failures.append(
-                f"splits {meta.info.splits} do not cover exactly [0, {meta.total_episodes}) "
-                f"(covered {len(covered)} unique episode(s))"
-            )
-
-    # Path templates must contain the placeholders the readers format against.
-    if not ("{chunk_index" in meta.data_path and "{file_index" in meta.data_path):
-        res.failures.append(f"data_path template missing chunk/file placeholders: {meta.data_path!r}")
-    if meta.video_keys:
-        vp = meta.video_path or ""
-        if not all(tok in vp for tok in ("{video_key", "{chunk_index", "{file_index")):
-            res.failures.append(f"video_path template missing placeholders: {meta.video_path!r}")
-
-    return res
-
-
-# ============================================================================
-# SECTION 3 - Feature schema & "missing features"
-# ----------------------------------------------------------------------------
-# Validate the features dict structurally (dtype/shape/names coherence) and
-# confirm the first data file actually carries a column for every non-video
-# feature (image features are embedded in the parquet; video features live in
-# mp4 files and are intentionally absent from the parquet columns).
-# ============================================================================
-def check_feature_schema(meta, df, fs, scan_data) -> SectionResult:
-    res = SectionResult("3. Feature schema & missing features")
-    video_keys = set(meta.video_keys)
-
-    # 3a. Structural validation of each feature spec.
-    for key, ft in meta.features.items():
-        if "dtype" not in ft or "shape" not in ft:
-            res.failures.append(f"feature {key!r} missing 'dtype' or 'shape'")
-            continue
-        shape = tuple(ft["shape"])
-        names = ft.get("names")
-        # Vector features: names length must match the (1-D) shape.
-        if names is not None and isinstance(names, list) and len(shape) == 1 and len(names) != shape[0]:
-            res.failures.append(f"feature {key!r}: len(names)={len(names)} != shape[0]={shape[0]}")
-        # Image/video features should be 3-dimensional.
-        if ft["dtype"] in ("image", "video") and len(shape) != 3:
-            res.failures.append(f"visual feature {key!r} has non-3D shape {shape}")
-
-    # 3b. Column presence in the data parquet (needs to read one file footer).
-    if not scan_data:
-        res.warnings.append("data scan disabled: skipped data-column presence check")
-        return res
-
-    if len(df) == 0:
-        return res
-
-    first = df.iloc[0]
-    rel = meta.data_path.format(
-        chunk_index=int(first["data/chunk_index"]), file_index=int(first["data/file_index"])
-    )
-    schema = _read_parquet_schema(meta, rel, fs)
-    if schema is None:
-        res.warnings.append(f"could not read schema of first data file {rel} to check columns")
-        return res
-
-    data_columns = set(schema.names)
-    expected = {k for k in meta.features if k not in video_keys}
-    missing = expected - data_columns
-    if missing:
-        res.failures.append(f"data file {rel} missing feature columns: {sorted(missing)}")
-    # Columns present in data but neither a feature nor bookkeeping -> warn.
-    extra = data_columns - set(meta.features) - BOOKKEEPING_COLUMNS
-    if extra:
-        res.warnings.append(f"data file {rel} has unexpected columns: {sorted(extra)}")
-
-    return res
-
-
-# ============================================================================
-# SECTION 4 - Episode-metadata indexing continuity
-# ----------------------------------------------------------------------------
-# Independently of any data file, the per-episode metadata must describe a
-# contiguous, gap-free indexing of frames into the global frame index:
-#   * episode_index == 0, 1, 2, ... in order;
-#   * the first episode starts at dataset_from_index == 0;
-#   * dataset_to_index - dataset_from_index == length for each episode;
-#   * each dataset_from_index equals the previous dataset_to_index (no gaps);
-#   * the last dataset_to_index equals info.total_frames.
-# This is the "missing episodes or frames according to metadata" check.
-# ============================================================================
-def check_episode_continuity(meta, df) -> SectionResult:
-    res = SectionResult("4. Episode-metadata indexing continuity")
-    prev_to = 0
-    for expected_idx, (_, row) in enumerate(df.iterrows()):
-        ep_idx = int(row["episode_index"])
-        if ep_idx != expected_idx:
-            res.failures.append(f"episode_index not contiguous: expected {expected_idx}, found {ep_idx}")
-
-        d_from = int(row["dataset_from_index"])
-        d_to = int(row["dataset_to_index"])
-        length = int(row["length"])
-
-        if d_from != prev_to:
-            ref = f"episode {expected_idx - 1} dataset_to_index" if expected_idx > 0 else "start (0)"
-            res.failures.append(
-                f"episode {ep_idx}: dataset_from_index={d_from} does not match {ref}={prev_to}"
-            )
-        if d_to - d_from != length:
-            res.failures.append(
-                f"episode {ep_idx}: dataset_to_index - dataset_from_index = {d_to - d_from} but length = {length}"
-            )
-        if length <= 0:
-            res.failures.append(f"episode {ep_idx}: non-positive length {length}")
-
-        prev_to = d_to
-
-    if len(df) > 0 and prev_to != meta.total_frames:
-        res.failures.append(
-            f"last dataset_to_index={prev_to} does not match info.total_frames={meta.total_frames}"
-        )
-    return res
-
-
-# ============================================================================
-# SECTION 5 - Per-data-file scan
-# ----------------------------------------------------------------------------
-# For each data parquet file referenced by the metadata, read the bookkeeping
-# columns (episode_index, frame_index, timestamp, index) and validate:
-#   * "missing data files": the file resolves locally or on the Hub;
-#   * episode membership: the set of episode_index values in the file matches
-#     the set the metadata assigns to it, and the row count matches sum(length);
-#   * frame_index per episode runs exactly 0..length-1 (monotonic + continuous);
-#   * timestamp == frame_index / fps within tolerance;
-#   * the global "index" column is a contiguous 0..total_frames-1 with no
-#     duplicates across files (cross-file uniqueness);
-#   * each episode_index appears in exactly one data file.
-# ============================================================================
-def check_data_files(meta, df, fs, fps_tol_s) -> SectionResult:
-    res = SectionResult("5. Per-data-file scan (membership, frames, monotonicity)")
-    fps = meta.fps
-    buckets = _group_episodes_by_data_file(df)
-
-    seen_global_index = set()
-    episode_to_file = {}
-    duplicate_index_count = 0
-
-    # Per-episode lengths from metadata for cross-checking.
-    meta_len = {int(r["episode_index"]): int(r["length"]) for _, r in df.iterrows()}
-
-    for (chunk_idx, file_idx), eps in sorted(buckets.items()):
-        rel = meta.data_path.format(chunk_index=chunk_idx, file_index=file_idx)
-        meta_eps = {int(e["episode_index"]) for e in eps}
-        meta_frames = sum(int(e["length"]) for e in eps)
-
-        # Cross-file uniqueness of episodes (an episode must live in one file).
-        for ep in meta_eps:
-            if ep in episode_to_file:
-                res.failures.append(
-                    f"episode {ep} assigned to multiple data files: {episode_to_file[ep]} and ({chunk_idx},{file_idx})"
-                )
-            else:
-                episode_to_file[ep] = (chunk_idx, file_idx)
-
-        try:
-            table = _read_parquet_columns(
-                meta, rel, ["episode_index", "frame_index", "timestamp", "index"], fs
-            )
-        except Exception as exc:
-            res.failures.append(f"[chunk={chunk_idx:03d} file={file_idx:03d}] failed to read {rel}: {exc}")
-            continue
-        if table is None:
-            res.failures.append(f"[chunk={chunk_idx:03d} file={file_idx:03d}] missing data file: {rel}")
-            continue
-
-        cols = table.to_pydict()
-        ep_col = cols["episode_index"]
-        frame_col = cols["frame_index"]
-        ts_col = cols["timestamp"]
-        idx_col = cols["index"]
-        data_eps = {int(v) for v in ep_col}
-
-        # Episode membership: metadata set vs data set.
-        missing = meta_eps - data_eps
-        unexpected = data_eps - meta_eps
-        if missing:
-            res.failures.append(f"{rel}: episodes in metadata but absent from data: {sorted(missing)}")
-        if unexpected:
-            res.failures.append(f"{rel}: episodes in data but not in metadata: {sorted(unexpected)}")
-        if not missing and not unexpected and len(ep_col) != meta_frames:
-            res.failures.append(f"{rel}: data rows={len(ep_col)} vs metadata sum(length)={meta_frames}")
-
-        # Per-episode frame_index/timestamp checks + global index collection.
-        per_ep_frames = defaultdict(list)
-        for ep_v, fr_v, ts_v, ix_v in zip(ep_col, frame_col, ts_col, idx_col, strict=True):
-            per_ep_frames[int(ep_v)].append((int(fr_v), float(ts_v)))
-            ix = int(ix_v)
-            if ix in seen_global_index:
-                duplicate_index_count += 1
-            else:
-                seen_global_index.add(ix)
-
-        for ep, frames in per_ep_frames.items():
-            frames.sort(key=lambda p: p[0])
-            expected_len = meta_len.get(ep)
-            # frame_index must be exactly 0..len-1.
-            frame_indices = [f for f, _ in frames]
-            if frame_indices != list(range(len(frames))):
-                res.failures.append(f"{rel}: episode {ep} frame_index not contiguous 0..{len(frames) - 1}")
-            elif expected_len is not None and len(frames) != expected_len:
-                res.failures.append(
-                    f"{rel}: episode {ep} has {len(frames)} frames but metadata length={expected_len}"
-                )
-            # timestamp == frame_index / fps within tolerance.
-            for f_i, ts in frames:
-                if not math.isfinite(ts) or abs(ts - f_i / fps) > fps_tol_s:
-                    res.failures.append(
-                        f"{rel}: episode {ep} frame {f_i} timestamp={ts:.6f} != {f_i / fps:.6f} (1/fps grid)"
-                    )
-                    break
-
-    # Global index sanity across all files.
-    if duplicate_index_count:
-        res.failures.append(
-            f"found {duplicate_index_count} duplicated global 'index' value(s) across data files"
-        )
-    if seen_global_index:
-        expected_index = set(range(meta.total_frames))
-        if seen_global_index != expected_index:
-            missing_n = len(expected_index - seen_global_index)
-            extra_n = len(seen_global_index - expected_index)
-            res.failures.append(
-                f"global 'index' is not a contiguous 0..{meta.total_frames - 1} "
-                f"(missing {missing_n}, unexpected {extra_n})"
-            )
-
-    return res
-
-
-# ============================================================================
-# SECTION 6 - tasks.parquet referential integrity
-# ----------------------------------------------------------------------------
-# tasks.parquet maps a task string to a task_index. Validate that task indices
-# are a contiguous 0..total_tasks-1 with no duplicates (indices or strings), and
-# that every task referenced by an episode exists. Tasks never referenced by any
-# episode are reported as warnings (orphans).
-# ============================================================================
-def check_tasks(meta, df) -> SectionResult:
-    res = SectionResult("6. tasks.parquet referential integrity")
-    tasks = meta.tasks  # index = task string, column 'task_index'
-
-    indices = sorted(int(i) for i in tasks["task_index"].tolist())
-    if indices != list(range(len(indices))):
-        res.failures.append(f"task_index values are not a contiguous 0..{len(indices) - 1}: {indices[:20]}")
-
-    # Duplicate task strings (the index of the tasks frame).
-    task_strings = list(tasks.index)
-    if len(set(task_strings)) != len(task_strings):
-        res.failures.append("duplicate task strings found in tasks.parquet")
-
-    # Referential integrity: every task named by an episode must exist.
-    known_tasks = set(task_strings)
-    referenced = set()
-    if "tasks" in df.columns:
-        for _, row in df.iterrows():
-            ep_tasks = row["tasks"]
-            if ep_tasks is None:
-                continue
-            for t in list(ep_tasks):
-                referenced.add(t)
-                if t not in known_tasks:
-                    res.failures.append(f"episode {int(row['episode_index'])} references unknown task {t!r}")
-
-    # Orphan tasks (declared but never used) -> warning.
-    orphans = known_tasks - referenced
-    if orphans and referenced:
-        sample = sorted(orphans)[:10]
-        res.warnings.append(f"{len(orphans)} task(s) never referenced by any episode, e.g. {sample}")
-
-    return res
-
-
-# ============================================================================
-# SECTION 7 - stats.json validity
-# ----------------------------------------------------------------------------
-# stats.json holds per-feature min/max/mean/std/count used for normalization.
-# Validate that each entry has the expected sub-keys, that min <= mean <= max
-# element-wise, std >= 0, no NaN/Inf, and that the stored shapes are consistent.
-# Missing stats for a feature is a warning (some auxiliary features carry none);
-# a stats key that is not a feature is a failure.
-# ============================================================================
-def check_stats(meta) -> SectionResult:
-    res = SectionResult("7. stats.json validity")
-    if meta.stats is None:
-        res.skipped = True
-        res.skip_reason = "no stats.json present"
-        return res
-
-    feature_keys = set(meta.features)
-    for key, stat in meta.stats.items():
-        if key not in feature_keys:
-            res.failures.append(f"stats key {key!r} is not a declared feature")
-            continue
-
-        for sub in ("min", "max", "mean", "std", "count"):
-            if sub not in stat:
-                res.failures.append(f"stats[{key!r}] missing '{sub}'")
-        if any(sub not in stat for sub in ("min", "max", "mean", "std")):
-            continue
-
-        mn = np.asarray(stat["min"], dtype=np.float64)
-        mx = np.asarray(stat["max"], dtype=np.float64)
-        mean = np.asarray(stat["mean"], dtype=np.float64)
-        std = np.asarray(stat["std"], dtype=np.float64)
-
-        # No NaN / Inf anywhere.
-        for sub, arr in (("min", mn), ("max", mx), ("mean", mean), ("std", std)):
-            if not np.all(np.isfinite(arr)):
-                res.failures.append(f"stats[{key!r}]['{sub}'] contains NaN/Inf")
-
-        # Ordering and non-negative std.
-        if np.any(mn > mx + 1e-6):
-            res.failures.append(f"stats[{key!r}]: min > max somewhere")
-        if np.any(mean < mn - 1e-6) or np.any(mean > mx + 1e-6):
-            res.failures.append(f"stats[{key!r}]: mean outside [min, max] somewhere")
-        if np.any(std < -1e-6):
-            res.failures.append(f"stats[{key!r}]: negative std somewhere")
-
-        # count consistency (warning: image stats can be sub-sampled).
-        if "count" in stat:
-            count = int(np.asarray(stat["count"]).reshape(-1)[0])
-            if count != meta.total_frames:
-                res.warnings.append(f"stats[{key!r}]['count']={count} != total_frames={meta.total_frames}")
-
-    # Warn about features lacking any stats entry.
-    missing_stats = feature_keys - set(meta.stats)
-    if missing_stats:
-        res.warnings.append(f"features without stats: {sorted(missing_stats)}")
-
-    return res
-
-
-# ============================================================================
-# SECTION 8 - Video integrity
-# ----------------------------------------------------------------------------
-# For datasets with video features, verify each referenced mp4 (per video key)
-# is present (locally or on the Hub), decodable, and consistent with metadata:
-#   * "missing video files": the file resolves;
-#   * container fps == info.fps;
-#   * width/height match the feature shape;
-#   * every episode's [from_timestamp, to_timestamp] lies within the video
-#     duration, with to > from;
-#   * the per-file episode segments are non-overlapping (timeline contiguity);
-#   * (to - from) * fps is close to the episode length.
-# ============================================================================
-def check_videos(meta, df) -> SectionResult:
-    res = SectionResult("8. Video integrity")
-    if not meta.video_keys:
-        res.skipped = True
-        res.skip_reason = "dataset has no video features"
-        return res
-
-    from lerobot.datasets.video_utils import get_video_duration_in_s, get_video_info
-
-    fps = meta.fps
-    meta_len = {int(r["episode_index"]): int(r["length"]) for _, r in df.iterrows()}
-
-    for vid_key in meta.video_keys:
-        ft = meta.features[vid_key]
-        hw = _feature_height_width(ft)
-
-        # Bucket episodes by the video file they reference for this key.
-        file_to_eps = defaultdict(list)
-        for _, row in df.iterrows():
-            chunk = int(row[f"videos/{vid_key}/chunk_index"])
-            file_ = int(row[f"videos/{vid_key}/file_index"])
-            file_to_eps[(chunk, file_)].append(row)
-
-        for (chunk, file_), eps in sorted(file_to_eps.items()):
-            rel = meta.video_path.format(video_key=vid_key, chunk_index=chunk, file_index=file_)
-            path = _ensure_local_file(meta, rel)
-            if path is None:
-                res.failures.append(f"missing video file: {rel}")
-                continue
-
-            try:
-                info = get_video_info(path)
-                duration = get_video_duration_in_s(path)
-            except Exception as exc:
-                res.failures.append(f"{rel}: not decodable ({exc})")
-                continue
-
-            # fps consistency.
-            vfps = info.get("video.fps")
-            if vfps is not None and int(vfps) != int(fps):
-                res.failures.append(f"{rel}: video fps={vfps} != info.fps={fps}")
-
-            # Resolution consistency.
-            if hw is not None:
-                vh, vw = info.get("video.height"), info.get("video.width")
-                if vh is not None and vw is not None and (int(vh), int(vw)) != (int(hw[0]), int(hw[1])):
-                    res.failures.append(f"{rel}: video resolution {vh}x{vw} != feature {hw[0]}x{hw[1]} (HxW)")
-
-            # Timestamp bounds + contiguity within this video file.
-            segments = []
-            for row in eps:
-                ep = int(row["episode_index"])
-                t_from = float(row[f"videos/{vid_key}/from_timestamp"])
-                t_to = float(row[f"videos/{vid_key}/to_timestamp"])
-                segments.append((t_from, t_to, ep))
-
-                if t_from < -1e-6 or t_to <= t_from:
-                    res.failures.append(f"{rel}: episode {ep} invalid timestamps [{t_from}, {t_to}]")
-                if t_to > duration + 1.0 / fps:
-                    res.failures.append(
-                        f"{rel}: episode {ep} to_timestamp={t_to:.3f}s exceeds video duration={duration:.3f}s"
-                    )
-                # (to - from) * fps should be ~ episode length.
-                expected_len = meta_len.get(ep)
-                if expected_len is not None:
-                    n = round((t_to - t_from) * fps)
-                    if abs(n - expected_len) > 1:
-                        res.warnings.append(
-                            f"{rel}: episode {ep} (to-from)*fps={n} differs from length={expected_len}"
-                        )
-
-            # Non-overlapping segments along the timeline.
-            segments.sort()
-            for (a_from, a_to, a_ep), (b_from, b_to, b_ep) in zip(segments, segments[1:], strict=False):
-                if b_from < a_to - 1e-6:
-                    res.failures.append(
-                        f"{rel}: episode {a_ep} [{a_from:.3f},{a_to:.3f}] overlaps episode {b_ep} "
-                        f"[{b_from:.3f},{b_to:.3f}]"
-                    )
-
-    return res
-
-
-# ============================================================================
-# SECTION 9 - End-to-end loadability smoke test
-# ----------------------------------------------------------------------------
-# Final sanity: construct a LeRobotDataset and fetch the first and last frames.
-# This exercises the full read path (parquet + video decoding + delta-timestamp
-# querying) and confirms the returned items expose every declared feature key
-# with the expected shape. Failures here usually mean the lower-level checks
-# missed something or a payload is corrupt.
-# ============================================================================
-def check_smoke_test(meta, root) -> SectionResult:
-    res = SectionResult("9. End-to-end loadability smoke test")
-    try:
-        from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-        ds = LeRobotDataset(meta.repo_id, root=root, revision=meta.revision)
-    except Exception as exc:
-        res.failures.append(f"LeRobotDataset failed to construct: {exc}")
-        return res
-
-    # Length must match total_frames.
-    if len(ds) != meta.total_frames:
-        res.failures.append(f"len(dataset)={len(ds)} != info.total_frames={meta.total_frames}")
-    if len(ds) == 0:
-        return res
-
-    expected_keys = set(meta.features)
-    for idx in {0, len(ds) - 1}:
-        try:
-            item = ds[idx]
-        except Exception as exc:
-            res.failures.append(f"dataset[{idx}] raised: {exc}")
-            continue
-        missing = expected_keys - set(item)
-        if missing:
-            res.failures.append(f"dataset[{idx}] missing keys: {sorted(missing)}")
-
-    return res
-
-
-# ============================================================================
-# SECTION 10 - Hugging Face Hub metadata, version tag & discoverability tags
-# ----------------------------------------------------------------------------
-# Independently of the payload, verify the dataset is properly published on the
-# Hub and discoverable:
-#   * the repo exists on the Hub;
-#   * a version branch/tag matching the codebase version (e.g. ``v3.0``) exists,
-#     so consumers can pin the revision they load;
-#   * the auto-generated / declared discoverability tags are present
-#     (task_categories:robotics, the custom ``LeRobot`` tag, modality:tabular /
-#     timeseries / video, format:parquet, size_categories:*);
-#   * a license is declared;
-#   * a README.md (dataset card) is present.
-# Missing repo / version are failures; missing tags / license / README are
-# warnings (they hurt discoverability but not loadability).
-# ============================================================================
-def check_hub_metadata(meta) -> SectionResult:
-    res = SectionResult("10. Hugging Face Hub metadata & tags")
-
-    import packaging.version
-    from huggingface_hub import HfApi
-    from huggingface_hub.errors import RepositoryNotFoundError
-
-    from lerobot.datasets.utils import get_repo_versions
-
-    api = HfApi()
-    repo_id = meta.repo_id
-
-    # Is the dataset published on the Hub?
-    try:
-        info = api.dataset_info(repo_id)
-    except RepositoryNotFoundError:
-        res.failures.append(f"dataset {repo_id!r} not found on the Hugging Face Hub")
-        return res
-    except Exception as exc:
-        res.warnings.append(f"could not query the Hub for {repo_id!r}: {exc}")
-        return res
-
-    # A version branch/tag matching the codebase version must exist.
-    try:
-        target = packaging.version.parse(CODEBASE_VERSION)
-        versions = get_repo_versions(repo_id)
-        if target not in versions:
-            found = sorted(f"v{v}" for v in versions) or "none"
-            res.failures.append(f"no {CODEBASE_VERSION} version branch/tag on the Hub (found: {found})")
-    except Exception as exc:
-        res.warnings.append(f"could not list repo version refs: {exc}")
-
-    # Discoverability tags (auto-generated by the Hub + declared on the card).
-    tags = set(info.tags or [])
-    expected_tags = ["task_categories:robotics", "LeRobot", "format:parquet", "modality:tabular", "modality:timeseries"]
-    if meta.video_keys:
-        expected_tags.append("modality:video")
-    for tag in expected_tags:
-        if tag not in tags:
-            res.warnings.append(f"missing expected Hub tag: {tag}")
-    if not any(t.startswith("size_categories:") for t in tags):
-        res.warnings.append("missing size category tag (size_categories:*)")
-
-    # License (declared on the card or surfaced as a license:* tag).
-    card_data = info.card_data
-    has_license = bool(getattr(card_data, "license", None)) or any(t.startswith("license:") for t in tags)
-    if not has_license:
-        res.warnings.append("no license declared on the Hub")
-
-    # README / dataset card.
-    siblings = {s.rfilename for s in (info.siblings or [])}
-    if "README.md" not in siblings:
-        res.warnings.append("no README.md (dataset card) on the Hub")
-
-    return res
-
-
-# ----------------------------------------------------------------------------
-# Orchestration & reporting
-# ----------------------------------------------------------------------------
-def run_all_checks(
-    repo_id,
-    root=None,
-    revision=None,
-    scan_data=True,
-    check_video=True,
-    smoke_test=True,
-    check_hub=True,
-    fps_tol_s=1e-3,
-):
-    """Load metadata and run every section, returning the list of SectionResults."""
-    meta = LeRobotDatasetMetadata(repo_id, root=root, revision=revision)
-    print(
-        f"Loaded metadata for {repo_id!r}: {meta.total_episodes} episodes, "
-        f"{meta.total_frames} frames, {meta.total_tasks} tasks, "
-        f"{len(meta.video_keys)} video key(s).\n"
-    )
-
-    df = _episodes_dataframe(meta)
-    fs = HfFileSystem()
-
-    results: list[SectionResult] = []
-
-    # Metadata-only sections (cheap).
-    results.append(check_folder_architecture(meta))
-    results.append(check_info_consistency(meta, df))
-    results.append(check_feature_schema(meta, df, fs, scan_data))
-    results.append(check_episode_continuity(meta, df))
-
-    # Data-payload section.
-    if scan_data:
-        results.append(check_data_files(meta, df, fs, fps_tol_s))
-    else:
-        results.append(
-            SectionResult("5. Per-data-file scan", skipped=True, skip_reason="--metadata-only / --no-data")
-        )
-
-    results.append(check_tasks(meta, df))
-    results.append(check_stats(meta))
-
-    # Video section.
-    if check_video:
-        results.append(check_videos(meta, df))
-    else:
-        results.append(SectionResult("8. Video integrity", skipped=True, skip_reason="--no-videos"))
-
-    # Smoke test.
-    if smoke_test:
-        results.append(check_smoke_test(meta, root))
-    else:
-        results.append(
-            SectionResult("9. End-to-end loadability smoke test", skipped=True, skip_reason="--no-smoke-test")
-        )
-
-    # Hub metadata section (needs network).
-    if check_hub:
-        results.append(check_hub_metadata(meta))
-    else:
-        results.append(
-            SectionResult("10. Hugging Face Hub metadata & tags", skipped=True, skip_reason="--no-hub")
-        )
-
-    return results
-
-
-def print_report(results) -> int:
-    """Print a per-section report and return the total number of failures."""
-    total_failures = 0
-    total_warnings = 0
-
-    for res in results:
-        print("=" * 78)
-        if res.skipped:
-            print(f"{res.name}: SKIPPED ({res.skip_reason})")
-            continue
-
-        status = "OK" if not res.failures else f"FAILED ({len(res.failures)})"
-        print(f"{res.name}: {status}")
-        for f in res.failures:
-            print(f"  [FAIL] {f}")
-        for w in res.warnings:
-            print(f"  [warn] {w}")
-
-        total_failures += len(res.failures)
-        total_warnings += len(res.warnings)
-
-    print("=" * 78)
-    if total_failures:
-        print(f"RESULT: FAILED - {total_failures} failure(s), {total_warnings} warning(s).")
-    else:
-        print(f"RESULT: OK - 0 failures, {total_warnings} warning(s).")
-    return total_failures
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    parser.add_argument(
-        "--repo-id", required=True, help="Hugging Face dataset repo id (e.g. 'lerobot/pusht')."
-    )
-    parser.add_argument("--root", default=None, help="Optional local dataset root.")
-    parser.add_argument("--revision", default=None, help="Optional git revision (branch, tag, or commit).")
-    parser.add_argument(
-        "--metadata-only",
-        action="store_true",
-        help="Only run metadata sections (skip data scan, videos, and smoke test).",
-    )
-    parser.add_argument("--no-data", action="store_true", help="Skip the per-data-file scan (Section 5).")
-    parser.add_argument(
-        "--no-videos", action="store_true", help="Skip the video integrity section (Section 8)."
-    )
-    parser.add_argument(
-        "--no-smoke-test", action="store_true", help="Skip the end-to-end loadability smoke test (Section 9)."
-    )
-    parser.add_argument(
-        "--no-hub", action="store_true", help="Skip the Hugging Face Hub metadata & tags section (Section 10)."
-    )
-    parser.add_argument(
-        "--timestamp-tol",
-        type=float,
-        default=1e-3,
-        help="Tolerance (seconds) for the timestamp == frame_index / fps check.",
-    )
-    args = parser.parse_args()
-
-    scan_data = not (args.metadata_only or args.no_data)
-    check_video = not (args.metadata_only or args.no_videos)
-    smoke_test = not (args.metadata_only or args.no_smoke_test)
-    check_hub = not args.no_hub
-
-    results = run_all_checks(
-        repo_id=args.repo_id,
-        root=args.root,
-        revision=args.revision,
-        scan_data=scan_data,
-        check_video=check_video,
-        smoke_test=smoke_test,
-        check_hub=check_hub,
-        fps_tol_s=args.timestamp_tol,
-    )
-    failures = print_report(results)
-    return 1 if failures else 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())