From bdfe8f8ce9f7ce5c7112446931553e954723ef72 Mon Sep 17 00:00:00 2001 From: Pepijn Date: Wed, 17 Jun 2026 20:22:04 +0200 Subject: [PATCH] Use full MP4 sidecar for episode cache benchmark --- scripts/bench_episode_byte_cache.py | 20 +++++++++----------- scripts/build_mp4_sidecar.py | 7 ++----- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/scripts/bench_episode_byte_cache.py b/scripts/bench_episode_byte_cache.py index 5040abd51..3fce34850 100644 --- a/scripts/bench_episode_byte_cache.py +++ b/scripts/bench_episode_byte_cache.py @@ -38,6 +38,7 @@ DEFAULT_REPO = "allenai/MolmoAct2-BimanualYAM-Dataset" DEFAULT_REVISION = "e9f21ae15074330839f2ac25ed4b49d76dfa1f9c" DEFAULT_DATA_ROOT = "hf://buckets/pepijn223/MolmoAct2-BimanualYAM-Dataset-bucket" SIDECAR_CACHE_DIR = Path(tempfile.gettempdir()) / "lerobot-sidecars" +FULL_SIDECAR_NAME = "molmoact2-full.npz" def parse_args() -> argparse.Namespace: @@ -162,16 +163,15 @@ def _root_join(data_root: str, relative_path: str) -> str: def _find_or_download_sidecar(data_root: str, manifest_episode_count: int) -> Path | None: - local = SIDECAR_CACHE_DIR / f"molmoact2-{manifest_episode_count}.npz" + _ = manifest_episode_count + local = SIDECAR_CACHE_DIR / FULL_SIDECAR_NAME if _valid_sidecar(local): return local if local.exists(): print(f"mp4_sidecar_invalid_local: {local}") local.unlink() - full_local = SIDECAR_CACHE_DIR / "molmoact2-full.npz" - if _valid_sidecar(full_local): - return full_local - remote = _root_join(data_root, f"meta/mp4-sidecars/molmoact2-{manifest_episode_count}.npz") + remote_relative = f"meta/mp4-sidecars/{FULL_SIDECAR_NAME}" + remote = _root_join(data_root, remote_relative) protocol = "hf" if data_root.startswith("hf://") else "file" fs = fsspec.filesystem(protocol) if not fs.exists(remote): @@ -179,9 +179,7 @@ def _find_or_download_sidecar(data_root: str, manifest_episode_count: int) -> Pa local.parent.mkdir(parents=True, exist_ok=True) print(f"downloading_mp4_sidecar: {remote} -> {local}") if data_root.startswith("hf://"): - _download_sidecar_native_http( - data_root, f"meta/mp4-sidecars/molmoact2-{manifest_episode_count}.npz", local - ) + _download_sidecar_native_http(data_root, remote_relative, local) else: fs.get(remote, str(local)) return local @@ -686,13 +684,13 @@ def main() -> None: ) return if args.strategy == "both": - expected_sidecar = SIDECAR_CACHE_DIR / f"molmoact2-{manifest_episode_count}.npz" - expected_remote = _root_join(data_root, f"meta/mp4-sidecars/molmoact2-{manifest_episode_count}.npz") + expected_sidecar = SIDECAR_CACHE_DIR / FULL_SIDECAR_NAME + expected_remote = _root_join(data_root, f"meta/mp4-sidecars/{FULL_SIDECAR_NAME}") print(f"mp4_sidecar_missing_local: {expected_sidecar}") print(f"mp4_sidecar_missing_remote: {expected_remote}") print( "build_mp4_sidecar: " - f"uv run --no-sync python scripts/build_mp4_sidecar.py --episodes {manifest_episode_count} " + "uv run --no-sync python scripts/build_mp4_sidecar.py " f"--workers {args.workers} --range-backend native-http --output {expected_sidecar}" ) print("running_without_mp4_sidecar: indexed variants will build MP4 indexes online") diff --git a/scripts/build_mp4_sidecar.py b/scripts/build_mp4_sidecar.py index ef6d77ff0..3fcb9ed8f 100644 --- a/scripts/build_mp4_sidecar.py +++ b/scripts/build_mp4_sidecar.py @@ -41,7 +41,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def push_sidecar(local_path: str, data_root: str, episode_count: int) -> list[str]: +def push_sidecar(local_path: str, data_root: str) -> list[str]: if not data_root.startswith("hf://"): return [] @@ -49,9 +49,6 @@ def push_sidecar(local_path: str, data_root: str, episode_count: int) -> list[st fs = fsspec.filesystem("hf") remote_dir = f"{data_root.rstrip('/')}/meta/mp4-sidecars" remote_paths = [f"{remote_dir}/{local.name}"] - alias = f"{remote_dir}/molmoact2-{episode_count}.npz" - if alias not in remote_paths: - remote_paths.append(alias) for remote in remote_paths: fs.put(str(local), remote) @@ -87,7 +84,7 @@ def main() -> None: if args.no_push: print("push_skipped: --no-push") else: - pushed = push_sidecar(args.output, args.data_root, total) + pushed = push_sidecar(args.output, args.data_root) for remote in pushed: print(f"pushed {remote}")