mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-18 08:47:05 +00:00
fix(annotate): default frame provider to video keys, not image keys
VideoFrameProvider derived its default camera and camera list from meta.camera_keys, which mixes image- and video-stored cameras. The clip/decode paths read videos/<key>/from_timestamp, which only exists for video keys, so an image-stored camera sorted first (e.g. observation.images.wrist) crashed the plan phase with a KeyError. Restrict the list and default to meta.video_keys. Add a regression test and point the example job at the dataset's actual video camera. Skip bandit B607 (ffmpeg/git are intentionally resolved via PATH). Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -36,8 +36,8 @@ CMD = (
|
||||
"export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
|
||||
"export VLLM_VIDEO_BACKEND=pyav && "
|
||||
"lerobot-annotate "
|
||||
"--repo_id=imstevenpmwork/super_poulain_draft "
|
||||
"--dest_repo_id=pepijn223/super_poulain_vocab "
|
||||
"--repo_id=pepijn223/robocasa_smoke_2atomic_v3 "
|
||||
"--dest_repo_id=pepijn223/robocasa_smoke_2atomic_v3_ann "
|
||||
"--push_to_hub=true "
|
||||
"--vlm.backend=openai "
|
||||
"--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
|
||||
@@ -52,17 +52,18 @@ CMD = (
|
||||
"--vlm.temperature=0.7 "
|
||||
"--executor.episode_parallelism=16 "
|
||||
"--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
|
||||
"--vlm.camera_key=observation.images.wrist "
|
||||
"--vlm.camera_key=observation.images.robot0_agentview_right "
|
||||
# Phase 1 — plan module (subtasks + plan + memory + task_aug).
|
||||
"--plan.frames_per_second=1.0 "
|
||||
"--plan.use_video_url=true "
|
||||
"--plan.use_video_url_fps=1.0 "
|
||||
"--plan.derive_task_from_video=always "
|
||||
"--plan.n_task_rephrasings=30 "
|
||||
"--plan.task_aug_axes.enabled=true "
|
||||
"--plan.action_records.enabled=true "
|
||||
# Phase 2 — interjections + speech.
|
||||
"--interjections.max_interjections_per_episode=6 "
|
||||
# Phase 4 — general VQA.
|
||||
"--vqa.K=3 "
|
||||
"--vqa.K=1 "
|
||||
"--vqa.vqa_emission_hz=1.0"
|
||||
)
|
||||
|
||||
|
||||
+1
-1
@@ -401,7 +401,7 @@ exclude_dirs = [
|
||||
"benchmarks",
|
||||
"src/lerobot/datasets/push_dataset_to_hub",
|
||||
]
|
||||
skips = ["B101", "B311", "B404", "B603", "B615"]
|
||||
skips = ["B101", "B311", "B404", "B603", "B607", "B615"]
|
||||
|
||||
[tool.typos]
|
||||
default.extend-ignore-re = [
|
||||
|
||||
@@ -151,13 +151,15 @@ class VideoFrameProvider:
|
||||
from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata # noqa: PLC0415
|
||||
|
||||
self._meta = LeRobotDatasetMetadata(repo_id="local", root=self.root)
|
||||
# ``camera_keys`` covers both image- and video-stored cameras and is
|
||||
# always defined on the metadata (``[]`` in the worst case), so it is
|
||||
# the single source we need here.
|
||||
keys = list(self._meta.camera_keys)
|
||||
# Last-resort fallback: if metadata didn't surface anything but the
|
||||
# caller explicitly named a camera (``--vlm.camera_key=...``), trust
|
||||
# them — the key is by definition known to exist on the dataset.
|
||||
# Only ``video_keys`` are decodable here: the clip/decode paths read
|
||||
# ``videos/<key>/from_timestamp`` from episode metadata, which exists
|
||||
# only for video-stored cameras. Image-stored cameras (also in
|
||||
# ``camera_keys``) would KeyError, so restrict the list — and the
|
||||
# default — to video keys.
|
||||
keys = list(self._meta.video_keys)
|
||||
# Last-resort fallback: if metadata didn't surface any video keys but
|
||||
# the caller explicitly named a camera (``--vlm.camera_key=...``),
|
||||
# trust them — the key is by definition known to exist on the dataset.
|
||||
if not keys and self.camera_key:
|
||||
keys = [self.camera_key]
|
||||
self._camera_keys = keys
|
||||
@@ -338,8 +340,7 @@ class VideoFrameProvider:
|
||||
self._warned_decode_fail = True
|
||||
if not already_warned:
|
||||
logger.warning(
|
||||
"VideoFrameProvider._decode failed for episode=%s camera=%s "
|
||||
"video_path=%s backends=%s: %s",
|
||||
"VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s backends=%s: %s",
|
||||
episode_index,
|
||||
camera_key,
|
||||
video_path,
|
||||
@@ -383,11 +384,21 @@ def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any
|
||||
for ts in timestamps:
|
||||
proc = subprocess.run(
|
||||
[
|
||||
"ffmpeg", "-nostdin", "-loglevel", "error",
|
||||
"-ss", f"{max(ts, 0.0):.3f}",
|
||||
"-i", str(video_path),
|
||||
"-frames:v", "1",
|
||||
"-f", "image2pipe", "-vcodec", "png", "pipe:1",
|
||||
"ffmpeg",
|
||||
"-nostdin",
|
||||
"-loglevel",
|
||||
"error",
|
||||
"-ss",
|
||||
f"{max(ts, 0.0):.3f}",
|
||||
"-i",
|
||||
str(video_path),
|
||||
"-frames:v",
|
||||
"1",
|
||||
"-f",
|
||||
"image2pipe",
|
||||
"-vcodec",
|
||||
"png",
|
||||
"pipe:1",
|
||||
],
|
||||
capture_output=True,
|
||||
check=True,
|
||||
|
||||
@@ -45,6 +45,33 @@ from lerobot.annotations.steerable_pipeline.frames import ( # noqa: E402
|
||||
)
|
||||
|
||||
|
||||
class _FakeMeta:
|
||||
"""Minimal metadata stub exposing ``video_keys`` / ``camera_keys``."""
|
||||
|
||||
def __init__(self, video_keys: list[str], image_keys: list[str]) -> None:
|
||||
self.video_keys = video_keys
|
||||
self.camera_keys = [*video_keys, *image_keys]
|
||||
|
||||
|
||||
def test_default_camera_key_skips_image_only_cameras(tmp_path: Path, monkeypatch) -> None:
|
||||
"""The default camera must be a *video* key — image-stored cameras have no
|
||||
``videos/<key>/from_timestamp`` and would KeyError in the clip/decode path.
|
||||
|
||||
Regression: a dataset whose first ``camera_keys`` entry was an image-stored
|
||||
camera (e.g. ``observation.images.wrist``) crashed at clip extraction.
|
||||
"""
|
||||
fake = _FakeMeta(
|
||||
video_keys=["observation.images.robot0_agentview_right"],
|
||||
image_keys=["observation.images.wrist"],
|
||||
)
|
||||
import lerobot.datasets.dataset_metadata as meta_mod
|
||||
|
||||
monkeypatch.setattr(meta_mod, "LeRobotDatasetMetadata", lambda *a, **k: fake, raising=True)
|
||||
provider = VideoFrameProvider(root=tmp_path)
|
||||
assert provider.camera_key == "observation.images.robot0_agentview_right"
|
||||
assert "observation.images.wrist" not in provider.camera_keys
|
||||
|
||||
|
||||
def test_video_for_episode_is_a_method_of_videoframeprovider():
|
||||
"""``video_for_episode`` must be a bound method, not nested dead code."""
|
||||
assert callable(getattr(VideoFrameProvider, "video_for_episode", None))
|
||||
@@ -81,9 +108,15 @@ def sample_video(tmp_path: Path) -> Path:
|
||||
out = tmp_path / "sample.mp4"
|
||||
subprocess.run(
|
||||
[
|
||||
"ffmpeg", "-y", "-f", "lavfi",
|
||||
"-i", "testsrc=duration=3:size=160x120:rate=10",
|
||||
"-pix_fmt", "yuv420p", str(out),
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-f",
|
||||
"lavfi",
|
||||
"-i",
|
||||
"testsrc=duration=3:size=160x120:rate=10",
|
||||
"-pix_fmt",
|
||||
"yuv420p",
|
||||
str(out),
|
||||
],
|
||||
check=True,
|
||||
capture_output=True,
|
||||
|
||||
Reference in New Issue
Block a user