fix(annotate): default frame provider to video keys, not image keys

VideoFrameProvider derived its default camera and camera list from meta.camera_keys, which mixes image- and video-stored cameras. The clip/decode paths read videos/<key>/from_timestamp, which only exists for video keys, so an image-stored camera sorted first (e.g. observation.images.wrist) crashed the plan phase with a KeyError. Restrict the list and default to meta.video_keys. Add a regression test and point the example job at the dataset's actual video camera. Skip bandit B607 (ffmpeg/git are intentionally resolved via PATH). Co-authored-by: Cursor <cursoragent@cursor.com>
2026-08-02 14:39:57 +00:00 · 2026-06-02 12:08:15 +02:00
parent 5dbf0fac5f
commit 98a519e7f2
4 changed files with 68 additions and 23 deletions
@@ -36,8 +36,8 @@ CMD = (
    "export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 && "
    "export VLLM_VIDEO_BACKEND=pyav && "
    "lerobot-annotate "
-    "--repo_id=imstevenpmwork/super_poulain_draft "
-    "--dest_repo_id=pepijn223/super_poulain_vocab "
+    "--repo_id=pepijn223/robocasa_smoke_2atomic_v3 "
+    "--dest_repo_id=pepijn223/robocasa_smoke_2atomic_v3_ann "
    "--push_to_hub=true "
    "--vlm.backend=openai "
    "--vlm.model_id=Qwen/Qwen3.6-35B-A3B-FP8 "
@@ -52,17 +52,18 @@ CMD = (
    "--vlm.temperature=0.7 "
    "--executor.episode_parallelism=16 "
    "--vlm.chat_template_kwargs='{\"enable_thinking\": false}' "
-    "--vlm.camera_key=observation.images.wrist "
+    "--vlm.camera_key=observation.images.robot0_agentview_right "
    # Phase 1 — plan module (subtasks + plan + memory + task_aug).
    "--plan.frames_per_second=1.0 "
    "--plan.use_video_url=true "
    "--plan.use_video_url_fps=1.0 "
    "--plan.derive_task_from_video=always "
-    "--plan.n_task_rephrasings=30 "
+    "--plan.task_aug_axes.enabled=true "
+    "--plan.action_records.enabled=true "
    # Phase 2 — interjections + speech.
    "--interjections.max_interjections_per_episode=6 "
    # Phase 4 — general VQA.
-    "--vqa.K=3 "
+    "--vqa.K=1 "
    "--vqa.vqa_emission_hz=1.0"
 )

@@ -401,7 +401,7 @@ exclude_dirs = [
    "benchmarks",
    "src/lerobot/datasets/push_dataset_to_hub",
 ]
-skips = ["B101", "B311", "B404", "B603", "B615"]
+skips = ["B101", "B311", "B404", "B603", "B607", "B615"]

 [tool.typos]
 default.extend-ignore-re = [
@@ -151,13 +151,15 @@ class VideoFrameProvider:
        from lerobot.datasets.dataset_metadata import LeRobotDatasetMetadata  # noqa: PLC0415

        self._meta = LeRobotDatasetMetadata(repo_id="local", root=self.root)
-        # ``camera_keys`` covers both image- and video-stored cameras and is
-        # always defined on the metadata (``[]`` in the worst case), so it is
-        # the single source we need here.
-        keys = list(self._meta.camera_keys)
-        # Last-resort fallback: if metadata didn't surface anything but the
-        # caller explicitly named a camera (``--vlm.camera_key=...``), trust
-        # them — the key is by definition known to exist on the dataset.
+        # Only ``video_keys`` are decodable here: the clip/decode paths read
+        # ``videos/<key>/from_timestamp`` from episode metadata, which exists
+        # only for video-stored cameras. Image-stored cameras (also in
+        # ``camera_keys``) would KeyError, so restrict the list — and the
+        # default — to video keys.
+        keys = list(self._meta.video_keys)
+        # Last-resort fallback: if metadata didn't surface any video keys but
+        # the caller explicitly named a camera (``--vlm.camera_key=...``),
+        # trust them — the key is by definition known to exist on the dataset.
        if not keys and self.camera_key:
            keys = [self.camera_key]
        self._camera_keys = keys
@@ -338,8 +340,7 @@ class VideoFrameProvider:
                self._warned_decode_fail = True
        if not already_warned:
            logger.warning(
-                "VideoFrameProvider._decode failed for episode=%s camera=%s "
-                "video_path=%s backends=%s: %s",
+                "VideoFrameProvider._decode failed for episode=%s camera=%s video_path=%s backends=%s: %s",
                episode_index,
                camera_key,
                video_path,
@@ -383,11 +384,21 @@ def _decode_frames_ffmpeg(video_path: Path, timestamps: list[float]) -> list[Any
    for ts in timestamps:
        proc = subprocess.run(
            [
-                "ffmpeg", "-nostdin", "-loglevel", "error",
-                "-ss", f"{max(ts, 0.0):.3f}",
-                "-i", str(video_path),
-                "-frames:v", "1",
-                "-f", "image2pipe", "-vcodec", "png", "pipe:1",
+                "ffmpeg",
+                "-nostdin",
+                "-loglevel",
+                "error",
+                "-ss",
+                f"{max(ts, 0.0):.3f}",
+                "-i",
+                str(video_path),
+                "-frames:v",
+                "1",
+                "-f",
+                "image2pipe",
+                "-vcodec",
+                "png",
+                "pipe:1",
            ],
            capture_output=True,
            check=True,
@@ -45,6 +45,33 @@ from lerobot.annotations.steerable_pipeline.frames import (  # noqa: E402
 )


+class _FakeMeta:
+    """Minimal metadata stub exposing ``video_keys`` / ``camera_keys``."""
+
+    def __init__(self, video_keys: list[str], image_keys: list[str]) -> None:
+        self.video_keys = video_keys
+        self.camera_keys = [*video_keys, *image_keys]
+
+
+def test_default_camera_key_skips_image_only_cameras(tmp_path: Path, monkeypatch) -> None:
+    """The default camera must be a *video* key — image-stored cameras have no
+    ``videos/<key>/from_timestamp`` and would KeyError in the clip/decode path.
+
+    Regression: a dataset whose first ``camera_keys`` entry was an image-stored
+    camera (e.g. ``observation.images.wrist``) crashed at clip extraction.
+    """
+    fake = _FakeMeta(
+        video_keys=["observation.images.robot0_agentview_right"],
+        image_keys=["observation.images.wrist"],
+    )
+    import lerobot.datasets.dataset_metadata as meta_mod
+
+    monkeypatch.setattr(meta_mod, "LeRobotDatasetMetadata", lambda *a, **k: fake, raising=True)
+    provider = VideoFrameProvider(root=tmp_path)
+    assert provider.camera_key == "observation.images.robot0_agentview_right"
+    assert "observation.images.wrist" not in provider.camera_keys
+
+
 def test_video_for_episode_is_a_method_of_videoframeprovider():
    """``video_for_episode`` must be a bound method, not nested dead code."""
    assert callable(getattr(VideoFrameProvider, "video_for_episode", None))
@@ -81,9 +108,15 @@ def sample_video(tmp_path: Path) -> Path:
    out = tmp_path / "sample.mp4"
    subprocess.run(
        [
-            "ffmpeg", "-y", "-f", "lavfi",
-            "-i", "testsrc=duration=3:size=160x120:rate=10",
-            "-pix_fmt", "yuv420p", str(out),
+            "ffmpeg",
+            "-y",
+            "-f",
+            "lavfi",
+            "-i",
+            "testsrc=duration=3:size=160x120:rate=10",
+            "-pix_fmt",
+            "yuv420p",
+            str(out),
        ],
        check=True,
        capture_output=True,