feat(language): per-camera tagging on view-dependent styles

Adds a nullable `camera` field to the language row struct (both persistent and event variants) so view-dependent styles like `vqa` can carry which `observation.images.*` view they were grounded against. Without this, multi-camera datasets ended up with multiple `(vqa, role)` rows at the same timestamp that the resolver could not disambiguate.

- `language.py`: add `camera` to PERSISTENT_ROW_FIELDS / EVENT_ROW_FIELDS, to both Arrow struct types and the HF datasets feature mappings; introduce VIEW_DEPENDENT_STYLES = {vqa, motion, trace} plus `is_view_dependent_style` and `validate_camera_field` helpers (camera required iff style is view-dependent).
- `language_render.py`: thread an optional `camera=` kwarg through every resolver (`active_at`, `emitted_at`, `nth_prev`, `nth_next`) and through `_matching_rows` / `_select_*`, so recipes can disambiguate per-camera VQA with `emitted_at(t, style=vqa, role=assistant, camera=...)`. Without a `camera` filter, multi-row matches keep raising the existing ambiguity error — which is the desired behaviour on multi-camera data.
- `recipes/pi05_hirobot.yaml`: replace the single `ask_vqa` branch with `ask_vqa_top` and `ask_vqa_wrist` per-camera sub-recipes (each carrying the matching image block), keeping the original 0.20 budget and documenting the customization point for datasets with different cameras.
- Tests: schema test asserts the new field order; new tests cover `is_view_dependent_style`, `validate_camera_field` (both required and forbidden directions), per-camera `emitted_at` filtering, and the ambiguity error when two cameras emit `(vqa, assistant)` at the same timestamp without a `camera=` filter. RenderMessagesStep + dataset passthrough fixtures updated to include the new field.
- `docs/source/language_and_recipes.mdx`: document the `camera` field, the per-camera resolver pattern, and the canonical recipe convention.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-23 01:41:54 +00:00 · 2026-04-30 10:48:17 +02:00
parent 0b06790da0
commit 5a6aa64570
8 changed files with 344 additions and 33 deletions
@@ -13,10 +13,13 @@ from lerobot.datasets.language import (
    LANGUAGE_PERSISTENT,
    PERSISTENT_STYLES,
    STYLE_REGISTRY,
+    VIEW_DEPENDENT_STYLES,
    column_for_style,
+    is_view_dependent_style,
    language_events_arrow_type,
    language_feature_info,
    language_persistent_arrow_type,
+    validate_camera_field,
 )
 from lerobot.datasets.utils import DEFAULT_DATA_PATH

@@ -26,10 +29,17 @@ def test_language_arrow_schema_has_expected_fields():
    event_row_type = language_events_arrow_type().value_type

    assert isinstance(persistent_row_type, pa.StructType)
-    assert persistent_row_type.names == ["role", "content", "style", "timestamp", "tool_calls"]
+    assert persistent_row_type.names == [
+        "role",
+        "content",
+        "style",
+        "timestamp",
+        "camera",
+        "tool_calls",
+    ]

    assert isinstance(event_row_type, pa.StructType)
-    assert event_row_type.names == ["role", "content", "style", "tool_calls"]
+    assert event_row_type.names == ["role", "content", "style", "camera", "tool_calls"]


 def test_style_registry_routes_columns():
@@ -47,6 +57,41 @@ def test_style_registry_routes_columns():
    assert column_for_style(None) == LANGUAGE_EVENTS


+def test_view_dependent_styles():
+    assert {"vqa", "motion", "trace"} == VIEW_DEPENDENT_STYLES
+    assert is_view_dependent_style("vqa")
+    assert is_view_dependent_style("motion")
+    assert is_view_dependent_style("trace")
+    assert not is_view_dependent_style("subtask")
+    assert not is_view_dependent_style("plan")
+    assert not is_view_dependent_style("interjection")
+    assert not is_view_dependent_style(None)
+
+
+def test_validate_camera_field_requires_camera_for_view_dependent_styles():
+    validate_camera_field("vqa", "observation.images.top")
+    validate_camera_field("motion", "observation.images.wrist")
+    validate_camera_field("trace", "observation.images.front")
+    with pytest.raises(ValueError, match="view-dependent"):
+        validate_camera_field("vqa", None)
+    with pytest.raises(ValueError, match="view-dependent"):
+        validate_camera_field("motion", "")
+
+
+def test_validate_camera_field_rejects_camera_on_non_view_dependent_styles():
+    validate_camera_field("subtask", None)
+    validate_camera_field("plan", None)
+    validate_camera_field("memory", None)
+    validate_camera_field("interjection", None)
+    validate_camera_field(None, None)
+    with pytest.raises(ValueError, match="must have camera=None"):
+        validate_camera_field("subtask", "observation.images.top")
+    with pytest.raises(ValueError, match="must have camera=None"):
+        validate_camera_field("interjection", "observation.images.top")
+    with pytest.raises(ValueError, match="must have camera=None"):
+        validate_camera_field(None, "observation.images.top")
+
+
 def test_unknown_style_rejected():
    with pytest.raises(ValueError, match="Unknown language style"):
        column_for_style("surprise")
@@ -70,6 +115,7 @@ def test_lerobot_dataset_passes_language_columns_through(tmp_path, empty_lerobot
            "content": "reach for the cup",
            "style": "subtask",
            "timestamp": 0.0,
+            "camera": None,
            "tool_calls": None,
        }
    ]
@@ -77,6 +123,7 @@ def test_lerobot_dataset_passes_language_columns_through(tmp_path, empty_lerobot
        "role": "user",
        "content": "what is visible?",
        "style": "vqa",
+        "camera": "observation.images.top",
        "tool_calls": None,
    }
    data_path = root / DEFAULT_DATA_PATH.format(chunk_index=0, file_index=0)