feat(language): per-camera tagging on view-dependent styles

Adds a nullable `camera` field to the language row struct (both persistent and event variants) so view-dependent styles like `vqa` can carry which `observation.images.*` view they were grounded against. Without this, multi-camera datasets ended up with multiple `(vqa, role)` rows at the same timestamp that the resolver could not disambiguate.

- `language.py`: add `camera` to PERSISTENT_ROW_FIELDS / EVENT_ROW_FIELDS, to both Arrow struct types and the HF datasets feature mappings; introduce VIEW_DEPENDENT_STYLES = {vqa, motion, trace} plus `is_view_dependent_style` and `validate_camera_field` helpers (camera required iff style is view-dependent).
- `language_render.py`: thread an optional `camera=` kwarg through every resolver (`active_at`, `emitted_at`, `nth_prev`, `nth_next`) and through `_matching_rows` / `_select_*`, so recipes can disambiguate per-camera VQA with `emitted_at(t, style=vqa, role=assistant, camera=...)`. Without a `camera` filter, multi-row matches keep raising the existing ambiguity error — which is the desired behaviour on multi-camera data.
- `recipes/pi05_hirobot.yaml`: replace the single `ask_vqa` branch with `ask_vqa_top` and `ask_vqa_wrist` per-camera sub-recipes (each carrying the matching image block), keeping the original 0.20 budget and documenting the customization point for datasets with different cameras.
- Tests: schema test asserts the new field order; new tests cover `is_view_dependent_style`, `validate_camera_field` (both required and forbidden directions), per-camera `emitted_at` filtering, and the ambiguity error when two cameras emit `(vqa, assistant)` at the same timestamp without a `camera=` filter. RenderMessagesStep + dataset passthrough fixtures updated to include the new field.
- `docs/source/language_and_recipes.mdx`: document the `camera` field, the per-camera resolver pattern, and the canonical recipe convention.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-25 10:46:01 +00:00 · 2026-04-30 10:48:17 +02:00
parent 0b06790da0
commit 5a6aa64570
8 changed files with 344 additions and 33 deletions
@@ -40,8 +40,35 @@ blend:
      - {role: user, content: "${task}\nPlan: ${plan}\nMemory: ${memory}", stream: high_level}
      - {role: assistant, content: "${subtask}", stream: low_level, target: true}

-  ask_vqa:
-    weight: 0.20
+  # VQA is view-dependent: bbox / keypoint / count answers only make sense for
+  # the camera they were grounded against. Each camera gets its own sub-recipe
+  # so the resolver can disambiguate via `camera=...` and the user-turn carries
+  # the matching image block. Adjust the camera keys (and add more sub-recipes)
+  # to match the cameras present in your dataset.
+  ask_vqa_top:
+    weight: 0.10
+    bindings:
+      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
+      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
    messages:
-      - {role: user, content: "${vqa_query}", stream: high_level, if_present: vqa_query}
+      - role: user
+        stream: high_level
+        if_present: vqa_query
+        content:
+          - {type: image, feature: observation.images.top}
+          - {type: text, text: "${vqa_query}"}
+      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
+
+  ask_vqa_wrist:
+    weight: 0.10
+    bindings:
+      vqa_query: "emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
+      vqa: "emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
+    messages:
+      - role: user
+        stream: high_level
+        if_present: vqa_query
+        content:
+          - {type: image, feature: observation.images.wrist}
+          - {type: text, text: "${vqa_query}"}
      - {role: assistant, content: "${vqa}", stream: high_level, target: true, if_present: vqa}
@@ -24,8 +24,8 @@ import pyarrow as pa
 LANGUAGE_PERSISTENT = "language_persistent"
 LANGUAGE_EVENTS = "language_events"
 LANGUAGE_COLUMNS = (LANGUAGE_PERSISTENT, LANGUAGE_EVENTS)
-PERSISTENT_ROW_FIELDS = ("role", "content", "style", "timestamp", "tool_calls")
-EVENT_ROW_FIELDS = ("role", "content", "style", "tool_calls")
+PERSISTENT_ROW_FIELDS = ("role", "content", "style", "timestamp", "camera", "tool_calls")
+EVENT_ROW_FIELDS = ("role", "content", "style", "camera", "tool_calls")

 CORE_STYLES = {"subtask", "plan", "memory", "motion", "interjection", "vqa", "trace"}
 EXTENDED_STYLES = set()
@@ -34,6 +34,11 @@ STYLE_REGISTRY = CORE_STYLES | EXTENDED_STYLES
 PERSISTENT_STYLES = {"subtask", "plan", "memory", "motion"}
 EVENT_ONLY_STYLES = {"interjection", "vqa", "trace"}

+# Styles whose ``content`` is grounded in a specific camera view. Rows of these
+# styles MUST carry a non-null ``camera`` referencing an ``observation.images.*``
+# feature key. Rows of every other style MUST have ``camera=None``.
+VIEW_DEPENDENT_STYLES = {"vqa", "motion", "trace"}
+
 LanguageColumn = Literal["language_persistent", "language_events"]


@@ -59,6 +64,7 @@ def language_persistent_row_arrow_type() -> pa.StructType:
            pa.field("content", pa.string(), nullable=True),
            pa.field("style", pa.string(), nullable=True),
            pa.field("timestamp", pa.float64(), nullable=False),
+            pa.field("camera", pa.string(), nullable=True),
            pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
        ]
    )
@@ -75,6 +81,7 @@ def language_event_row_arrow_type() -> pa.StructType:
            pa.field("role", pa.string(), nullable=False),
            pa.field("content", pa.string(), nullable=True),
            pa.field("style", pa.string(), nullable=True),
+            pa.field("camera", pa.string(), nullable=True),
            pa.field("tool_calls", pa.list_(_json_arrow_type()), nullable=True),
        ]
    )
@@ -97,6 +104,7 @@ def language_persistent_row_feature() -> dict[str, object]:
        "content": datasets.Value("string"),
        "style": datasets.Value("string"),
        "timestamp": datasets.Value("float64"),
+        "camera": datasets.Value("string"),
        "tool_calls": datasets.List(_json_feature()),
    }

@@ -107,6 +115,7 @@ def language_event_row_feature() -> dict[str, object]:
        "role": datasets.Value("string"),
        "content": datasets.Value("string"),
        "style": datasets.Value("string"),
+        "camera": datasets.Value("string"),
        "tool_calls": datasets.List(_json_feature()),
    }

@@ -134,6 +143,30 @@ def is_language_column(key: str) -> bool:
    return key in LANGUAGE_COLUMNS


+def is_view_dependent_style(style: str | None) -> bool:
+    """Return ``True`` if rows of ``style`` must be tagged with a ``camera`` key."""
+    return style in VIEW_DEPENDENT_STYLES
+
+
+def validate_camera_field(style: str | None, camera: str | None) -> None:
+    """Enforce the ``camera`` invariant: required iff ``style`` is view-dependent.
+
+    Raises ``ValueError`` if a view-dependent style is missing ``camera`` or if
+    a non-view-dependent style carries one. Pipeline writers and the validator
+    should call this on every emitted row.
+    """
+    if is_view_dependent_style(style):
+        if not camera:
+            raise ValueError(
+                f"Rows of view-dependent style {style!r} require a non-empty 'camera' "
+                f"field referencing an 'observation.images.*' feature key."
+            )
+    elif camera is not None:
+        raise ValueError(
+            f"Rows of style {style!r} must have camera=None; got camera={camera!r}."
+        )
+
+
 def column_for_style(style: str | None) -> LanguageColumn:
    """Map a language style to the column where rows of that style are stored.

@@ -46,18 +46,23 @@ def active_at(
    style: str | None = None,
    role: str | None = None,
    tool_name: str | None = None,
+    camera: str | None = None,
 ) -> LanguageRow | None:
    """Return the persistent row of ``style`` that is active at time ``t``.

    A persistent row is "active" at ``t`` when its own ``timestamp`` is the
-    most recent one ``<= t`` for the given ``style``/``role``/``tool_name``
-    selector. ``events`` is accepted for resolver-signature uniformity but is
-    not consulted: only persistent styles are valid here.
+    most recent one ``<= t`` for the given ``style``/``role``/``tool_name``/
+    ``camera`` selector. ``events`` is accepted for resolver-signature
+    uniformity but is not consulted: only persistent styles are valid here.
    """
    _validate_persistent_resolver("active_at", style)
-    matches = _matching_rows(persistent, style=style, role=role, tool_name=tool_name)
+    matches = _matching_rows(
+        persistent, style=style, role=role, tool_name=tool_name, camera=camera
+    )
    matches = [row for row in matches if _timestamp(row) <= t]
-    return _select_latest(matches, style=style, role=role, tool_name=tool_name)
+    return _select_latest(
+        matches, style=style, role=role, tool_name=tool_name, camera=camera
+    )


 def emitted_at(
@@ -68,26 +73,45 @@ def emitted_at(
    style: str | None = None,
    role: str | None = None,
    tool_name: str | None = None,
+    camera: str | None = None,
 ) -> LanguageRow | None:
    """Return the row of ``style`` emitted at exactly time ``t``.

    For persistent styles, this matches persistent rows whose own ``timestamp``
    equals ``t``. For event styles, the ``events`` list is assumed to come from
    the dataset row at frame ``t`` (event rows carry no timestamp of their own),
-    so all matching event rows are considered emitted at ``t``.
+    so all matching event rows are considered emitted at ``t``. ``camera``
+    filters by the row's ``camera`` field — required to disambiguate when
+    multiple view-dependent rows share ``(t, role)`` across cameras.
    """
    column = column_for_style(style)
    if column == LANGUAGE_PERSISTENT:
        matches = [
            row
-            for row in _matching_rows(persistent, style=style, role=role, tool_name=tool_name)
+            for row in _matching_rows(
+                persistent, style=style, role=role, tool_name=tool_name, camera=camera
+            )
            if _timestamp(row) == t
        ]
        return _select_one(
-            matches, style=style, role=role, tool_name=tool_name, sort_key=_persistent_sort_key
+            matches,
+            style=style,
+            role=role,
+            tool_name=tool_name,
+            camera=camera,
+            sort_key=_persistent_sort_key,
        )
-    matches = _matching_rows(events, style=style, role=role, tool_name=tool_name)
-    return _select_one(matches, style=style, role=role, tool_name=tool_name, sort_key=_event_sort_key)
+    matches = _matching_rows(
+        events, style=style, role=role, tool_name=tool_name, camera=camera
+    )
+    return _select_one(
+        matches,
+        style=style,
+        role=role,
+        tool_name=tool_name,
+        camera=camera,
+        sort_key=_event_sort_key,
+    )


 def nth_prev(
@@ -99,12 +123,14 @@ def nth_prev(
    offset: int = 1,
    role: str | None = None,
    tool_name: str | None = None,
+    camera: str | None = None,
 ) -> LanguageRow | None:
    """Return the persistent row that was active ``offset`` steps before ``t``.

    Walks back through chronologically sorted persistent rows of ``style``
-    (filtered by optional ``role``/``tool_name``) and returns the one ``offset``
-    positions before the row active at ``t``. Only valid for persistent styles.
+    (filtered by optional ``role``/``tool_name``/``camera``) and returns the
+    one ``offset`` positions before the row active at ``t``. Only valid for
+    persistent styles.
    """
    return _nth_relative(
        t,
@@ -113,6 +139,7 @@ def nth_prev(
        offset=-offset,
        role=role,
        tool_name=tool_name,
+        camera=camera,
        resolver_name="nth_prev",
    )

@@ -126,12 +153,14 @@ def nth_next(
    offset: int = 1,
    role: str | None = None,
    tool_name: str | None = None,
+    camera: str | None = None,
 ) -> LanguageRow | None:
    """Return the persistent row that becomes active ``offset`` steps after ``t``.

    Walks forward through chronologically sorted persistent rows of ``style``
-    (filtered by optional ``role``/``tool_name``) and returns the one ``offset``
-    positions after the row active at ``t``. Only valid for persistent styles.
+    (filtered by optional ``role``/``tool_name``/``camera``) and returns the
+    one ``offset`` positions after the row active at ``t``. Only valid for
+    persistent styles.
    """
    return _nth_relative(
        t,
@@ -140,6 +169,7 @@ def nth_next(
        offset=offset,
        role=role,
        tool_name=tool_name,
+        camera=camera,
        resolver_name="nth_next",
    )

@@ -376,6 +406,7 @@ def _nth_relative(
    offset: int,
    role: str | None,
    tool_name: str | None,
+    camera: str | None,
    resolver_name: str,
 ) -> LanguageRow | None:
    """Shared body for ``nth_prev`` / ``nth_next`` with signed ``offset``."""
@@ -384,7 +415,7 @@ def _nth_relative(
        raise ValueError(f"{resolver_name} offset must be non-zero.")

    rows = sorted(
-        _matching_rows(persistent, style=style, role=role, tool_name=tool_name),
+        _matching_rows(persistent, style=style, role=role, tool_name=tool_name, camera=camera),
        key=_persistent_sort_key,
    )
    if not rows:
@@ -420,14 +451,16 @@ def _matching_rows(
    style: str | None,
    role: str | None,
    tool_name: str | None,
+    camera: str | None,
 ) -> list[LanguageRow]:
-    """Return ``rows`` filtered by optional ``style``/``role``/``tool_name`` selectors."""
+    """Return ``rows`` filtered by optional ``style``/``role``/``tool_name``/``camera`` selectors."""
    return [
        row
        for row in rows
        if (style is None or row.get("style") == style)
        and (role is None or row.get("role") == role)
        and (tool_name is None or _row_has_tool_name(row, tool_name))
+        and (camera is None or row.get("camera") == camera)
    ]


@@ -437,6 +470,7 @@ def _select_latest(
    style: str | None,
    role: str | None,
    tool_name: str | None,
+    camera: str | None,
 ) -> LanguageRow | None:
    """Return the row tied for the latest ``timestamp`` (disambiguated by selectors)."""
    if not rows:
@@ -448,6 +482,7 @@ def _select_latest(
        style=style,
        role=role,
        tool_name=tool_name,
+        camera=camera,
        sort_key=_persistent_sort_key,
    )

@@ -458,14 +493,16 @@ def _select_one(
    style: str | None,
    role: str | None,
    tool_name: str | None,
+    camera: str | None,
    sort_key: Any,
 ) -> LanguageRow | None:
    """Return the single matching row, or raise if the selectors are ambiguous."""
    if not rows:
        return None
-    if len(rows) > 1 and role is None and tool_name is None:
+    if len(rows) > 1 and role is None and tool_name is None and camera is None:
        raise ValueError(
-            f"Ambiguous resolver for style={style!r}; add role=... or tool_name=... to disambiguate."
+            f"Ambiguous resolver for style={style!r}; add role=..., tool_name=..., "
+            f"or camera=... to disambiguate."
        )
    return sorted(rows, key=sort_key)[0]