mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-18 10:10:08 +00:00
feat(language): per-camera tagging on view-dependent styles
Adds a nullable `camera` field to the language row struct (both persistent
and event variants) so view-dependent styles like `vqa` can carry which
`observation.images.*` view they were grounded against. Without this,
multi-camera datasets ended up with multiple `(vqa, role)` rows at the
same timestamp that the resolver could not disambiguate.
- `language.py`: add `camera` to PERSISTENT_ROW_FIELDS / EVENT_ROW_FIELDS,
to both Arrow struct types and the HF datasets feature mappings;
introduce VIEW_DEPENDENT_STYLES = {vqa, motion, trace} plus
`is_view_dependent_style` and `validate_camera_field` helpers (camera
required iff style is view-dependent).
- `language_render.py`: thread an optional `camera=` kwarg through every
resolver (`active_at`, `emitted_at`, `nth_prev`, `nth_next`) and through
`_matching_rows` / `_select_*`, so recipes can disambiguate per-camera
VQA with `emitted_at(t, style=vqa, role=assistant, camera=...)`.
Without a `camera` filter, multi-row matches keep raising the existing
ambiguity error — which is the desired behaviour on multi-camera data.
- `recipes/pi05_hirobot.yaml`: replace the single `ask_vqa` branch with
`ask_vqa_top` and `ask_vqa_wrist` per-camera sub-recipes (each carrying
the matching image block), keeping the original 0.20 budget and
documenting the customization point for datasets with different cameras.
- Tests: schema test asserts the new field order; new tests cover
`is_view_dependent_style`, `validate_camera_field` (both required and
forbidden directions), per-camera `emitted_at` filtering, and the
ambiguity error when two cameras emit `(vqa, assistant)` at the same
timestamp without a `camera=` filter. RenderMessagesStep + dataset
passthrough fixtures updated to include the new field.
- `docs/source/language_and_recipes.mdx`: document the `camera` field,
the per-camera resolver pattern, and the canonical recipe convention.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,7 @@ def test_canonical_recipe_loads():
|
||||
"user_interjection_response",
|
||||
"high_level_subtask",
|
||||
"low_level_execution",
|
||||
"ask_vqa",
|
||||
"ask_vqa_top",
|
||||
"ask_vqa_wrist",
|
||||
}
|
||||
assert sum(component.weight for component in recipe.blend.values()) == pytest.approx(0.96)
|
||||
|
||||
@@ -13,10 +13,13 @@ from lerobot.datasets.language import (
|
||||
LANGUAGE_PERSISTENT,
|
||||
PERSISTENT_STYLES,
|
||||
STYLE_REGISTRY,
|
||||
VIEW_DEPENDENT_STYLES,
|
||||
column_for_style,
|
||||
is_view_dependent_style,
|
||||
language_events_arrow_type,
|
||||
language_feature_info,
|
||||
language_persistent_arrow_type,
|
||||
validate_camera_field,
|
||||
)
|
||||
from lerobot.datasets.utils import DEFAULT_DATA_PATH
|
||||
|
||||
@@ -26,10 +29,17 @@ def test_language_arrow_schema_has_expected_fields():
|
||||
event_row_type = language_events_arrow_type().value_type
|
||||
|
||||
assert isinstance(persistent_row_type, pa.StructType)
|
||||
assert persistent_row_type.names == ["role", "content", "style", "timestamp", "tool_calls"]
|
||||
assert persistent_row_type.names == [
|
||||
"role",
|
||||
"content",
|
||||
"style",
|
||||
"timestamp",
|
||||
"camera",
|
||||
"tool_calls",
|
||||
]
|
||||
|
||||
assert isinstance(event_row_type, pa.StructType)
|
||||
assert event_row_type.names == ["role", "content", "style", "tool_calls"]
|
||||
assert event_row_type.names == ["role", "content", "style", "camera", "tool_calls"]
|
||||
|
||||
|
||||
def test_style_registry_routes_columns():
|
||||
@@ -47,6 +57,41 @@ def test_style_registry_routes_columns():
|
||||
assert column_for_style(None) == LANGUAGE_EVENTS
|
||||
|
||||
|
||||
def test_view_dependent_styles():
|
||||
assert {"vqa", "motion", "trace"} == VIEW_DEPENDENT_STYLES
|
||||
assert is_view_dependent_style("vqa")
|
||||
assert is_view_dependent_style("motion")
|
||||
assert is_view_dependent_style("trace")
|
||||
assert not is_view_dependent_style("subtask")
|
||||
assert not is_view_dependent_style("plan")
|
||||
assert not is_view_dependent_style("interjection")
|
||||
assert not is_view_dependent_style(None)
|
||||
|
||||
|
||||
def test_validate_camera_field_requires_camera_for_view_dependent_styles():
|
||||
validate_camera_field("vqa", "observation.images.top")
|
||||
validate_camera_field("motion", "observation.images.wrist")
|
||||
validate_camera_field("trace", "observation.images.front")
|
||||
with pytest.raises(ValueError, match="view-dependent"):
|
||||
validate_camera_field("vqa", None)
|
||||
with pytest.raises(ValueError, match="view-dependent"):
|
||||
validate_camera_field("motion", "")
|
||||
|
||||
|
||||
def test_validate_camera_field_rejects_camera_on_non_view_dependent_styles():
|
||||
validate_camera_field("subtask", None)
|
||||
validate_camera_field("plan", None)
|
||||
validate_camera_field("memory", None)
|
||||
validate_camera_field("interjection", None)
|
||||
validate_camera_field(None, None)
|
||||
with pytest.raises(ValueError, match="must have camera=None"):
|
||||
validate_camera_field("subtask", "observation.images.top")
|
||||
with pytest.raises(ValueError, match="must have camera=None"):
|
||||
validate_camera_field("interjection", "observation.images.top")
|
||||
with pytest.raises(ValueError, match="must have camera=None"):
|
||||
validate_camera_field(None, "observation.images.top")
|
||||
|
||||
|
||||
def test_unknown_style_rejected():
|
||||
with pytest.raises(ValueError, match="Unknown language style"):
|
||||
column_for_style("surprise")
|
||||
@@ -70,6 +115,7 @@ def test_lerobot_dataset_passes_language_columns_through(tmp_path, empty_lerobot
|
||||
"content": "reach for the cup",
|
||||
"style": "subtask",
|
||||
"timestamp": 0.0,
|
||||
"camera": None,
|
||||
"tool_calls": None,
|
||||
}
|
||||
]
|
||||
@@ -77,6 +123,7 @@ def test_lerobot_dataset_passes_language_columns_through(tmp_path, empty_lerobot
|
||||
"role": "user",
|
||||
"content": "what is visible?",
|
||||
"style": "vqa",
|
||||
"camera": "observation.images.top",
|
||||
"tool_calls": None,
|
||||
}
|
||||
data_path = root / DEFAULT_DATA_PATH.format(chunk_index=0, file_index=0)
|
||||
|
||||
@@ -8,21 +8,23 @@ from lerobot.configs.recipe import MessageTurn, TrainingRecipe
|
||||
from lerobot.datasets.language_render import active_at, emitted_at, nth_next, nth_prev, render_sample
|
||||
|
||||
|
||||
def persistent_row(role, content, style, timestamp, tool_calls=None):
|
||||
def persistent_row(role, content, style, timestamp, tool_calls=None, camera=None):
|
||||
return {
|
||||
"role": role,
|
||||
"content": content,
|
||||
"style": style,
|
||||
"timestamp": timestamp,
|
||||
"camera": camera,
|
||||
"tool_calls": tool_calls,
|
||||
}
|
||||
|
||||
|
||||
def event_row(role, content, style, tool_calls=None):
|
||||
def event_row(role, content, style, tool_calls=None, camera=None):
|
||||
return {
|
||||
"role": role,
|
||||
"content": content,
|
||||
"style": style,
|
||||
"camera": camera,
|
||||
"tool_calls": tool_calls,
|
||||
}
|
||||
|
||||
@@ -35,8 +37,8 @@ PERSISTENT = [
|
||||
persistent_row("assistant", "subtask 1", "subtask", 1.0),
|
||||
]
|
||||
EVENTS_AT_1 = [
|
||||
event_row("user", "what is visible?", "vqa"),
|
||||
event_row("assistant", '{"count": 2}', "vqa"),
|
||||
event_row("user", "what is visible?", "vqa", camera="observation.images.top"),
|
||||
event_row("assistant", '{"count": 2}', "vqa", camera="observation.images.top"),
|
||||
]
|
||||
EVENTS_AT_2 = [
|
||||
event_row("user", "skip wiping", "interjection"),
|
||||
@@ -47,6 +49,15 @@ EVENTS_AT_2 = [
|
||||
[{"type": "function", "function": {"name": "say", "arguments": {"text": "Skipping wiping."}}}],
|
||||
),
|
||||
]
|
||||
# Same emission tick, two cameras: triggers per-camera disambiguation in
|
||||
# resolvers, mirroring how Module 3 of the annotation pipeline writes one
|
||||
# (vqa, user) + (vqa, assistant) pair per camera.
|
||||
EVENTS_AT_3_TWO_CAMERAS = [
|
||||
event_row("user", "how many cups (top)?", "vqa", camera="observation.images.top"),
|
||||
event_row("assistant", '{"count": 3}', "vqa", camera="observation.images.top"),
|
||||
event_row("user", "how many cups (wrist)?", "vqa", camera="observation.images.wrist"),
|
||||
event_row("assistant", '{"count": 1}', "vqa", camera="observation.images.wrist"),
|
||||
]
|
||||
|
||||
|
||||
def test_resolver_temporal_semantics():
|
||||
@@ -158,6 +169,126 @@ def test_deterministic_blend_sampling():
|
||||
assert first == second
|
||||
|
||||
|
||||
def test_emitted_at_filters_vqa_by_camera():
|
||||
top = emitted_at(
|
||||
3.0,
|
||||
persistent=PERSISTENT,
|
||||
events=EVENTS_AT_3_TWO_CAMERAS,
|
||||
style="vqa",
|
||||
role="assistant",
|
||||
camera="observation.images.top",
|
||||
)
|
||||
wrist = emitted_at(
|
||||
3.0,
|
||||
persistent=PERSISTENT,
|
||||
events=EVENTS_AT_3_TWO_CAMERAS,
|
||||
style="vqa",
|
||||
role="assistant",
|
||||
camera="observation.images.wrist",
|
||||
)
|
||||
assert top["content"] == '{"count": 3}'
|
||||
assert wrist["content"] == '{"count": 1}'
|
||||
|
||||
|
||||
def test_emitted_at_raises_on_ambiguous_per_camera_vqa():
|
||||
with pytest.raises(ValueError, match="Ambiguous resolver"):
|
||||
emitted_at(
|
||||
3.0,
|
||||
persistent=PERSISTENT,
|
||||
events=EVENTS_AT_3_TWO_CAMERAS,
|
||||
style="vqa",
|
||||
role="assistant",
|
||||
)
|
||||
|
||||
|
||||
def test_per_camera_blend_renders_both_views():
|
||||
recipe = TrainingRecipe(
|
||||
blend={
|
||||
"top": TrainingRecipe(
|
||||
weight=1.0,
|
||||
bindings={
|
||||
"vqa_query": (
|
||||
"emitted_at(t, style=vqa, role=user, camera=observation.images.top)"
|
||||
),
|
||||
"vqa": (
|
||||
"emitted_at(t, style=vqa, role=assistant, camera=observation.images.top)"
|
||||
),
|
||||
},
|
||||
messages=[
|
||||
MessageTurn(
|
||||
role="user",
|
||||
content=[
|
||||
{"type": "image", "feature": "observation.images.top"},
|
||||
{"type": "text", "text": "${vqa_query}"},
|
||||
],
|
||||
stream="high_level",
|
||||
if_present="vqa_query",
|
||||
),
|
||||
MessageTurn(
|
||||
role="assistant",
|
||||
content="${vqa}",
|
||||
stream="high_level",
|
||||
target=True,
|
||||
if_present="vqa",
|
||||
),
|
||||
],
|
||||
),
|
||||
"wrist": TrainingRecipe(
|
||||
weight=1.0,
|
||||
bindings={
|
||||
"vqa_query": (
|
||||
"emitted_at(t, style=vqa, role=user, camera=observation.images.wrist)"
|
||||
),
|
||||
"vqa": (
|
||||
"emitted_at(t, style=vqa, role=assistant, camera=observation.images.wrist)"
|
||||
),
|
||||
},
|
||||
messages=[
|
||||
MessageTurn(
|
||||
role="user",
|
||||
content=[
|
||||
{"type": "image", "feature": "observation.images.wrist"},
|
||||
{"type": "text", "text": "${vqa_query}"},
|
||||
],
|
||||
stream="high_level",
|
||||
if_present="vqa_query",
|
||||
),
|
||||
MessageTurn(
|
||||
role="assistant",
|
||||
content="${vqa}",
|
||||
stream="high_level",
|
||||
target=True,
|
||||
if_present="vqa",
|
||||
),
|
||||
],
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
rendered_top = render_sample(
|
||||
recipe=recipe.blend["top"],
|
||||
persistent=PERSISTENT,
|
||||
events=EVENTS_AT_3_TWO_CAMERAS,
|
||||
t=3.0,
|
||||
sample_idx=0,
|
||||
)
|
||||
rendered_wrist = render_sample(
|
||||
recipe=recipe.blend["wrist"],
|
||||
persistent=PERSISTENT,
|
||||
events=EVENTS_AT_3_TWO_CAMERAS,
|
||||
t=3.0,
|
||||
sample_idx=0,
|
||||
)
|
||||
|
||||
assert rendered_top["messages"][0]["content"][0]["feature"] == "observation.images.top"
|
||||
assert rendered_top["messages"][0]["content"][1]["text"] == "how many cups (top)?"
|
||||
assert rendered_top["messages"][1]["content"] == '{"count": 3}'
|
||||
|
||||
assert rendered_wrist["messages"][0]["content"][0]["feature"] == "observation.images.wrist"
|
||||
assert rendered_wrist["messages"][0]["content"][1]["text"] == "how many cups (wrist)?"
|
||||
assert rendered_wrist["messages"][1]["content"] == '{"count": 1}'
|
||||
|
||||
|
||||
def test_canonical_recipe_can_render_low_level_branch():
|
||||
recipe = TrainingRecipe.from_yaml(Path("src/lerobot/configs/recipes/pi05_hirobot.yaml"))
|
||||
low_level = TrainingRecipe(blend={"low": recipe.blend["low_level_execution"]})
|
||||
|
||||
@@ -38,6 +38,7 @@ def test_render_messages_step_renders_and_drops_raw_language():
|
||||
"content": "reach carefully",
|
||||
"style": "subtask",
|
||||
"timestamp": 0.0,
|
||||
"camera": None,
|
||||
"tool_calls": None,
|
||||
}
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user