mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-15 00:29:52 +00:00
5a6aa64570
Adds a nullable `camera` field to the language row struct (both persistent
and event variants) so view-dependent styles like `vqa` can carry which
`observation.images.*` view they were grounded against. Without this,
multi-camera datasets ended up with multiple `(vqa, role)` rows at the
same timestamp that the resolver could not disambiguate.
- `language.py`: add `camera` to PERSISTENT_ROW_FIELDS / EVENT_ROW_FIELDS,
to both Arrow struct types and the HF datasets feature mappings;
introduce VIEW_DEPENDENT_STYLES = {vqa, motion, trace} plus
`is_view_dependent_style` and `validate_camera_field` helpers (camera
required iff style is view-dependent).
- `language_render.py`: thread an optional `camera=` kwarg through every
resolver (`active_at`, `emitted_at`, `nth_prev`, `nth_next`) and through
`_matching_rows` / `_select_*`, so recipes can disambiguate per-camera
VQA with `emitted_at(t, style=vqa, role=assistant, camera=...)`.
Without a `camera` filter, multi-row matches keep raising the existing
ambiguity error — which is the desired behaviour on multi-camera data.
- `recipes/pi05_hirobot.yaml`: replace the single `ask_vqa` branch with
`ask_vqa_top` and `ask_vqa_wrist` per-camera sub-recipes (each carrying
the matching image block), keeping the original 0.20 budget and
documenting the customization point for datasets with different cameras.
- Tests: schema test asserts the new field order; new tests cover
`is_view_dependent_style`, `validate_camera_field` (both required and
forbidden directions), per-camera `emitted_at` filtering, and the
ambiguity error when two cameras emit `(vqa, assistant)` at the same
timestamp without a `camera=` filter. RenderMessagesStep + dataset
passthrough fixtures updated to include the new field.
- `docs/source/language_and_recipes.mdx`: document the `camera` field,
the per-camera resolver pattern, and the canonical recipe convention.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
57 lines
1.9 KiB
Python
57 lines
1.9 KiB
Python
#!/usr/bin/env python
|
|
|
|
import torch
|
|
|
|
from lerobot.configs.recipe import MessageTurn, TrainingRecipe
|
|
from lerobot.processor.converters import create_transition
|
|
from lerobot.processor.render_messages_processor import RenderMessagesStep
|
|
from lerobot.types import TransitionKey
|
|
|
|
|
|
def test_render_messages_step_noops_without_language_columns():
    """When complementary_data carries no language columns, the step is a pure no-op."""
    turns = [
        MessageTurn(role="user", content="${task}", stream="high_level"),
        MessageTurn(role="assistant", content="${subtask}", stream="low_level", target=True),
    ]
    step = RenderMessagesStep(TrainingRecipe(messages=turns))

    # No language_persistent / language_events keys → transition must pass through unchanged.
    transition = create_transition(complementary_data={"task": "do it"})
    assert step(transition) == transition
|
def test_render_messages_step_renders_and_drops_raw_language():
    """Rendering consumes the raw language columns and emits message metadata."""
    turns = [
        MessageTurn(role="user", content="${task}", stream="high_level"),
        MessageTurn(role="assistant", content="${subtask}", stream="low_level", target=True),
    ]
    recipe = TrainingRecipe(messages=turns)

    # One persistent row active at t=0; camera/tool_calls are nullable fields.
    persistent_row = {
        "role": "assistant",
        "content": "reach carefully",
        "style": "subtask",
        "timestamp": 0.0,
        "camera": None,
        "tool_calls": None,
    }
    transition = create_transition(
        complementary_data={
            "task": "do it",
            "timestamp": torch.tensor(0.0),
            "index": torch.tensor(7),
            "language_persistent": [persistent_row],
            "language_events": [],
        }
    )

    out = RenderMessagesStep(recipe)(transition)
    data = out[TransitionKey.COMPLEMENTARY_DATA]

    # The raw language columns must be dropped after rendering...
    assert "language_persistent" not in data
    assert "language_events" not in data
    # ...and replaced by rendered messages plus per-turn metadata.
    assert data["messages"][-1]["content"] == "reach carefully"
    assert data["message_streams"] == ["high_level", "low_level"]
    assert data["target_message_indices"] == [1]