review: address CarolinePascal feedback

- language timestamps: float64 -> float32 to match LeRobotDataset frame
  timestamps (Arrow struct + HF feature)
- dataset_metadata: hoist `.language` imports to module top — language.py
  has no lerobot imports, so there is no circular-import risk
- dataset_metadata: add a `meta.tools` setter that persists the catalog to
  info.json and reloads `meta.info`
- feature_utils: validate the `language` dtype instead of returning "" —
  warn (non-fatal) when a non-empty value is written at record time
- centralize the scalar-unwrap helper as `lerobot.utils.utils.unwrap_scalar`,
  shared by render_messages_processor and language_render
- docs: move `## Layer 2 — recipe anatomy` ahead of the resolver sections,
  which describe recipe bindings rather than dataset layout
- language_render: note in EMITTED_AT_TOLERANCE_S that persistent rows change
  on a human-action timescale, not the camera frame rate

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Pepijn
2026-05-18 11:04:55 +02:00
parent bce5387e04
commit 949a0505a1
9 changed files with 168 additions and 46 deletions
+56
View File
@@ -466,3 +466,59 @@ def test_tools_round_trip_through_dataset_info(tmp_path):
info = DatasetInfo.from_dict(raw)
assert info.tools == raw["tools"]
assert info.to_dict()["tools"] == raw["tools"]
def test_tools_setter_persists_to_info_json_and_reloads(tmp_path):
    """Check that the ``meta.tools`` setter updates memory and ``info.json``.

    After a custom catalog is assigned, the ``meta.tools`` property, the
    ``meta.info`` object and a fresh ``load_info`` read from the dataset root
    must all report that same catalog.
    """
    from lerobot.datasets.io_utils import load_info

    dataset_root = tmp_path / "set_tools"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/set_tools",
        fps=DEFAULT_FPS,
        features=SIMPLE_FEATURES,
        root=dataset_root,
        use_videos=False,
    )

    # Assemble the function-type tool entry from its nested pieces.
    parameters_schema = {
        "type": "object",
        "properties": {"label": {"type": "string"}},
        "required": ["label"],
    }
    function_spec = {
        "name": "record_observation",
        "description": "Capture a still image.",
        "parameters": parameters_schema,
    }
    tool_entry = {"type": "function", "function": function_spec}
    expected_catalog = [tool_entry]

    # Assign a separate list object so the comparison below is by value.
    meta.tools = list(expected_catalog)

    # The property and the in-memory info object both expose the new catalog ...
    assert meta.tools == expected_catalog
    assert meta.info.tools == expected_catalog

    # ... and so does an independent read of info.json from disk.
    reloaded_info = load_info(dataset_root)
    assert reloaded_info.tools == expected_catalog
def test_tools_setter_clears_key_when_set_to_none(tmp_path):
    """Check that assigning ``None`` to ``meta.tools`` removes the stored catalog.

    A custom catalog is written first so there is something to clear. After
    the reset, ``meta.tools`` must equal ``DEFAULT_TOOLS`` and the ``info.json``
    file on disk must no longer contain a ``"tools"`` key.
    """
    from lerobot.datasets.language import DEFAULT_TOOLS

    root = tmp_path / "clear_tools"
    meta = LeRobotDatasetMetadata.create(
        repo_id="test/clear_tools",
        fps=DEFAULT_FPS,
        features=SIMPLE_FEATURES,
        root=root,
        use_videos=False,
    )

    # Write a non-default catalog, then reset it.
    meta.tools = [{"type": "function", "function": {"name": "say"}}]
    meta.tools = None

    assert meta.tools == DEFAULT_TOOLS

    # Read the raw JSON (not via the metadata object) to inspect the actual keys.
    # The encoding is explicit so the read does not depend on the platform locale.
    with open(root / INFO_PATH, encoding="utf-8") as f:
        info_on_disk = json.load(f)
    assert "tools" not in info_on_disk
+17
View File
@@ -45,6 +45,23 @@ def test_language_arrow_schema_has_expected_fields():
assert isinstance(event_row_type, pa.StructType)
assert event_row_type.names == ["role", "content", "style", "camera", "tool_calls"]
# Persistent-row timestamps use float32, matching LeRobotDataset frame timestamps.
assert persistent_row_type.field("timestamp").type == pa.float32()
def test_validate_feature_language_warns_only_on_non_empty_value(caplog):
    """Check that ``validate_feature_language`` warns only for non-empty values.

    In both cases the function must return an empty string; the only
    difference is whether a log record mentioning the feature name is emitted.
    """
    from lerobot.datasets.feature_utils import validate_feature_language

    feature_name = "language_persistent"

    # None is the expected record-time value: empty result and no log records.
    with caplog.at_level("WARNING"):
        result_for_none = validate_feature_language(feature_name, None)
        assert result_for_none == ""
    assert caplog.records == []

    # A stray non-empty value is dropped later, so it warns instead of failing.
    with caplog.at_level("WARNING"):
        result_for_rows = validate_feature_language(feature_name, [{"role": "user"}])
        assert result_for_rows == ""
    logged_messages = [record.message for record in caplog.records]
    assert any(feature_name in text for text in logged_messages)
def test_style_registry_routes_columns():
assert {"subtask", "plan", "memory", "motion", "task_aug"} == PERSISTENT_STYLES