Mirror of https://github.com/huggingface/lerobot.git (synced 2026-05-16 09:09:48 +00:00).
fix(annotate): sync language metadata after parquet rewrite
Ensure annotated datasets advertise language columns in meta/info.json so non-streaming dataset loads cast against the rewritten parquet schema. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -167,6 +167,7 @@ def test_writer_column_routing(fixture_dataset_root: Path, tmp_path: Path) -> No
|
||||
"content": "where is the cup?",
|
||||
"style": "vqa",
|
||||
"timestamp": 0.4,
|
||||
"camera": "observation.images.front",
|
||||
"tool_calls": None,
|
||||
},
|
||||
{
|
||||
@@ -177,6 +178,7 @@ def test_writer_column_routing(fixture_dataset_root: Path, tmp_path: Path) -> No
|
||||
),
|
||||
"style": "vqa",
|
||||
"timestamp": 0.4,
|
||||
"camera": "observation.images.front",
|
||||
"tool_calls": None,
|
||||
},
|
||||
],
|
||||
@@ -285,6 +287,56 @@ def test_writer_does_not_add_tools_column(fixture_dataset_root: Path, tmp_path:
|
||||
assert "tools" not in table.column_names
|
||||
|
||||
|
||||
def test_annotation_metadata_sync_allows_non_streaming_load(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """Annotated parquet columns must be declared in ``meta/info.json``.

    ``LeRobotDataset`` loads non-streaming datasets by casting parquet
    against metadata-derived HF features. If the annotation writer adds
    language columns but metadata stays stale, that cast fails with a column
    mismatch.
    """
    from lerobot.annotations.steerable_pipeline.executor import Executor
    from lerobot.datasets.feature_utils import get_hf_features_from_features
    from lerobot.datasets.io_utils import load_info, load_nested_dataset
    from lerobot.datasets.language import LANGUAGE_EVENTS, LANGUAGE_PERSISTENT, language_feature_info

    # Rewrite the fixture's metadata so it only declares the base scalar
    # features — i.e. deliberately *stale* with respect to the annotation
    # columns the writer is about to add.
    def scalar_feature(dtype: str) -> dict:
        # One scalar per-frame feature entry as stored in meta/info.json.
        return {"dtype": dtype, "shape": (1,), "names": None}

    meta_info_path = fixture_dataset_root / "meta" / "info.json"
    meta_info = json.loads(meta_info_path.read_text())
    meta_info["features"] = {
        "episode_index": scalar_feature("int64"),
        "frame_index": scalar_feature("int64"),
        "timestamp": scalar_feature("float32"),
        "task_index": scalar_feature("int64"),
    }
    meta_info_path.write_text(json.dumps(meta_info, indent=2))

    # Stage one annotated episode and run the writer so the parquet files
    # gain the language columns.
    stage_root = tmp_path / "stage"
    _stage_episode(
        stage_root,
        0,
        module_1=[
            {"role": "assistant", "content": "do X", "style": "subtask", "timestamp": 0.0, "tool_calls": None}
        ],
    )
    episode_records = list(iter_episodes(fixture_dataset_root))
    LanguageColumnsWriter().write_all(episode_records, stage_root, fixture_dataset_root)

    # The step under test: sync meta/info.json with the rewritten parquet.
    Executor._ensure_annotation_metadata_in_info(fixture_dataset_root)

    # Metadata must now advertise every language feature verbatim.
    refreshed_info = load_info(fixture_dataset_root)
    for feature_key, expected_feature in language_feature_info().items():
        assert refreshed_info["features"][feature_key] == expected_feature

    # A non-streaming load casts parquet against the metadata-derived HF
    # features; with synced metadata this must succeed.
    cast_features = get_hf_features_from_features(refreshed_info["features"])
    loaded = load_nested_dataset(fixture_dataset_root / "data", features=cast_features)

    assert LANGUAGE_PERSISTENT in loaded.column_names
    assert LANGUAGE_EVENTS in loaded.column_names
    assert len(loaded) == 24
|
||||
|
||||
|
||||
def test_speech_atom_shape_matches_plan_spec() -> None:
|
||||
atom = speech_atom(2.5, "I'm cleaning up!")
|
||||
assert atom["role"] == "assistant"
|
||||
|
||||
Reference in New Issue
Block a user