fix(annotate): sync language metadata after parquet rewrite

Ensure annotated datasets advertise their language columns in meta/info.json so that non-streaming dataset loads can cast rows against the rewritten parquet schema.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
pepijn
2026-05-04 15:17:15 +00:00
parent 73740ecf4b
commit 8fa8323c91
2 changed files with 79 additions and 14 deletions
+52
View File
@@ -167,6 +167,7 @@ def test_writer_column_routing(fixture_dataset_root: Path, tmp_path: Path) -> No
"content": "where is the cup?",
"style": "vqa",
"timestamp": 0.4,
"camera": "observation.images.front",
"tool_calls": None,
},
{
@@ -177,6 +178,7 @@ def test_writer_column_routing(fixture_dataset_root: Path, tmp_path: Path) -> No
),
"style": "vqa",
"timestamp": 0.4,
"camera": "observation.images.front",
"tool_calls": None,
},
],
@@ -285,6 +287,56 @@ def test_writer_does_not_add_tools_column(fixture_dataset_root: Path, tmp_path:
assert "tools" not in table.column_names
def test_annotation_metadata_sync_allows_non_streaming_load(
    fixture_dataset_root: Path, tmp_path: Path
) -> None:
    """Annotated parquet columns must be declared in ``meta/info.json``.

    ``LeRobotDataset`` loads non-streaming datasets by casting parquet
    against metadata-derived HF features. If the annotation writer adds
    language columns but metadata stays stale, that cast fails with a column
    mismatch.
    """
    from lerobot.annotations.steerable_pipeline.executor import Executor
    from lerobot.datasets.feature_utils import get_hf_features_from_features
    from lerobot.datasets.io_utils import load_info, load_nested_dataset
    from lerobot.datasets.language import LANGUAGE_EVENTS, LANGUAGE_PERSISTENT, language_feature_info

    # Simulate stale metadata: overwrite info.json features with only the
    # base columns, i.e. no language columns declared.
    meta_info_path = fixture_dataset_root / "meta" / "info.json"
    raw_info = json.loads(meta_info_path.read_text())
    raw_info["features"] = {
        name: {"dtype": dtype, "shape": (1,), "names": None}
        for name, dtype in (
            ("episode_index", "int64"),
            ("frame_index", "int64"),
            ("timestamp", "float32"),
            ("task_index", "int64"),
        )
    }
    meta_info_path.write_text(json.dumps(raw_info, indent=2))

    # Stage a single annotated episode and rewrite the parquet files so
    # they now carry language columns the stale metadata does not declare.
    stage_root = tmp_path / "stage"
    _stage_episode(
        stage_root,
        0,
        module_1=[
            {"role": "assistant", "content": "do X", "style": "subtask", "timestamp": 0.0, "tool_calls": None}
        ],
    )
    episode_records = list(iter_episodes(fixture_dataset_root))
    LanguageColumnsWriter().write_all(episode_records, stage_root, fixture_dataset_root)

    # The executor must reconcile info.json with the rewritten parquet schema.
    Executor._ensure_annotation_metadata_in_info(fixture_dataset_root)

    refreshed_info = load_info(fixture_dataset_root)
    for key, feature in language_feature_info().items():
        assert refreshed_info["features"][key] == feature

    # A metadata-derived cast over the rewritten parquet must now succeed.
    cast_features = get_hf_features_from_features(refreshed_info["features"])
    loaded = load_nested_dataset(fixture_dataset_root / "data", features=cast_features)
    assert LANGUAGE_PERSISTENT in loaded.column_names
    assert LANGUAGE_EVENTS in loaded.column_names
    assert len(loaded) == 24
def test_speech_atom_shape_matches_plan_spec() -> None:
atom = speech_atom(2.5, "I'm cleaning up!")
assert atom["role"] == "assistant"