From d762f4bfe8b22045d13483cf7bd15a1e28099678 Mon Sep 17 00:00:00 2001
From: Caroline Pascal
Date: Fri, 10 Apr 2026 11:29:40 +0200
Subject: [PATCH] fix(dataset): adding metadata loading when reading from a dataset after writing (#3305)

* fix(one shot load): adding metadata loading when reading from a dataset after writing

* refactor(one shot load): move metadata reload to ensure_readable() on LeRobotDatasetMetadata

Move the metadata reload from DatasetReader.load_and_activate() to a new
public ensure_readable() method on LeRobotDatasetMetadata, called from
LeRobotDataset._ensure_reader(). This places lifecycle management in the
right layer: metadata owns its readiness check, the dataset orchestrates
the write-to-read transition, and the reader stays clean.

Also adds a regression test using delta_timestamps to exercise the
meta.episodes access path in the create -> write -> finalize -> read flow.

Co-authored-by: Steven Palma

---------

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Steven Palma
---
 src/lerobot/datasets/dataset_metadata.py | 10 ++++++++++
 src/lerobot/datasets/lerobot_dataset.py  |  1 +
 tests/datasets/test_lerobot_dataset.py   | 25 ++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/src/lerobot/datasets/dataset_metadata.py b/src/lerobot/datasets/dataset_metadata.py
index 65dbc9c4a..d79f4bfba 100644
--- a/src/lerobot/datasets/dataset_metadata.py
+++ b/src/lerobot/datasets/dataset_metadata.py
@@ -180,6 +180,16 @@ class LeRobotDatasetMetadata:
         self.episodes = load_episodes(self.root)
         self.stats = load_stats(self.root)
 
+    def ensure_readable(self) -> None:
+        """Guarantee metadata is fully loaded for read operations.
+
+        Idempotent — when metadata is already in memory this is a single
+        ``is None`` check. Call this before transitioning from write to
+        read mode on the same instance.
+        """
+        if self.episodes is None:
+            self._load_metadata()
+
     def _pull_from_repo(
         self,
         allow_patterns: list[str] | str | None = None,
diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index 1725046f2..2f0154cda 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -278,6 +278,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
     def _ensure_reader(self) -> DatasetReader:
         """Lazily create the reader on first access."""
         if self.reader is None:
+            self.meta.ensure_readable()
             self.reader = DatasetReader(
                 meta=self.meta,
                 root=self.root,
diff --git a/tests/datasets/test_lerobot_dataset.py b/tests/datasets/test_lerobot_dataset.py
index a8aa47ed2..5c3c24f99 100644
--- a/tests/datasets/test_lerobot_dataset.py
+++ b/tests/datasets/test_lerobot_dataset.py
@@ -535,6 +535,31 @@ def test_getitem_works_after_finalize(tmp_path):
     assert "task" in item
 
 
+def test_getitem_after_finalize_with_delta_timestamps(tmp_path):
+    """After finalize(), dataset[0] works when delta_timestamps require episode metadata.
+
+    Regression test for https://github.com/huggingface/lerobot/pull/3305.
+    The create -> write -> finalize -> read path left meta.episodes as None
+    because the write path flushes episodes to disk without updating them
+    in memory. Features that access meta.episodes (video decoding,
+    delta_timestamps) would crash with a TypeError.
+    """
+    dataset = LeRobotDataset.create(
+        repo_id=DUMMY_REPO_ID, fps=DEFAULT_FPS, features=SIMPLE_FEATURES, root=tmp_path / "ds"
+    )
+    for _ in range(5):
+        dataset.add_frame(_make_frame())
+    dataset.save_episode()
+    dataset.finalize()
+
+    # Set delta_timestamps so get_item() accesses meta.episodes via _get_query_indices
+    dataset.delta_timestamps = {"state": [0.0]}
+
+    item = dataset[0]
+    assert "state" in item
+    assert "state_is_pad" in item
+
+
 # ── Property delegation ──────────────────────────────────────────────