From a19bd6e84dfdb7307911add568300190705a809c Mon Sep 17 00:00:00 2001 From: Steven Palma Date: Wed, 5 Nov 2025 21:08:23 +0100 Subject: [PATCH] try fix 3 --- src/lerobot/datasets/lerobot_dataset.py | 76 ++++++++++++++++++++++++- src/lerobot/datasets/utils.py | 28 --------- 2 files changed, 73 insertions(+), 31 deletions(-) diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 38181990c..d9eae49a1 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -19,6 +19,7 @@ import shutil import tempfile from collections.abc import Callable from pathlib import Path +from typing import Any import datasets import numpy as np @@ -31,6 +32,8 @@ import torch import torch.utils from huggingface_hub import HfApi, snapshot_download from huggingface_hub.errors import RevisionNotFoundError +from PIL import Image as PILImage +from torchvision import transforms from lerobot.datasets.compute_stats import aggregate_stats, compute_episode_stats from lerobot.datasets.image_writer import AsyncImageWriter, write_image @@ -50,11 +53,9 @@ from lerobot.datasets.utils import ( get_file_size_in_mb, get_hf_features_from_features, get_safe_version, - hf_transform_to_torch, is_valid_version, load_episodes, load_info, - load_nested_dataset, load_stats, load_tasks, update_chunk_file_indices, @@ -832,8 +833,50 @@ class LeRobotDataset(torch.utils.data.Dataset): def load_hf_dataset(self) -> datasets.Dataset: """hf_dataset contains all the observations, states, actions, rewards, etc.""" + + # We MUST import this here to avoid circular dependency + # (utils imports lerobot_dataset for backward_compatibility) + from lerobot.datasets.utils import hf_transform_to_torch + features = get_hf_features_from_features(self.features) - hf_dataset = load_nested_dataset(self.root / "data", features=features) + + # This is the v2.1 logic that forces an efficient, pre-decoded cache build. + # This is the key to performance for dtype="image" datasets. + + # 1. Check if specific episodes are requested by the user. + if self.episodes is not None: + # Get the unique set of parquet files for the requested episodes + fpaths = set() + for ep_idx in self.episodes: + # Need to read metadata to find the file path for this episode + # We use self.meta.episodes (the loaded dataset) here + ep_meta = self.meta.episodes[ep_idx] + chunk_idx = ep_meta["data/chunk_index"] + file_idx = ep_meta["data/file_index"] + fpath_str = self.meta.data_path.format(chunk_index=chunk_idx, file_index=file_idx) + fpaths.add(str(self.root / fpath_str)) + + data_files = sorted(list(fpaths)) + + hf_dataset = datasets.load_dataset( + "parquet", data_files=data_files, features=features, split="train" + ) + + # Filter the loaded dataset to *only* include the requested episodes + # This is necessary because the v3 files contain multiple episodes. + requested_episodes_set = set(self.episodes) + hf_dataset = hf_dataset.filter( + lambda x: x["episode_index"] in requested_episodes_set, + batched=True, # Use batched=True for faster filtering + batch_size=1000, + ) + + else: + # THIS IS THE FAST PATH FOR TRAINING (self.episodes is None) + # Load all data files using data_dir, which is the most efficient. + data_dir = str(self.root / "data") + hf_dataset = datasets.load_dataset("parquet", data_dir=data_dir, features=features, split="train") + hf_dataset.set_transform(hf_transform_to_torch) return hf_dataset @@ -1675,3 +1718,30 @@ class MultiLeRobotDataset(torch.utils.data.Dataset): f" Transformations: {self.image_transforms},\n" f")" ) + + +def hf_transform_to_torch(items_dict: dict[str, list[Any]]) -> dict[str, list[torch.Tensor | str]]: + """Convert a batch from a Hugging Face dataset to torch tensors. + + This transform function converts items from Hugging Face dataset format (pyarrow) + to torch tensors. Importantly, images are converted from PIL objects (H, W, C, uint8) + to a torch image representation (C, H, W, float32) in the range [0, 1]. Other + types are converted to torch.tensor. + + Args: + items_dict (dict): A dictionary representing a batch of data from a + Hugging Face dataset. + + Returns: + dict: The batch with items converted to torch tensors. + """ + for key in items_dict: + first_item = items_dict[key][0] + if isinstance(first_item, PILImage.Image): + to_tensor = transforms.ToTensor() + items_dict[key] = [to_tensor(img) for img in items_dict[key]] + elif first_item is None: + pass + else: + items_dict[key] = [x if isinstance(x, str) else torch.tensor(x) for x in items_dict[key]] + return items_dict diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index 0dd226719..a24ce2287 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -35,7 +35,6 @@ from datasets.table import embed_table_storage from huggingface_hub import DatasetCard, DatasetCardData, HfApi from huggingface_hub.errors import RevisionNotFoundError from PIL import Image as PILImage -from torchvision import transforms from lerobot.configs.types import FeatureType, PolicyFeature from lerobot.datasets.backward_compatibility import ( @@ -399,33 +398,6 @@ def load_image_as_numpy( return img_array -def hf_transform_to_torch(items_dict: dict[str, list[Any]]) -> dict[str, list[torch.Tensor | str]]: - """Convert a batch from a Hugging Face dataset to torch tensors. - - This transform function converts items from Hugging Face dataset format (pyarrow) - to torch tensors. Importantly, images are converted from PIL objects (H, W, C, uint8) - to a torch image representation (C, H, W, float32) in the range [0, 1]. Other - types are converted to torch.tensor. - - Args: - items_dict (dict): A dictionary representing a batch of data from a - Hugging Face dataset. - - Returns: - dict: The batch with items converted to torch tensors. - """ - for key in items_dict: - first_item = items_dict[key][0] - if isinstance(first_item, PILImage.Image): - to_tensor = transforms.ToTensor() - items_dict[key] = [to_tensor(img) for img in items_dict[key]] - elif first_item is None: - pass - else: - items_dict[key] = [x if isinstance(x, str) else torch.tensor(x) for x in items_dict[key]] - return items_dict - - def is_valid_version(version: str) -> bool: """Check if a string is a valid PEP 440 version.