mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-11 14:49:43 +00:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 03cce79c88 | |||
| 3918ab7882 | |||
| 65b0e73ae4 | |||
| ca7c5fcdfe | |||
| 28f8098df4 | |||
| db7d501281 | |||
| 88380fe34e | |||
| 154abfd233 | |||
| dc14266762 | |||
| fd623e0cc5 | |||
| a52e88d349 |
@@ -0,0 +1,464 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
BehaviorLeRobotDatasetV3: A wrapper around LeRobotDataset v3.0 for loading BEHAVIOR-1K data.
|
||||
|
||||
This wrapper extends LeRobotDataset to support BEHAVIOR-1K specific features:
|
||||
- Modality and camera selection (rgb, depth, seg_instance_id)
|
||||
- Efficient chunk streaming mode with keyframe access
|
||||
- Additional BEHAVIOR-1K metadata (cam_rel_poses, task_info, etc.)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
from behaviour_1k_constants import ROBOT_CAMERA_NAMES, ROBOT_TYPE
|
||||
from torch.utils.data import Dataset, get_worker_info
|
||||
|
||||
from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset, LeRobotDatasetMetadata
|
||||
from lerobot.datasets.utils import (
|
||||
check_delta_timestamps,
|
||||
get_delta_indices,
|
||||
get_safe_version,
|
||||
hf_transform_to_torch,
|
||||
)
|
||||
from lerobot.datasets.video_utils import decode_video_frames, get_safe_default_codec
|
||||
from lerobot.utils.constants import HF_LEROBOT_HOME
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BehaviorLeRobotDatasetMetadata(LeRobotDatasetMetadata):
    """
    Extended metadata class for BEHAVIOR-1K datasets.

    Adds support for:
    - Modality and camera filtering (rgb, depth, seg_instance_id x head/wrist cameras)
    - Custom metainfo and annotation paths
    """

    def __init__(
        self,
        repo_id: str,
        root: str | Path | None = None,
        revision: str | None = None,
        force_cache_sync: bool = False,
        metadata_buffer_size: int = 10,
        modalities: set[str] | None = None,
        cameras: set[str] | None = None,
    ):
        # Default to every modality/camera when the caller does not restrict them.
        self.modalities = set(modalities) if modalities else {"rgb", "depth", "seg_instance_id"}
        self.camera_names = set(cameras) if cameras else {"head", "left_wrist", "right_wrist"}

        assert self.modalities.issubset({"rgb", "depth", "seg_instance_id"}), (
            f"Modalities must be subset of ['rgb', 'depth', 'seg_instance_id'], got {self.modalities}"
        )

        assert self.camera_names.issubset(set(ROBOT_CAMERA_NAMES[ROBOT_TYPE])), (
            f"Camera names must be subset of {list(ROBOT_CAMERA_NAMES[ROBOT_TYPE])}, got {self.camera_names}"
        )

        super().__init__(repo_id, root, revision, force_cache_sync, metadata_buffer_size)

    @property
    def filtered_features(self) -> dict[str, dict]:
        """Return only features matching selected modalities and cameras.

        Non-image features are always kept; image features are kept only when
        both their modality and camera are selected.
        """
        features = {}
        for name, feature_info in self.features.items():
            if not name.startswith("observation.images."):
                features[name] = feature_info
                continue

            # Image keys look like "observation.images.<modality>.<camera>".
            parts = name.split(".")
            if len(parts) >= 4:
                modality = parts[2]
                camera = parts[3]
                if modality in self.modalities and camera in self.camera_names:
                    features[name] = feature_info

        return features

    @property
    def video_keys(self) -> list[str]:
        """Return only video keys for selected modalities and cameras."""
        all_video_keys = super().video_keys

        filtered_keys = []
        for key in all_video_keys:
            parts = key.split(".")
            if len(parts) >= 4:
                modality = parts[2]
                camera = parts[3]
                if modality in self.modalities and camera in self.camera_names:
                    filtered_keys.append(key)

        return filtered_keys

    # FIX: both path getters were annotated "-> Path" but return None when the
    # corresponding key is absent from the info dict; annotate as Path | None.
    def get_metainfo_path(self, ep_index: int) -> Path | None:
        """Get path to episode metainfo file, or None if the dataset has none."""
        if "metainfo_path" in self.info:
            fpath = self.info["metainfo_path"].format(episode_index=ep_index)
            return Path(fpath)
        return None

    def get_annotation_path(self, ep_index: int) -> Path | None:
        """Get path to episode annotation file, or None if the dataset has none."""
        if "annotation_path" in self.info:
            fpath = self.info["annotation_path"].format(episode_index=ep_index)
            return Path(fpath)
        return None
|
||||
|
||||
|
||||
class BehaviorLeRobotDatasetV3(LeRobotDataset):
    """
    BEHAVIOR-1K wrapper for LeRobotDataset v3.0.

    Each BEHAVIOR-1K dataset contains a single task (e.g., behavior1k-task0000).
    See https://huggingface.co/collections/lerobot/behavior-1k for all available tasks.

    Key features:
    - Modality and camera selection
    - Efficient chunk streaming with keyframe access (recommended for B1K with GOP=250)
    - Support for BEHAVIOR-1K specific observations (cam_rel_poses, task_info, task_index)

    Note: in chunk-streaming mode, ``__getitem__`` ignores the requested index and
    walks frames sequentially within keyframe-aligned chunks (see _get_item_streaming).
    """
|
||||
|
||||
def __init__(
    self,
    repo_id: str,
    root: str | Path | None = None,
    episodes: list[int] | None = None,
    image_transforms: Callable | None = None,
    # FIX: annotation was the malformed "dict[list[float]]"; a delta-timestamps
    # mapping is keyed by feature name -> list of time offsets in seconds.
    delta_timestamps: dict[str, list[float]] | None = None,
    tolerance_s: float = 1e-4,
    revision: str | None = None,
    force_cache_sync: bool = False,
    download_videos: bool = True,
    video_backend: str | None = None,
    batch_encoding_size: int = 1,
    # BEHAVIOR-1K specific arguments
    modalities: list[str] | None = None,
    cameras: list[str] | None = None,
    check_timestamp_sync: bool = True,
    chunk_streaming_using_keyframe: bool = True,
    shuffle: bool = True,
    seed: int = 42,
):
    """
    Initialize BEHAVIOR-1K dataset.

    Args:
        repo_id: HuggingFace repository ID (e.g., "lerobot/behavior1k-task0000")
        root: Local directory for dataset storage
        episodes: List of episode indices to load (for train/val split)
        image_transforms: Torchvision v2 transforms for images
        delta_timestamps: Temporal offsets for history/future frames, keyed by feature name
        tolerance_s: Tolerance for timestamp synchronization
        revision: Git revision/branch to load
        force_cache_sync: Force re-download from hub
        download_videos: Whether to download video files
        video_backend: Video decoder ('pyav' or 'torchcodec')
        batch_encoding_size: Batch size for video encoding
        modalities: List of modalities to load (None = all: rgb, depth, seg_instance_id)
        cameras: List of cameras to load (None = all: head, left_wrist, right_wrist)
        check_timestamp_sync: Verify timestamp synchronization (can be slow).
            NOTE(review): accepted for API compatibility but not used anywhere in
            this initializer — confirm whether a sync check was meant to run here.
        chunk_streaming_using_keyframe: Use keyframe-based streaming (STRONGLY RECOMMENDED for B1K)
        shuffle: Shuffle chunks in streaming mode
        seed: Random seed for shuffling
    """
    # Bypass LeRobotDataset.__init__ on purpose: this wrapper re-implements the
    # setup sequence with modality/camera filtering, so only torch Dataset init runs.
    Dataset.__init__(self)

    self.repo_id = repo_id
    if root:
        self.root = Path(root)
    else:
        # Derive the local cache directory from the repo id (org prefix stripped).
        dataset_name = repo_id.split("/")[-1] if "/" in repo_id else repo_id
        self.root = HF_LEROBOT_HOME / dataset_name

    self.image_transforms = image_transforms
    self.delta_timestamps = delta_timestamps
    self.tolerance_s = tolerance_s
    self.revision = revision if revision else CODEBASE_VERSION
    self.video_backend = video_backend if video_backend else get_safe_default_codec()
    self.delta_indices = None
    self.batch_encoding_size = batch_encoding_size
    self.episodes_since_last_encoding = 0
    self.seed = seed

    # Writer-side attributes expected by the LeRobotDataset interface; this
    # wrapper is read-only, so they stay None.
    self.image_writer = None
    self.episode_buffer = None
    self.writer = None
    self.latest_episode = None
    self._current_file_start_frame = None

    self.root.mkdir(exist_ok=True, parents=True)

    if modalities is None:
        modalities = ["rgb", "depth", "seg_instance_id"]
    if "seg_instance_id" in modalities:
        # Segmentation streams are encoded so that only keyframe-aligned reads are fast.
        assert chunk_streaming_using_keyframe, (
            "For performance, seg_instance_id requires chunk_streaming_using_keyframe=True"
        )
    if "depth" in modalities:
        assert self.video_backend == "pyav", "Depth videos require video_backend='pyav'"
    if cameras is None:
        cameras = ["head", "left_wrist", "right_wrist"]

    self.meta = BehaviorLeRobotDatasetMetadata(
        repo_id=self.repo_id,
        root=self.root,
        revision=self.revision,
        force_cache_sync=force_cache_sync,
        modalities=modalities,
        cameras=cameras,
    )

    if episodes is not None:
        # Silently drop requested indices that are out of range.
        self.episodes = sorted([i for i in episodes if i < len(self.meta.episodes)])
    else:
        self.episodes = list(range(len(self.meta.episodes)))

    logger.info(f"Total episodes: {len(self.episodes)}")

    self._chunk_streaming_using_keyframe = chunk_streaming_using_keyframe
    if self._chunk_streaming_using_keyframe:
        if not shuffle:
            logger.warning("Chunk streaming enabled but shuffle=False. This may reduce randomness.")
        self.chunks = self._get_keyframe_chunk_indices()
        # None means "lazily initialize per worker on first access" (see _get_item_streaming).
        self.current_streaming_chunk_idx = None if shuffle else 0
        self.current_streaming_frame_idx = None if shuffle else self.chunks[0][0] if self.chunks else 0
        self.obs_loaders = {}
        self._should_obs_loaders_reload = True

    self._lazy_loading = False
    self._recorded_frames = self.meta.total_frames
    self._writer_closed_for_reading = False

    try:
        if force_cache_sync:
            # Jump straight to the download path below.
            raise FileNotFoundError
        self.hf_dataset = self.load_hf_dataset()
    except (AssertionError, FileNotFoundError, NotADirectoryError):
        self.revision = get_safe_version(self.repo_id, self.revision)
        self.download_episodes(download_videos)
        self.hf_dataset = self.load_hf_dataset()

    if self.delta_timestamps is not None:
        check_delta_timestamps(self.delta_timestamps, self.meta.fps, self.tolerance_s)
        self.delta_indices = get_delta_indices(self.delta_timestamps, self.meta.fps)
|
||||
|
||||
@property
def fps(self) -> int:
    """Capture rate of the dataset, in frames per second."""
    return self.meta.fps

@property
def features(self) -> dict:
    """Feature schema restricted to the selected modalities and cameras."""
    return self.meta.filtered_features

@property
def num_episodes(self) -> int:
    """How many episodes were selected at construction time."""
    return len(self.episodes)

@property
def num_frames(self) -> int:
    """How many frames the loaded HF dataset exposes in total."""
    return len(self.hf_dataset)
|
||||
|
||||
def get_episodes_file_paths(self) -> list[str]:
    """
    Get download patterns for requested episodes.

    Returns glob patterns for download rather than specific file paths.

    Note: Unlike the base LeRobotDataset, this method cannot filter downloads to only
    requested episodes because:
    1. BEHAVIOR-1K episode indices are encoded (e.g., 10010 for task 1, episode 10)
    2. Episodes are chunked across multiple parquet/video files
    3. The parquet files are organized by chunk, not by episode

    Therefore, we download full data/meta/video directories and rely on
    `self.load_hf_dataset()` to filter to requested episodes from the loaded data.
    """
    patterns = ["data/**", "meta/**"]

    if len(self.meta.video_keys) > 0:
        # All 3 modalities and all 3 cameras selected -> no per-stream filtering needed.
        wants_everything = len(self.meta.modalities) == 3 and len(self.meta.camera_names) == 3
        if wants_everything:
            patterns.append("videos/**")
        else:
            # Restrict to the requested modality/camera video directories.
            patterns.extend(
                f"**/observation.images.{modality}.{camera}/**"
                for modality in self.meta.modalities
                for camera in self.meta.camera_names
            )

    return patterns
|
||||
|
||||
def download_episodes(self, download_videos: bool = True) -> None:
    """
    Download episodes with modality/camera filtering.

    Mirrors base LeRobotDataset.download(), but the allow-patterns come from
    get_episodes_file_paths(), which applies modality/camera filtering.
    """
    skip_videos = None if download_videos else "videos/"
    self.pull_from_repo(
        allow_patterns=self.get_episodes_file_paths(),
        ignore_patterns=skip_videos,
    )
|
||||
|
||||
def pull_from_repo(
    self,
    allow_patterns: list[str] | str | None = None,
    ignore_patterns: list[str] | str | None = None,
) -> None:
    """Pull dataset from HuggingFace Hub into self.root."""
    # Imported lazily so module import does not require hub connectivity setup.
    from huggingface_hub import snapshot_download

    logger.info(f"Pulling dataset {self.repo_id} from HuggingFace Hub...")
    snapshot_download(
        self.repo_id,
        repo_type="dataset",
        revision=self.revision,
        local_dir=self.root,
        allow_patterns=allow_patterns,
        ignore_patterns=ignore_patterns,
    )
|
||||
|
||||
def load_hf_dataset(self) -> datasets.Dataset:
    """Load the frame table from the local parquet files under root/data."""
    from datasets import load_dataset

    parquet_dir = self.root / "data"
    hf_dataset = load_dataset("parquet", data_dir=str(parquet_dir), split="train")
    # Convert rows to torch tensors on access.
    hf_dataset.set_transform(hf_transform_to_torch)
    return hf_dataset
|
||||
|
||||
def _get_keyframe_chunk_indices(self, chunk_size: int = 250) -> list[tuple[int, int, int]]:
    """
    Divide episodes into chunks based on GOP size (keyframe interval).

    For BEHAVIOR-1K, GOP size is 250 frames for efficient storage.

    Returns:
        List of (start_index, end_index, local_start_index) tuples, where the
        first two are global frame indices and the last is the offset within
        the episode.
    """
    chunks: list[tuple[int, int, int]] = []
    global_offset = 0

    # self.episodes holds array indices into self.meta.episodes.
    for ep_array_idx in self.episodes:
        ep_length = self.meta.episodes[ep_array_idx]["length"]

        # Walk the episode in keyframe-aligned windows of chunk_size frames.
        local_start = 0
        while local_start < ep_length:
            local_end = min(local_start + chunk_size, ep_length)
            chunks.append((global_offset + local_start, global_offset + local_end, local_start))
            local_start = local_end

        global_offset += ep_length

    return chunks
|
||||
|
||||
def __getitem__(self, idx: int) -> dict:
    """Get item by index; in streaming mode the index is delegated to the chunk walker."""
    if self._chunk_streaming_using_keyframe:
        return self._get_item_streaming(idx)

    item = self.hf_dataset[idx]

    # Decode one frame per selected video stream at this row's timestamp.
    for key in self.meta.video_keys:
        if key not in self.features:
            continue
        ep_idx = item["episode_index"].item()
        ts = item["timestamp"].item()
        video_path = self.root / self.meta.get_video_file_path(ep_idx, key)
        decoded = decode_video_frames(video_path, [ts], self.tolerance_s, self.video_backend)
        item[key] = decoded.squeeze(0)

    if self.image_transforms is not None:
        for key in self.features:
            if key.startswith("observation.images."):
                item[key] = self.image_transforms(item[key])

    # Resolve the task label; fall back to a synthetic name if lookup fails.
    if "task_index" in item:
        task_idx = item["task_index"].item()
        try:
            item["task"] = self.meta.tasks.iloc[task_idx].name
        except (IndexError, AttributeError):
            item["task"] = f"task_{task_idx}"

    return item
|
||||
|
||||
def _get_item_streaming(self, idx: int) -> dict:
    """Get the next frame in chunk streaming mode (the requested idx is ignored;
    frames are served sequentially within keyframe-aligned chunks)."""
    if self.current_streaming_chunk_idx is None:
        # First access in this worker: seed a per-worker RNG, shuffle the chunk
        # order, and start from a random chunk.
        worker_info = get_worker_info()
        worker_id = 0 if worker_info is None else worker_info.id
        rng = np.random.default_rng(self.seed + worker_id)
        rng.shuffle(self.chunks)
        self.current_streaming_chunk_idx = rng.integers(0, len(self.chunks)).item()
        self.current_streaming_frame_idx = self.chunks[self.current_streaming_chunk_idx][0]

    # Exhausted the current chunk: advance (wrapping around) and flag loader reload.
    chunk_end = self.chunks[self.current_streaming_chunk_idx][1]
    if self.current_streaming_frame_idx >= chunk_end:
        self.current_streaming_chunk_idx = (self.current_streaming_chunk_idx + 1) % len(self.chunks)
        self.current_streaming_frame_idx = self.chunks[self.current_streaming_chunk_idx][0]
        self._should_obs_loaders_reload = True

    item = self.hf_dataset[self.current_streaming_frame_idx]
    ep_idx = item["episode_index"].item()

    if self._should_obs_loaders_reload:
        # Close any open per-episode observation loaders before switching chunks.
        for loader in self.obs_loaders.values():
            if hasattr(loader, "close"):
                loader.close()
        self.obs_loaders = {}
        self.current_streaming_episode_idx = ep_idx
        self._should_obs_loaders_reload = False

    for key in self.meta.video_keys:
        if key in self.features:
            ts = item["timestamp"].item()
            video_path = self.root / self.meta.get_video_file_path(ep_idx, key)
            item[key] = decode_video_frames(
                video_path, [ts], self.tolerance_s, self.video_backend
            ).squeeze(0)

    if self.image_transforms is not None:
        for key in self.features:
            if key.startswith("observation.images."):
                item[key] = self.image_transforms(item[key])

    # Resolve the task label; fall back to a synthetic name if lookup fails.
    if "task_index" in item:
        task_idx = item["task_index"].item()
        try:
            item["task"] = self.meta.tasks.iloc[task_idx].name
        except (IndexError, AttributeError):
            item["task"] = f"task_{task_idx}"

    self.current_streaming_frame_idx += 1
    return item
|
||||
|
||||
def __len__(self) -> int:
    """Number of frames addressable through this dataset."""
    return len(self.hf_dataset)
|
||||
@@ -0,0 +1,350 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import torch as th
|
||||
|
||||
# Robot platform these constants describe.
ROBOT_TYPE = "R1Pro"
# Capture rate of all recorded streams, in frames per second.
FPS = 30

# Maps a short camera alias to the fully-qualified sensor name on each robot.
ROBOT_CAMERA_NAMES = {
    "A1": {
        "external": "external::external_camera",
        "wrist": "external::wrist_camera",
    },
    "R1Pro": {
        "left_wrist": "robot_r1::robot_r1:left_realsense_link:Camera:0",
        "right_wrist": "robot_r1::robot_r1:right_realsense_link:Camera:0",
        "head": "robot_r1::robot_r1:zed_link:Camera:0",
    },
}

# Camera resolutions and corresponding intrinsics
HEAD_RESOLUTION = (720, 720)
WRIST_RESOLUTION = (480, 480)
# TODO: Fix A1
# 3x3 pinhole intrinsic matrices [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
CAMERA_INTRINSICS = {
    "A1": {
        "external": np.array(
            [[306.0, 0.0, 360.0], [0.0, 306.0, 360.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 240x240  NOTE(review): cx/cy=360 suggests 720x720, not 240x240 — see TODO above
        "wrist": np.array(
            [[388.6639, 0.0, 240.0], [0.0, 388.6639, 240.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 240x240  NOTE(review): cx/cy=240 suggests 480x480 — confirm
    },
    "R1Pro": {
        "head": np.array(
            [[306.0, 0.0, 360.0], [0.0, 306.0, 360.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 720x720
        "left_wrist": np.array(
            [[388.6639, 0.0, 240.0], [0.0, 388.6639, 240.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 480x480
        "right_wrist": np.array(
            [[388.6639, 0.0, 240.0], [0.0, 388.6639, 240.0], [0.0, 0.0, 1.0]], dtype=np.float32
        ),  # 480x480
    },
}
|
||||
|
||||
|
||||
# Dataset features for BEHAVIOR-1K LeRobotDataset v3.0
# Each entry maps a feature key to its dtype, per-frame shape, and axis names.
BEHAVIOR_DATASET_FEATURES = {
    # Actions
    "action": {
        "dtype": "float32",
        "shape": (23,),  # 23-dimensional action space for R1Pro
        "names": None,
    },
    # Proprioception
    "observation.state": {
        "dtype": "float32",
        "shape": (256,),  # Full proprioception state (see PROPRIOCEPTION_INDICES layout)
        "names": None,
    },
    # Camera relative poses
    "observation.cam_rel_poses": {
        "dtype": "float32",
        "shape": (21,),  # 3 cameras * 7 (pos + quat)
        "names": None,
    },
    # Task information
    "observation.task_info": {
        "dtype": "float32",
        "shape": (None,),  # Variable size
        "names": None,
    },
    # RGB images (3-channel video streams)
    "observation.images.rgb.head": {
        "dtype": "video",
        "shape": [720, 720, 3],
        "names": ["height", "width", "channels"],
    },
    "observation.images.rgb.left_wrist": {
        "dtype": "video",
        "shape": [480, 480, 3],
        "names": ["height", "width", "channels"],
    },
    "observation.images.rgb.right_wrist": {
        "dtype": "video",
        "shape": [480, 480, 3],
        "names": ["height", "width", "channels"],
    },
    # Depth images (single-channel video streams)
    "observation.images.depth.head": {
        "dtype": "video",
        "shape": [720, 720, 1],
        "names": ["height", "width", "channels"],
    },
    "observation.images.depth.left_wrist": {
        "dtype": "video",
        "shape": [480, 480, 1],
        "names": ["height", "width", "channels"],
    },
    "observation.images.depth.right_wrist": {
        "dtype": "video",
        "shape": [480, 480, 1],
        "names": ["height", "width", "channels"],
    },
    # Segmentation instance ID images (single-channel video streams)
    "observation.images.seg_instance_id.head": {
        "dtype": "video",
        "shape": [720, 720, 1],
        "names": ["height", "width", "channels"],
    },
    "observation.images.seg_instance_id.left_wrist": {
        "dtype": "video",
        "shape": [480, 480, 1],
        "names": ["height", "width", "channels"],
    },
    "observation.images.seg_instance_id.right_wrist": {
        "dtype": "video",
        "shape": [480, 480, 1],
        "names": ["height", "width", "channels"],
    },
}
|
||||
|
||||
|
||||
# Action indices
# Slices into the flat action vector, keyed by robot part; OrderedDict preserves
# the layout order of the action space.
ACTION_QPOS_INDICES = {
    "A1": OrderedDict(
        {
            "arm": np.s_[0:6],
            "gripper": np.s_[6:7],
        }
    ),
    "R1Pro": OrderedDict(
        {
            "base": np.s_[0:3],
            "torso": np.s_[3:7],
            "left_arm": np.s_[7:14],
            "left_gripper": np.s_[14:15],
            "right_arm": np.s_[15:22],
            "right_gripper": np.s_[22:23],
        }
    ),
}
|
||||
|
||||
|
||||
# Proprioception configuration
# Slices into the flat proprioception vector ("observation.state") for each robot.
PROPRIOCEPTION_INDICES = {
    "A1": OrderedDict(
        {
            "joint_qpos": np.s_[0:8],
            "joint_qpos_sin": np.s_[8:16],
            "joint_qpos_cos": np.s_[16:24],
            "joint_qvel": np.s_[24:32],
            "joint_qeffort": np.s_[32:40],
            "eef_0_pos": np.s_[40:43],
            "eef_0_quat": np.s_[43:47],
            "grasp_0": np.s_[47:48],
            "gripper_0_qpos": np.s_[48:50],
            "gripper_0_qvel": np.s_[50:52],
        }
    ),
    "R1Pro": OrderedDict(
        {
            # Full robot joint positions, the first 6 are base joints, which is NOT allowed in standard track
            "joint_qpos": np.s_[0:28],
            # Full robot joint positions, the first 6 are base joints, which is NOT allowed in standard track
            "joint_qpos_sin": np.s_[28:56],
            # Full robot joint positions, the first 6 are base joints, which is NOT allowed in standard track
            "joint_qpos_cos": np.s_[56:84],
            "joint_qvel": np.s_[84:112],
            "joint_qeffort": np.s_[112:140],
            "robot_pos": np.s_[140:143],  # Global pos, this is NOT allowed in standard track
            "robot_ori_cos": np.s_[143:146],  # Global ori, this is NOT allowed in standard track
            "robot_ori_sin": np.s_[146:149],  # Global ori, this is NOT allowed in standard track
            "robot_2d_ori": np.s_[149:150],  # 2D global ori, this is NOT allowed in standard track
            "robot_2d_ori_cos": np.s_[150:151],  # 2D global ori, this is NOT allowed in standard track
            "robot_2d_ori_sin": np.s_[151:152],  # 2D global ori, this is NOT allowed in standard track
            "robot_lin_vel": np.s_[152:155],
            "robot_ang_vel": np.s_[155:158],
            "arm_left_qpos": np.s_[158:165],
            "arm_left_qpos_sin": np.s_[165:172],
            "arm_left_qpos_cos": np.s_[172:179],
            "arm_left_qvel": np.s_[179:186],
            "eef_left_pos": np.s_[186:189],
            "eef_left_quat": np.s_[189:193],
            "gripper_left_qpos": np.s_[193:195],
            "gripper_left_qvel": np.s_[195:197],
            "arm_right_qpos": np.s_[197:204],
            "arm_right_qpos_sin": np.s_[204:211],
            "arm_right_qpos_cos": np.s_[211:218],
            "arm_right_qvel": np.s_[218:225],
            "eef_right_pos": np.s_[225:228],
            "eef_right_quat": np.s_[228:232],
            "gripper_right_qpos": np.s_[232:234],
            "gripper_right_qvel": np.s_[234:236],
            "trunk_qpos": np.s_[236:240],
            "trunk_qvel": np.s_[240:244],
            "base_qpos": np.s_[244:247],  # Base joint position, this is NOT allowed in standard track
            "base_qpos_sin": np.s_[247:250],  # Base joint position, this is NOT allowed in standard track
            "base_qpos_cos": np.s_[250:253],  # Base joint position, this is NOT allowed in standard track
            "base_qvel": np.s_[253:256],
        }
    ),
}
|
||||
|
||||
# Proprioception indices
# Slices into the robot's raw joint-position vector, keyed by robot part.
PROPRIO_QPOS_INDICES = {
    "A1": OrderedDict(
        {
            "arm": np.s_[0:6],
            "gripper": np.s_[6:8],
        }
    ),
    "R1Pro": OrderedDict(
        {
            "torso": np.s_[6:10],
            # Strided slices: left/right arm joints are interleaved in qpos.
            "left_arm": np.s_[10:24:2],
            "right_arm": np.s_[11:24:2],
            "left_gripper": np.s_[24:26],
            "right_gripper": np.s_[26:28],
        }
    ),
}
|
||||
|
||||
|
||||
# Joint limits (lower, upper)
# Each entry is a (lower_bound, upper_bound) pair of float32 tensors, one value per joint.
JOINT_RANGE = {
    "A1": {
        "arm": (
            th.tensor([-2.8798, 0.0, -3.3161, -2.8798, -1.6581, -2.8798], dtype=th.float32),
            th.tensor([2.8798, 3.1415, 0.0, 2.8798, 1.6581, 2.8798], dtype=th.float32),
        ),
        "gripper": (th.tensor([0.00], dtype=th.float32), th.tensor([0.03], dtype=th.float32)),
    },
    "R1Pro": {
        "base": (
            th.tensor([-0.75, -0.75, -1.0], dtype=th.float32),
            th.tensor([0.75, 0.75, 1.0], dtype=th.float32),
        ),
        "torso": (
            th.tensor([-1.1345, -2.7925, -1.8326, -3.0543], dtype=th.float32),
            th.tensor([1.8326, 2.5307, 1.5708, 3.0543], dtype=th.float32),
        ),
        "left_arm": (
            th.tensor([-4.4506, -0.1745, -2.3562, -2.0944, -2.3562, -1.0472, -1.5708], dtype=th.float32),
            th.tensor([1.3090, 3.1416, 2.3562, 0.3491, 2.3562, 1.0472, 1.5708], dtype=th.float32),
        ),
        "left_gripper": (th.tensor([0.00], dtype=th.float32), th.tensor([0.05], dtype=th.float32)),
        "right_arm": (
            th.tensor([-4.4506, -3.1416, -2.3562, -2.0944, -2.3562, -1.0472, -1.5708], dtype=th.float32),
            th.tensor([1.3090, 0.1745, 2.3562, 0.3491, 2.3562, 1.0472, 1.5708], dtype=th.float32),
        ),
        "right_gripper": (th.tensor([0.00], dtype=th.float32), th.tensor([0.05], dtype=th.float32)),
    },
}
|
||||
|
||||
|
||||
# End-effector position bounds as (lower, upper) float32 tensor pairs, per end effector.
EEF_POSITION_RANGE = {
    "A1": {
        "0": (th.tensor([0.0, -0.7, 0.0], dtype=th.float32), th.tensor([0.7, 0.7, 0.7], dtype=th.float32)),
    },
    "R1Pro": {
        "left": (
            th.tensor([0.0, -0.65, 0.0], dtype=th.float32),
            th.tensor([0.65, 0.65, 2.5], dtype=th.float32),
        ),
        "right": (
            th.tensor([0.0, -0.65, 0.0], dtype=th.float32),
            th.tensor([0.65, 0.65, 2.5], dtype=th.float32),
        ),
    },
}
|
||||
|
||||
|
||||
# BEHAVIOR-1K task name -> task index, grouped by benchmark split (B10/B20/...).
TASK_NAMES_TO_INDICES = {
    # B10
    "turning_on_radio": 0,
    "picking_up_trash": 1,
    "putting_away_Halloween_decorations": 2,
    "cleaning_up_plates_and_food": 3,
    "can_meat": 4,
    "setting_mousetraps": 5,
    "hiding_Easter_eggs": 6,
    "picking_up_toys": 7,
    "rearranging_kitchen_furniture": 8,
    "putting_up_Christmas_decorations_inside": 9,
    # B20
    "set_up_a_coffee_station_in_your_kitchen": 10,
    "putting_dishes_away_after_cleaning": 11,
    "preparing_lunch_box": 12,
    "loading_the_car": 13,
    "carrying_in_groceries": 14,
    "bringing_in_wood": 15,
    "moving_boxes_to_storage": 16,
    "bringing_water": 17,
    "tidying_bedroom": 18,
    "outfit_a_basic_toolbox": 19,
    # B30
    "sorting_vegetables": 20,
    "collecting_childrens_toys": 21,
    "putting_shoes_on_rack": 22,
    "boxing_books_up_for_storage": 23,
    "storing_food": 24,
    "clearing_food_from_table_into_fridge": 25,
    "assembling_gift_baskets": 26,
    "sorting_household_items": 27,
    "getting_organized_for_work": 28,
    "clean_up_your_desk": 29,
    # B40
    "setting_the_fire": 30,
    "clean_boxing_gloves": 31,
    "wash_a_baseball_cap": 32,
    "wash_dog_toys": 33,
    "hanging_pictures": 34,
    "attach_a_camera_to_a_tripod": 35,
    "clean_a_patio": 36,
    "clean_a_trumpet": 37,
    "spraying_for_bugs": 38,
    "spraying_fruit_trees": 39,
    # B50
    "make_microwave_popcorn": 40,
    "cook_cabbage": 41,
    "chop_an_onion": 42,
    "slicing_vegetables": 43,
    "chopping_wood": 44,
    "cook_hot_dogs": 45,
    "cook_bacon": 46,
    "freeze_pies": 47,
    "canning_food": 48,
    "make_pizza": 49,
}
# Reverse lookup: task index -> task name.
TASK_INDICES_TO_NAMES = {v: k for k, v in TASK_NAMES_TO_INDICES.items()}
|
||||
+605
@@ -0,0 +1,605 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert Behavior Dataset to LeRobotDataset v3.0 format"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import jsonlines
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import tqdm
|
||||
from datasets import Dataset, Features, Image
|
||||
|
||||
from lerobot.datasets.compute_stats import aggregate_stats
|
||||
from lerobot.datasets.utils import (
|
||||
DEFAULT_CHUNK_SIZE,
|
||||
DEFAULT_DATA_FILE_SIZE_IN_MB,
|
||||
DEFAULT_DATA_PATH,
|
||||
DEFAULT_VIDEO_FILE_SIZE_IN_MB,
|
||||
DEFAULT_VIDEO_PATH,
|
||||
LEGACY_EPISODES_PATH,
|
||||
LEGACY_EPISODES_STATS_PATH,
|
||||
LEGACY_TASKS_PATH,
|
||||
cast_stats_to_numpy,
|
||||
flatten_dict,
|
||||
get_file_size_in_mb,
|
||||
get_parquet_file_size_in_mb,
|
||||
get_parquet_num_frames,
|
||||
load_info,
|
||||
update_chunk_file_indices,
|
||||
write_episodes,
|
||||
write_info,
|
||||
write_stats,
|
||||
write_tasks,
|
||||
)
|
||||
from lerobot.datasets.video_utils import concatenate_video_files, get_video_duration_in_s
|
||||
from lerobot.utils.utils import init_logging
|
||||
|
||||
# script to convert one single task to v3.1
|
||||
# TASK = 1
|
||||
NEW_ROOT = Path("/fsx/jade_choghari/tmp/bb")
|
||||
|
||||
|
||||
def get_total_episodes_task(local_dir: Path, task_id: int, task_ranges: dict, step: int) -> int:
    """
    Calculates the total number of episodes for a single, specified task.

    Args:
        local_dir: Root path containing the legacy meta/episodes.jsonl.
        task_id: Key into ``task_ranges`` identifying the task.
        task_ranges: Mapping produced by ``infer_task_episode_ranges``.
        step: Episode index step between consecutive episodes of a task.
    """
    # Simply load the episodes for the task and count them.
    episodes = legacy_load_episodes_task(
        local_dir=local_dir, task_id=task_id, task_ranges=task_ranges, step=step
    )
    return len(episodes)
|
||||
|
||||
|
||||
NUM_CAMERAS = 9
|
||||
|
||||
|
||||
def get_total_frames_task(local_dir, meta_path, task_id: int, task_ranges: dict, step: int) -> int:
    """Return the total number of frames across all episodes of one task.

    NOTE(review): ``meta_path`` is currently unused — the counts come from the
    per-episode ``length`` field of the legacy episodes metadata.
    """
    task_episodes = legacy_load_episodes_task(
        local_dir=local_dir, task_id=task_id, task_ranges=task_ranges, step=step
    )
    # "length" holds the episode's frame count (duration-like value).
    return sum(int(ep["length"]) for ep in task_episodes.values())
|
||||
|
||||
|
||||
def convert_info(
    root, new_root, data_file_size_in_mb, video_file_size_in_mb, meta_path, task_id: int, task_ranges, step
):
    """Write an updated info.json for the converted single-task v3.0 dataset.

    Loads the legacy info from ``root``, rewrites version/path/size fields,
    recomputes the totals for the selected task only, and writes the result
    under ``new_root``.
    NOTE(review): ``meta_path`` is not used here — the totals are derived from
    the legacy episodes metadata loaded via ``root``.
    """
    info = load_info(root)
    info["codebase_version"] = "v3.0"
    # Removed then recomputed below from the per-task episode count.
    del info["total_videos"]
    info["data_files_size_in_mb"] = data_file_size_in_mb
    info["video_files_size_in_mb"] = video_file_size_in_mb
    info["data_path"] = DEFAULT_DATA_PATH
    info["video_path"] = DEFAULT_VIDEO_PATH if info["video_path"] is not None else None
    info["fps"] = int(info["fps"])
    # Propagate fps to every non-video feature.
    for key in info["features"]:
        if info["features"][key]["dtype"] == "video":
            # already has fps in video_info
            continue
        info["features"][key]["fps"] = info["fps"]

    # Restrict the totals to the single task being converted.
    info["total_episodes"] = get_total_episodes_task(root, task_id, task_ranges, step)
    info["total_videos"] = info["total_episodes"] * NUM_CAMERAS
    info["total_frames"] = get_total_frames_task(root, meta_path, task_id, task_ranges, step)
    info["total_tasks"] = 1
    write_info(info, new_root)
|
||||
|
||||
|
||||
def load_jsonlines(fpath: Path) -> list[dict]:
    """Read a JSON Lines file and return its records as a list of dicts.

    BUGFIX: the return annotation was ``list[any]`` — ``any`` is the builtin
    function, not a type. Each line of the jsonl metadata files is a JSON
    object, so ``list[dict]`` is the accurate annotation.
    """
    with jsonlines.open(fpath, "r") as reader:
        return list(reader)
|
||||
|
||||
|
||||
def legacy_load_tasks(local_dir: Path) -> tuple[dict, dict]:
    """Load the legacy tasks.jsonl and return (index -> task, task -> index) maps."""
    raw_tasks = load_jsonlines(local_dir / LEGACY_TASKS_PATH)
    raw_tasks.sort(key=lambda item: item["task_index"])
    # Build the forward map first, then invert it.
    tasks = {}
    for item in raw_tasks:
        tasks[item["task_index"]] = item["task"]
    task_to_task_index = {name: idx for idx, name in tasks.items()}
    return tasks, task_to_task_index
|
||||
|
||||
|
||||
def convert_tasks(root, new_root, task_id: int):
    """Write a tasks table for ``new_root`` containing only the selected task."""
    all_tasks, _ = legacy_load_tasks(root)
    if task_id not in all_tasks:
        raise ValueError(f"Task ID {task_id} not found in tasks (available: {list(all_tasks.keys())})")
    # Keep a single entry: the task being converted.
    selected = {task_id: all_tasks[task_id]}
    df_tasks = pd.DataFrame({"task_index": list(selected.keys())}, index=list(selected.values()))
    write_tasks(df_tasks, new_root)
|
||||
|
||||
|
||||
def concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys):
    """Merge several per-episode parquet files into one aggregated data file.

    Args:
        paths_to_cat: Parquet files to merge, in episode order.
        new_root: Root of the converted dataset.
        chunk_idx: Target chunk index, used to format the output path.
        file_idx: Target file index, used to format the output path.
        image_keys: Feature keys that must be stored with the HF ``Image`` type.
    """
    # TODO(rcadene): to save RAM use Dataset.from_parquet(file) and concatenate_datasets
    dataframes = [pd.read_parquet(file) for file in paths_to_cat]
    # Concatenate all DataFrames along rows
    concatenated_df = pd.concat(dataframes, ignore_index=True)

    path = new_root / DEFAULT_DATA_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
    path.parent.mkdir(parents=True, exist_ok=True)
    if len(image_keys) > 0:
        # Re-type the image columns in the arrow schema so they round-trip
        # as HF Image features rather than raw bytes/structs.
        schema = pa.Schema.from_pandas(concatenated_df)
        features = Features.from_arrow_schema(schema)
        for key in image_keys:
            features[key] = Image()
        schema = features.arrow_schema
    else:
        schema = None

    concatenated_df.to_parquet(path, index=False, schema=schema)
|
||||
|
||||
|
||||
def get_image_keys(root):
    """Return the feature keys whose dtype is "image" in the legacy info."""
    features = load_info(root)["features"]
    return [key for key, ft in features.items() if ft["dtype"] == "image"]
|
||||
|
||||
|
||||
def convert_data(root: Path, new_root: Path, data_file_size_in_mb: int, task_index: int):
    """Aggregate one task's per-episode parquet files into size-bounded data files.

    Episodes are accumulated until ``data_file_size_in_mb`` is reached, then the
    accumulation is flushed and a new file is started with the current episode.

    Args:
        root: Legacy dataset root (expects data/task-00X/*.parquet).
        new_root: Output root for the converted dataset.
        data_file_size_in_mb: Maximum size per aggregated data file.
        task_index: Task to convert; selects the data/task-00X directory.

    Returns:
        list[dict]: Per-episode metadata (chunk/file indices, global frame range).
    """
    # NOTE(review): this zero-padding only works for task_index < 10 — confirm
    # the on-disk naming convention for larger task indices.
    task_dir_name = f"task-00{task_index}"
    data_dir = root / "data" / task_dir_name
    ep_paths = sorted(data_dir.glob("*.parquet"))
    image_keys = get_image_keys(root)

    ep_idx = 0
    chunk_idx = 0
    file_idx = 0
    size_in_mb = 0
    num_frames = 0
    paths_to_cat = []
    episodes_metadata = []

    logging.info(f"Converting data files from {len(ep_paths)} episodes")

    for ep_path in tqdm.tqdm(ep_paths, desc="convert data files"):
        ep_size_in_mb = get_parquet_file_size_in_mb(ep_path)
        ep_num_frames = get_parquet_num_frames(ep_path)
        ep_metadata = {
            "episode_index": ep_idx,
            "data/chunk_index": chunk_idx,
            "data/file_index": file_idx,
            # Global (dataset-wide) frame range of this episode.
            "dataset_from_index": num_frames,
            "dataset_to_index": num_frames + ep_num_frames,
        }
        size_in_mb += ep_size_in_mb
        num_frames += ep_num_frames
        episodes_metadata.append(ep_metadata)
        ep_idx += 1

        if size_in_mb < data_file_size_in_mb:
            paths_to_cat.append(ep_path)
            continue

        # Size limit reached: flush the previously accumulated episodes
        # (WITHOUT the current one) into the current chunk/file.
        if paths_to_cat:
            concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)

        # Reset for the next file; the current episode starts the new file.
        size_in_mb = ep_size_in_mb
        paths_to_cat = [ep_path]

        chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE)

        # BUGFIX: the current episode is written to the *new* chunk/file, but its
        # metadata above was recorded with the old indices — update it to point
        # at the file it will actually land in (mirrors what
        # convert_videos_of_camera does when it patches metadata after a flush).
        ep_metadata["data/chunk_index"] = chunk_idx
        ep_metadata["data/file_index"] = file_idx

    # Write remaining data if any
    if paths_to_cat:
        concat_data_files(paths_to_cat, new_root, chunk_idx, file_idx, image_keys)

    return episodes_metadata
|
||||
|
||||
|
||||
def convert_videos_of_camera(
    root: Path, new_root: Path, video_key: str, video_file_size_in_mb: int, task_index: int
):
    """Concatenate one camera's per-episode mp4s into size-bounded video files.

    Episodes accumulate into the current output file until adding the next one
    would exceed ``video_file_size_in_mb``; the accumulation is then flushed and
    the chunk/file indices advance. Each episode's metadata records the
    chunk/file it landed in and its [from, to] timestamps within that file.

    Returns:
        list[dict]: One metadata dict per episode for this camera.
    """
    # Access old paths to mp4
    # videos_dir = root / "videos"
    # ep_paths = sorted(videos_dir.glob(f"*/{video_key}/*.mp4"))
    # NOTE(review): this zero-padding only works for task_index < 10.
    task_dir_name = f"task-00{task_index}"
    videos_dir = root / "videos" / task_dir_name / video_key
    ep_paths = sorted(videos_dir.glob("*.mp4"))
    print("ep_paths", ep_paths)
    ep_idx = 0
    chunk_idx = 0
    file_idx = 0
    size_in_mb = 0
    duration_in_s = 0.0
    paths_to_cat = []
    episodes_metadata = []

    for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"):
        ep_size_in_mb = get_file_size_in_mb(ep_path)
        ep_duration_in_s = get_video_duration_in_s(ep_path)

        # Check if adding this episode would exceed the limit
        if size_in_mb + ep_size_in_mb >= video_file_size_in_mb and len(paths_to_cat) > 0:
            # Size limit would be exceeded, save current accumulation WITHOUT this episode
            concatenate_video_files(
                paths_to_cat,
                new_root
                / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
            )

            # Update episodes metadata for the file we just saved
            for i, _ in enumerate(paths_to_cat):
                past_ep_idx = ep_idx - len(paths_to_cat) + i
                episodes_metadata[past_ep_idx][f"videos/{video_key}/chunk_index"] = chunk_idx
                episodes_metadata[past_ep_idx][f"videos/{video_key}/file_index"] = file_idx

            # Move to next file and start fresh with current episode
            chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, DEFAULT_CHUNK_SIZE)
            size_in_mb = 0
            duration_in_s = 0.0
            paths_to_cat = []

        # Add current episode metadata
        ep_metadata = {
            "episode_index": ep_idx,
            f"videos/{video_key}/chunk_index": chunk_idx,  # Will be updated when file is saved
            f"videos/{video_key}/file_index": file_idx,  # Will be updated when file is saved
            f"videos/{video_key}/from_timestamp": duration_in_s,
            f"videos/{video_key}/to_timestamp": duration_in_s + ep_duration_in_s,
        }
        episodes_metadata.append(ep_metadata)

        # Add current episode to accumulation
        paths_to_cat.append(ep_path)
        size_in_mb += ep_size_in_mb
        duration_in_s += ep_duration_in_s
        ep_idx += 1

    # Write remaining videos if any
    if paths_to_cat:
        concatenate_video_files(
            paths_to_cat,
            new_root
            / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
        )

        # Update episodes metadata for the final file
        for i, _ in enumerate(paths_to_cat):
            past_ep_idx = ep_idx - len(paths_to_cat) + i
            episodes_metadata[past_ep_idx][f"videos/{video_key}/chunk_index"] = chunk_idx
            episodes_metadata[past_ep_idx][f"videos/{video_key}/file_index"] = file_idx

    return episodes_metadata
|
||||
|
||||
|
||||
def get_video_keys(root):
    """Return the feature keys whose dtype is "video" in the legacy info."""
    features = load_info(root)["features"]
    return [key for key, ft in features.items() if ft["dtype"] == "video"]
|
||||
|
||||
|
||||
def convert_videos(root: Path, new_root: Path, video_file_size_in_mb: int, task_id: int):
    """Convert every camera's videos and merge the per-camera episode metadata.

    Returns None when the dataset has no video features; otherwise a list with
    one merged metadata dict per episode.
    """
    logging.info(f"Converting videos from {root} to {new_root}")

    video_keys = get_video_keys(root)
    if len(video_keys) == 0:
        return None

    video_keys = sorted(video_keys)

    # Convert each camera independently, collecting its per-episode metadata.
    eps_metadata_per_cam = [
        convert_videos_of_camera(root, new_root, camera, video_file_size_in_mb, task_id)
        for camera in video_keys
    ]

    num_eps_per_cam = [len(eps_cam_map) for eps_cam_map in eps_metadata_per_cam]
    if len(set(num_eps_per_cam)) != 1:
        raise ValueError(f"All cams dont have same number of episodes ({num_eps_per_cam}).")

    merged_metadata = []
    num_cameras = len(video_keys)
    num_episodes = num_eps_per_cam[0]
    for ep_idx in tqdm.tqdm(range(num_episodes), desc="convert videos"):
        per_cam = [eps_metadata_per_cam[cam_idx][ep_idx] for cam_idx in range(num_cameras)]

        # Sanity check: every camera must agree on the episode index.
        ep_ids = [cam_meta["episode_index"] for cam_meta in per_cam]
        ep_ids += [ep_idx]
        if len(set(ep_ids)) != 1:
            raise ValueError(f"All episode indices need to match ({ep_ids}).")

        ep_dict = {}
        for cam_meta in per_cam:
            ep_dict.update(cam_meta)
        merged_metadata.append(ep_dict)

    return merged_metadata
|
||||
|
||||
|
||||
def infer_task_episode_ranges(episodes_jsonl_path: Path) -> dict:
    """
    Parse the Behavior-1K episodes.jsonl metadata and infer contiguous episode ranges per unique task.

    Returns a dict:
        { task_id: { "task_string": ..., "ep_start": ..., "ep_end": ... } }
    """
    task_ranges = {}
    # Open group as a (task_string, ep_start, ep_end) triple, or None.
    current = None

    with open(episodes_jsonl_path) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            ep = json.loads(stripped)
            ep_idx = ep["episode_index"]
            task_str = ep["tasks"][0] if ep["tasks"] else "UNKNOWN"

            if current is not None and task_str == current[0]:
                # Same task: extend the open range.
                current = (current[0], current[1], ep_idx)
            else:
                if current is not None:
                    # Task changed: close the open group under the next id.
                    task_ranges[len(task_ranges)] = {
                        "task_string": current[0],
                        "ep_start": current[1],
                        "ep_end": current[2],
                    }
                # Start a new group at this episode.
                current = (task_str, ep_idx, ep_idx)

    # Close the last open group, if any.
    if current is not None:
        task_ranges[len(task_ranges)] = {
            "task_string": current[0],
            "ep_start": current[1],
            "ep_end": current[2],
        }

    return task_ranges
|
||||
|
||||
|
||||
def legacy_load_episodes_task(local_dir: Path, task_id: int, task_ranges: dict, step: int = 10) -> dict:
    """
    Load only the episodes belonging to a specific task, inferred automatically from episode ranges.

    Args:
        local_dir (Path): Root path containing legacy meta/episodes.jsonl
        task_id (int): Which task to load (key from the inferred task_ranges dict)
        task_ranges (dict): Mapping from infer_task_episode_ranges()
        step (int): Episode index step (Behavior-1K = 10)
    """
    all_episodes = legacy_load_episodes(local_dir)

    # get the range for this task
    if task_id not in task_ranges:
        raise ValueError(f"Task id {task_id} not found in task_ranges")

    bounds = task_ranges[task_id]
    wanted = range(bounds["ep_start"], bounds["ep_end"] + step, step)
    return {idx: all_episodes[idx] for idx in wanted if idx in all_episodes}
|
||||
|
||||
|
||||
def legacy_load_episodes(local_dir: Path) -> dict:
    """Load legacy episodes.jsonl as {episode_index: record}, sorted by index."""
    records = load_jsonlines(local_dir / LEGACY_EPISODES_PATH)
    records.sort(key=lambda item: item["episode_index"])
    return {item["episode_index"]: item for item in records}
|
||||
|
||||
|
||||
def legacy_load_episodes_stats(local_dir: Path) -> dict:
    """Load legacy per-episode stats as {episode_index: numpy stats}, sorted by index."""
    records = load_jsonlines(local_dir / LEGACY_EPISODES_STATS_PATH)
    records.sort(key=lambda item: item["episode_index"])
    return {item["episode_index"]: cast_stats_to_numpy(item["stats"]) for item in records}
|
||||
|
||||
|
||||
def legacy_load_episodes_stats_task(local_dir: Path, task_id: int, task_ranges: dict, step: int = 10) -> dict:
    """Load the per-episode stats restricted to the episodes of one task."""
    all_stats = legacy_load_episodes_stats(local_dir)

    if task_id not in task_ranges:
        raise ValueError(f"Task id {task_id} not found in task_ranges")

    bounds = task_ranges[task_id]
    wanted = range(bounds["ep_start"], bounds["ep_end"] + step, step)
    return {idx: all_stats[idx] for idx in wanted if idx in all_stats}
|
||||
|
||||
|
||||
def generate_episode_metadata_dict(
    episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None
):
    """Yield one merged metadata dict per episode for the v3 episodes table.

    Merges, per episode: the data-file metadata, the optional video metadata,
    the legacy metadata, and the flattened stats. Later sources win on key
    collisions (dict-unpacking order in ``ep_dict`` below).

    Args:
        episodes_legacy_metadata: dict of legacy episode records (task-filtered).
        episodes_metadata: list of per-episode data metadata from convert_data().
        episodes_stats: dict of per-episode stats (task-filtered).
        episodes_videos: optional list of per-episode video metadata.
    """
    num_episodes = len(episodes_metadata)
    episodes_legacy_metadata_vals = list(episodes_legacy_metadata.values())
    episodes_stats_vals = list(episodes_stats.values())
    episodes_stats_keys = list(episodes_stats.keys())

    for i in range(num_episodes):
        ep_legacy_metadata = episodes_legacy_metadata_vals[i]
        ep_metadata = episodes_metadata[i]
        ep_stats = episodes_stats_vals[i]

        # Collected only for the (disabled) consistency check below.
        ep_ids_set = {
            ep_legacy_metadata["episode_index"],
            ep_metadata["episode_index"],
            episodes_stats_keys[i],
        }

        if episodes_videos is None:
            ep_video = {}
        else:
            ep_video = episodes_videos[i]
            ep_ids_set.add(ep_video["episode_index"])
        # we skip this check because ep_ids have a step of 10, whereas we convert with a step of 1
        # if len(ep_ids_set) != 1:
        #     raise ValueError(f"Number of episodes is not the same ({ep_ids_set}).")

        ep_dict = {**ep_metadata, **ep_video, **ep_legacy_metadata, **flatten_dict({"stats": ep_stats})}
        # All episode metadata lives in the first meta/episodes chunk/file.
        ep_dict["meta/episodes/chunk_index"] = 0
        ep_dict["meta/episodes/file_index"] = 0
        yield ep_dict
|
||||
|
||||
|
||||
def convert_episodes_metadata(
    root, new_root, episodes_metadata, task_id: int, task_ranges, episodes_video_metadata=None
):
    """Build and write the v3 episodes table plus aggregated stats for one task."""
    logging.info(f"Converting episodes metadata from {root} to {new_root}")

    # Restrict the legacy metadata and stats to the episodes of this task.
    episodes_legacy_metadata = legacy_load_episodes_task(root, task_id=task_id, task_ranges=task_ranges)
    episodes_stats = legacy_load_episodes_stats_task(root, task_id=task_id, task_ranges=task_ranges)

    # Every metadata source must describe the same number of episodes.
    num_eps_set = {len(episodes_legacy_metadata), len(episodes_metadata)}
    if episodes_video_metadata is not None:
        num_eps_set.add(len(episodes_video_metadata))
    if len(num_eps_set) != 1:
        raise ValueError(f"Number of episodes is not the same ({num_eps_set}).")

    def _episode_rows():
        return generate_episode_metadata_dict(
            episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_video_metadata
        )

    write_episodes(Dataset.from_generator(_episode_rows), new_root)

    # Aggregate the per-episode stats into dataset-level stats.
    write_stats(aggregate_stats(list(episodes_stats.values())), new_root)
|
||||
|
||||
|
||||
def convert_dataset_local(
    data_path: Path,
    new_repo: Path,
    task_id: int,
    data_file_size_in_mb: int = DEFAULT_DATA_FILE_SIZE_IN_MB,
    video_file_size_in_mb: int = DEFAULT_VIDEO_FILE_SIZE_IN_MB,
    force_conversion: bool = False,
):
    """
    Convert a local dataset to v3.x format, task-by-task, without using the Hugging Face Hub.

    Args:
        data_path (Path): path to local dataset root (e.g. /fsx/.../2025-challenge-demos)
        new_repo (Path): path where converted dataset will be written (e.g. /fsx/.../behavior1k_v3)
        task_id (int): which task to convert (index)
        data_file_size_in_mb (int): max size per data chunk
        video_file_size_in_mb (int): max size per video chunk
        force_conversion (bool): overwrite existing conversion if True
    """

    root = Path(data_path)
    new_root = Path(new_repo)

    # Clean up if needed
    if new_root.exists() and force_conversion:
        shutil.rmtree(new_root)
    new_root.mkdir(parents=True, exist_ok=True)

    print(f"🔹 Starting conversion for task {task_id}")
    print(f"Input root: {root}")
    print(f"Output root: {new_root}")
    # Infer task episode ranges
    episodes_meta_path = root / "meta" / "episodes.jsonl"
    task_ranges = infer_task_episode_ranges(episodes_meta_path)
    # Order matters: info/tasks first, then data and videos (which produce the
    # per-episode chunk/file metadata), and finally the episodes table that
    # merges that metadata.
    convert_info(
        root,
        new_root,
        data_file_size_in_mb,
        video_file_size_in_mb,
        episodes_meta_path,
        task_id,
        task_ranges,
        step=10,
    )
    convert_tasks(root, new_root, task_id)
    episodes_metadata = convert_data(root, new_root, data_file_size_in_mb, task_index=task_id)
    episodes_videos_metadata = convert_videos(root, new_root, video_file_size_in_mb, task_id=task_id)
    convert_episodes_metadata(
        root,
        new_root,
        episodes_metadata,
        task_id=task_id,
        task_ranges=task_ranges,
        episodes_video_metadata=episodes_videos_metadata,
    )

    print(f"✅ Conversion complete for task {task_id}")
    print(f"Converted dataset written to: {new_root}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point for the single-task conversion.
    # NOTE: the duplicate in-block `import argparse` / `from pathlib import Path`
    # were removed — both are already imported at module level.
    init_logging()

    parser = argparse.ArgumentParser(
        description="Convert Behavior-1K tasks to LeRobot v3 format (local only)"
    )
    parser.add_argument(
        "--data-path",
        type=str,
        required=True,
        help="Path to the local Behavior-1K dataset (e.g. /fsx/francesco_capuano/.cache/behavior-1k/2025-challenge-demos)",
    )
    parser.add_argument(
        "--new-repo",
        type=str,
        required=True,
        help="Path to the output directory for the converted dataset",
    )
    parser.add_argument(
        "--task-id",
        type=int,
        required=True,
        help="Task index to convert (e.g. 0, 1, 2, ...)",
    )
    parser.add_argument(
        "--data-file-size-in-mb",
        type=int,
        default=DEFAULT_DATA_FILE_SIZE_IN_MB,
        help=f"Maximum size per data chunk (default: {DEFAULT_DATA_FILE_SIZE_IN_MB})",
    )
    parser.add_argument(
        "--video-file-size-in-mb",
        type=int,
        default=DEFAULT_VIDEO_FILE_SIZE_IN_MB,
        help=f"Maximum size per video chunk (default: {DEFAULT_VIDEO_FILE_SIZE_IN_MB})",
    )
    parser.add_argument(
        "--force-conversion",
        action="store_true",
        help="Force overwrite of existing conversion output if present.",
    )

    args = parser.parse_args()

    convert_dataset_local(
        data_path=Path(args.data_path),
        new_repo=Path(args.new_repo),
        task_id=args.task_id,
        data_file_size_in_mb=args.data_file_size_in_mb,
        video_file_size_in_mb=args.video_file_size_in_mb,
        force_conversion=args.force_conversion,
    )
|
||||
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Test script to verify BEHAVIOR-1K dataset loading with v3.0 wrapper.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
from behavior_lerobot_dataset_v3 import BehaviorLeRobotDatasetV3
|
||||
|
||||
from lerobot.utils.utils import init_logging
|
||||
|
||||
init_logging()
|
||||
|
||||
|
||||
def load_behavior1k_dataset(repo_id, root):
    """Test basic dataset loading.

    Loads the dataset with a single modality (rgb) and camera (head) in
    non-streaming mode, logs basic statistics, and returns the dataset.
    """
    logging.info("=" * 80)
    logging.info("Testing BEHAVIOR-1K dataset loading")
    logging.info("=" * 80)

    logging.info(f"\n1. Loading dataset with repo_id: {repo_id}")
    dataset = BehaviorLeRobotDatasetV3(
        repo_id=repo_id,
        root=root,
        modalities=["rgb"],
        cameras=["head"],
        chunk_streaming_using_keyframe=False,
        check_timestamp_sync=False,
    )

    logging.info("\n2. Dataset loaded successfully!")
    logging.info(f" - Number of episodes: {dataset.num_episodes}")
    logging.info(f" - Number of frames: {dataset.num_frames}")
    logging.info(f" - FPS: {dataset.fps}")
    logging.info(f" - Features: {list(dataset.features)}")

    return dataset
|
||||
|
||||
|
||||
def load_behavior1k_dataset_with_multiple_modalities(repo_id, root):
    """Test loading multiple modalities and cameras.

    Loads rgb + depth for three cameras and logs the resulting feature keys.
    """
    logging.info("\n" + "=" * 80)
    # BUGFIX: this message was missing its f-prefix, so the literal text
    # "{repo_id}" was logged instead of the actual value.
    logging.info(f"Testing multi-modality loading with repo_id: {repo_id}")
    logging.info("=" * 80)

    logging.info(f"\n1. Loading dataset with RGB + Depth with repo_id: {repo_id}")
    dataset = BehaviorLeRobotDatasetV3(
        repo_id=repo_id,
        root=root,
        modalities=["rgb", "depth"],
        cameras=["head", "left_wrist", "right_wrist"],
        chunk_streaming_using_keyframe=False,
        check_timestamp_sync=False,
        video_backend="pyav",
    )

    logging.info(f"\n2. Dataset loaded with modalities: {list(dataset.features)}")
    logging.info(f" - Total features: {len(dataset.features)}")

    rgb_keys = [k for k in dataset.features if "rgb" in k]
    depth_keys = [k for k in dataset.features if "depth" in k]
    logging.info(f" - RGB features: {rgb_keys}")
    logging.info(f" - Depth features: {depth_keys}")

    logging.info("\n3. SUCCESS! Multi-modality loading works.")

    return dataset
|
||||
|
||||
|
||||
def stream_behavior1k_dataset(repo_id, root):
    """Test chunk streaming mode.

    Loads the dataset with keyframe-based chunk streaming and deterministic
    shuffling, then touches the first few frames to exercise access.
    """
    logging.info("\n" + "=" * 80)
    logging.info("Testing chunk streaming mode")
    logging.info("=" * 80)

    logging.info("\n1. Loading dataset with chunk streaming...")
    dataset = BehaviorLeRobotDatasetV3(
        repo_id=repo_id,
        root=root,
        modalities=["rgb"],
        cameras=["head"],
        chunk_streaming_using_keyframe=True,
        shuffle=True,
        seed=42,
        check_timestamp_sync=False,
    )

    logging.info("\n2. Dataset loaded in streaming mode")
    logging.info(f" - Number of chunks: {len(dataset.chunks)}")
    logging.info(f" - First chunk range: {dataset.chunks[0]}")

    logging.info("\n3. Testing frame access in streaming mode...")
    # Access a handful of frames to verify indexing works in streaming mode.
    for i in range(min(3, len(dataset))):
        frame = dataset[i]
        logging.info(
            f" - Frame {i}: episode_index={frame['episode_index'].item()}, "
            f"task_index={frame['task_index'].item()}"
        )

    logging.info("\n4. SUCCESS! Chunk streaming works.")

    return dataset
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke-run all three loading scenarios against the same dataset location.
    parser = argparse.ArgumentParser()
    for flag in ("--repo-id", "--root"):
        parser.add_argument(flag, type=str, default=None)

    args = parser.parse_args()

    for scenario in (
        load_behavior1k_dataset,
        load_behavior1k_dataset_with_multiple_modalities,
        stream_behavior1k_dataset,
    ):
        scenario(args.repo_id, args.root)
|
||||
Reference in New Issue
Block a user