diff --git a/ds_version_convert/v21_to_v30/convert_dataset_v21_to_v30.py b/ds_version_convert/v21_to_v30/convert_dataset_v21_to_v30.py
index 6f320fa..aebd596 100644
--- a/ds_version_convert/v21_to_v30/convert_dataset_v21_to_v30.py
+++ b/ds_version_convert/v21_to_v30/convert_dataset_v21_to_v30.py
@@ -58,9 +58,9 @@ from lerobot.datasets.utils import (
     LEGACY_TASKS_PATH,
     cast_stats_to_numpy,
     flatten_dict,
+    get_file_size_in_mb,
     get_parquet_file_size_in_mb,
     get_parquet_num_frames,
-    get_video_size_in_mb,
     load_info,
     update_chunk_file_indices,
     write_episodes,
@@ -74,7 +74,7 @@ from lerobot.utils.utils import init_logging
 from requests import HTTPError
 
 V21 = "v2.1"
-
+V30 = "v3.0"
 
 """
 -------------------------
@@ -88,7 +88,7 @@ OLD
 videos/chunk-000/CAMERA/episode_000000.mp4
 
 NEW
-videos/chunk-000/file_000.mp4
+videos/CAMERA/chunk-000/file_000.mp4
 
 -------------------------
 OLD
 episodes.jsonl
@@ -144,6 +144,17 @@ def legacy_load_tasks(local_dir: Path) -> tuple[dict, dict]:
     return tasks, task_to_task_index
 
 
+def validate_local_dataset_version(local_path: Path) -> None:
+    """Validate that the local dataset has the expected v2.1 version."""
+    info = load_info(local_path)
+    dataset_version = info.get("codebase_version", "unknown")
+    if dataset_version != V21:
+        raise ValueError(
+            f"Local dataset has codebase version '{dataset_version}', expected '{V21}'. "
+            f"This script is specifically for converting v2.1 datasets to v3.0."
+        )
+
+
 def convert_tasks(root, new_root):
     logging.info(f"Converting tasks from {root} to {new_root}")
     tasks, _ = legacy_load_tasks(root)
@@ -289,7 +300,7 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
     episodes_metadata = []
     for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"):
-        ep_size_in_mb = get_video_size_in_mb(ep_path)
+        ep_size_in_mb = get_file_size_in_mb(ep_path)
         ep_duration_in_s = get_video_duration_in_s(ep_path)
 
         # Check if adding this episode would exceed the limit
@@ -297,7 +308,8 @@
             # Size limit would be exceeded, save current accumulation WITHOUT this episode
             concatenate_video_files(
                 paths_to_cat,
-                new_root / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
+                new_root
+                / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
             )
 
             # Update episodes metadata for the file we just saved
@@ -332,7 +344,8 @@
     if paths_to_cat:
         concatenate_video_files(
             paths_to_cat,
-            new_root / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
+            new_root
+            / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
         )
 
         # Update episodes metadata for the final file
@@ -344,7 +357,9 @@
     return episodes_metadata
 
 
-def generate_episode_metadata_dict(episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None):
+def generate_episode_metadata_dict(
+    episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None
+):
     num_episodes = len(episodes_metadata)
     episodes_legacy_metadata_vals = list(episodes_legacy_metadata.values())
     episodes_stats_vals = list(episodes_stats.values())
@@ -402,13 +417,13 @@ def convert_episodes_metadata(root, new_root, episodes_metadata, episodes_video_
 def convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb):
     info = load_info(root)
-    info["codebase_version"] = "v3.0"
+    info["codebase_version"] = V30
     del info["total_chunks"]
     del info["total_videos"]
     info["data_files_size_in_mb"] = data_file_size_in_mb
     info["video_files_size_in_mb"] = video_file_size_in_mb
     info["data_path"] = DEFAULT_DATA_PATH
-    info["video_path"] = DEFAULT_VIDEO_PATH
+    info["video_path"] = DEFAULT_VIDEO_PATH if info["video_path"] is not None else None
     info["fps"] = int(info["fps"])
     logging.info(f"Converting info from {root} to {new_root}")
     for key in info["features"]:
@@ -424,16 +439,36 @@ def convert_dataset(
     branch: str | None = None,
     data_file_size_in_mb: int | None = None,
     video_file_size_in_mb: int | None = None,
+    root: str | Path | None = None,
+    push_to_hub: bool = True,
+    force_conversion: bool = False,
 ):
-    root = HF_LEROBOT_HOME / repo_id
-    old_root = HF_LEROBOT_HOME / f"{repo_id}_old"
-    new_root = HF_LEROBOT_HOME / f"{repo_id}_v30"
-
     if data_file_size_in_mb is None:
         data_file_size_in_mb = DEFAULT_DATA_FILE_SIZE_IN_MB
     if video_file_size_in_mb is None:
         video_file_size_in_mb = DEFAULT_VIDEO_FILE_SIZE_IN_MB
 
+    # First check if the dataset already has a v3.0 version
+    if root is None and not force_conversion:
+        try:
+            print("Trying to download v3.0 version of the dataset from the hub...")
+            snapshot_download(repo_id, repo_type="dataset", revision=V30, local_dir=HF_LEROBOT_HOME / repo_id)
+            return
+        except Exception:
+            print("Dataset does not have an uploaded v3.0 version. Continuing with conversion.")
+
+    # Set root based on whether a local dataset path is provided
+    use_local_dataset = False
+    root = HF_LEROBOT_HOME / repo_id if root is None else Path(root) / repo_id
+    if root.exists():
+        validate_local_dataset_version(root)
+        use_local_dataset = True
+        print(f"Using local dataset at {root}")
+
+    old_root = root.parent / f"{root.name}_old"
+    new_root = root.parent / f"{root.name}_v30"
+
+    # Handle old_root cleanup if both old_root and root exist
     if old_root.is_dir() and root.is_dir():
         shutil.rmtree(str(root))
         shutil.move(str(old_root), str(root))
@@ -441,12 +476,13 @@
     if new_root.is_dir():
         shutil.rmtree(new_root)
 
-    snapshot_download(
-        repo_id,
-        repo_type="dataset",
-        revision=V21,
-        local_dir=root,
-    )
+    if not use_local_dataset:
+        snapshot_download(
+            repo_id,
+            repo_type="dataset",
+            revision=V21,
+            local_dir=root,
+        )
 
     convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb)
     convert_tasks(root, new_root)
@@ -457,21 +493,22 @@
     shutil.move(str(root), str(old_root))
     shutil.move(str(new_root), str(root))
 
-    hub_api = HfApi()
-    try:
-        hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, repo_type="dataset")
-    except HTTPError as e:
-        print(f"tag={CODEBASE_VERSION} probably doesn't exist. Skipping exception ({e})")
-        pass
-    hub_api.delete_files(
-        delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
-        repo_id=repo_id,
-        revision=branch,
-        repo_type="dataset",
-    )
-    hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
+    if push_to_hub:
+        hub_api = HfApi()
+        try:
+            hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, repo_type="dataset")
+        except HTTPError as e:
+            print(f"tag={CODEBASE_VERSION} probably doesn't exist. Skipping exception ({e})")
+            pass
+        hub_api.delete_files(
+            delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
+            repo_id=repo_id,
+            revision=branch,
+            repo_type="dataset",
+        )
+        hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
 
-    LeRobotDataset(repo_id).push_to_hub()
+        LeRobotDataset(repo_id).push_to_hub()
 
 
 if __name__ == "__main__":
@@ -502,6 +539,23 @@
         default=None,
         help="File size in MB. Defaults to 100 for data and 500 for videos.",
     )
+    parser.add_argument(
+        "--root",
+        type=str,
+        default=None,
+        help="Local directory to use for downloading/writing the dataset.",
+    )
+    parser.add_argument(
+        "--push-to-hub",
+        type=lambda input: input.lower() == "true",
+        default=True,
+        help="Push the converted dataset to the hub.",
+    )
+    parser.add_argument(
+        "--force-conversion",
+        action="store_true",
+        help="Force conversion even if the dataset already has a v3.0 version.",
+    )
 
     args = parser.parse_args()
     convert_dataset(**vars(args))
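For reference, here is a minimal sketch of driving the converter with the new `root`, `push_to_hub`, and `force_conversion` options added above. The repo id and local directory are hypothetical placeholders, and the import assumes the script's directory is on the Python path:

```python
# Hypothetical local, offline-style run of the updated converter.
# "user/my_dataset" and "/data/lerobot" are placeholders, not real values.
from convert_dataset_v21_to_v30 import convert_dataset

convert_dataset(
    repo_id="user/my_dataset",
    root="/data/lerobot",   # convert a local v2.1 copy instead of downloading
    push_to_hub=False,      # keep the converted v3.0 dataset local
    force_conversion=True,  # skip the check for an existing v3.0 on the hub
)
```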
diff --git a/ds_version_convert/v30_to_v21/README.md b/ds_version_convert/v30_to_v21/README.md
index 2b98f81..6c43656 100644
--- a/ds_version_convert/v30_to_v21/README.md
+++ b/ds_version_convert/v30_to_v21/README.md
@@ -2,7 +2,15 @@
 
 ## Get started
 
-1. Install v3.0 lerobot
+1. Downgrade `datasets`:
+
+   ```bash
+   pip install "datasets<4.0.0"
+   ```
+
+   > `datasets` must be downgraded first, since `4.0.0` introduces `List` and `Column`.
+
+2. Install v3.0 lerobot
 
    ```bash
    git clone https://github.com/huggingface/lerobot.git
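The downgrade requirement above can also be checked programmatically before running the conversion. A minimal sketch, assuming `packaging` is available (it is a dependency of `datasets`):

```python
# Fail fast if datasets >= 4.0.0 is installed (it introduces List and Column,
# which is why the README asks for a downgrade). Illustrative sketch only.
import datasets
from packaging.version import Version

if Version(datasets.__version__) >= Version("4.0.0"):
    raise RuntimeError(
        f"Found datasets=={datasets.__version__}; "
        'run `pip install "datasets<4.0.0"` before converting v3.0 to v2.1.'
    )
```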
diff --git a/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py b/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py
index 6561277..11b1696 100644
--- a/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py
+++ b/ds_version_convert/v30_to_v21/convert_dataset_v30_to_v21.py
@@ -22,7 +22,6 @@ import logging
 import math
 import shutil
 import subprocess
-import sys
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Iterable
@@ -31,6 +30,7 @@ import jsonlines
 import numpy as np
 import pyarrow.parquet as pq
 import tqdm
+from datasets import Dataset
 from huggingface_hub import snapshot_download
 from lerobot.datasets.utils import (
     DEFAULT_CHUNK_SIZE,
@@ -52,10 +52,10 @@ from lerobot.utils.utils import init_logging
 V21 = "v2.1"
 V30 = "v3.0"
 
-LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{chunk_index:03d}/episode_{episode_index:06d}.parquet"
-LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{chunk_index:03d}/{video_key}/episode_{episode_index:06d}.mp4"
+LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
+LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
 MIN_VIDEO_DURATION = 1e-6
-LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "q01", "q99")
+LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "count")
 
 
 def _to_serializable(value: Any) -> Any:
@@ -181,15 +181,15 @@ def convert_data(root: Path, new_root: Path, episode_records: list[dict[str, Any
             f"episode_index={episode_index}, length={length}"
         )
 
-        episode_table = table.slice(start, length)
+        episode_table = table.slice(start, length).to_pandas()
         dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
         dest_path = new_root / LEGACY_DATA_PATH_TEMPLATE.format(
-            chunk_index=dest_chunk,
+            episode_chunk=dest_chunk,
             episode_index=episode_index,
         )
         dest_path.parent.mkdir(parents=True, exist_ok=True)
-        pq.write_table(episode_table, dest_path)
+        Dataset.from_pandas(episode_table).to_parquet(dest_path)
 
 
 def _group_episodes_by_video_file(
@@ -365,7 +365,7 @@ def convert_videos(root: Path, new_root: Path, episode_records: list[dict[str, A
         dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
         dest_path = new_root / LEGACY_VIDEO_PATH_TEMPLATE.format(
-            chunk_index=dest_chunk,
+            episode_chunk=dest_chunk,
             video_key=video_key,
             episode_index=episode_index,
         )
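As a sanity check on the renamed `episode_chunk` placeholder, here is how the legacy template formats a destination path. A sketch only; `DEFAULT_CHUNK_SIZE = 1000` is an assumed value mirroring `lerobot.datasets.utils`:

```python
# Sketch: format a legacy v2.1 path with the renamed episode_chunk field.
LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
DEFAULT_CHUNK_SIZE = 1000  # assumed value of lerobot.datasets.utils.DEFAULT_CHUNK_SIZE

episode_index = 1234
path = LEGACY_DATA_PATH_TEMPLATE.format(
    episode_chunk=episode_index // DEFAULT_CHUNK_SIZE,  # -> chunk-001
    episode_index=episode_index,
)
print(path)  # data/chunk-001/episode_001234.parquet
```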