🐛 fix dataset version convert (#75)

* fix v30_to_v21

* sync v21_to_v30 with official
This commit is contained in:
Qizhi Chen
2025-12-03 12:53:59 +08:00
committed by GitHub
parent 01d1df3920
commit 97c278f339
3 changed files with 104 additions and 42 deletions
@@ -58,9 +58,9 @@ from lerobot.datasets.utils import (
LEGACY_TASKS_PATH, LEGACY_TASKS_PATH,
cast_stats_to_numpy, cast_stats_to_numpy,
flatten_dict, flatten_dict,
get_file_size_in_mb,
get_parquet_file_size_in_mb, get_parquet_file_size_in_mb,
get_parquet_num_frames, get_parquet_num_frames,
get_video_size_in_mb,
load_info, load_info,
update_chunk_file_indices, update_chunk_file_indices,
write_episodes, write_episodes,
@@ -74,7 +74,7 @@ from lerobot.utils.utils import init_logging
from requests import HTTPError from requests import HTTPError
V21 = "v2.1" V21 = "v2.1"
V30 = "v3.0"
""" """
------------------------- -------------------------
@@ -88,7 +88,7 @@ OLD
videos/chunk-000/CAMERA/episode_000000.mp4 videos/chunk-000/CAMERA/episode_000000.mp4
NEW NEW
videos/chunk-000/file_000.mp4 videos/CAMERA/chunk-000/file_000.mp4
------------------------- -------------------------
OLD OLD
episodes.jsonl episodes.jsonl
@@ -144,6 +144,17 @@ def legacy_load_tasks(local_dir: Path) -> tuple[dict, dict]:
return tasks, task_to_task_index return tasks, task_to_task_index
def validate_local_dataset_version(local_path: Path) -> None:
    """Ensure the dataset stored at ``local_path`` is a v2.1 dataset.

    The dataset info is read with ``load_info`` and its ``codebase_version``
    entry (``"unknown"`` when the key is absent) is compared against ``V21``.

    Args:
        local_path: Root directory of the local dataset to check.

    Raises:
        ValueError: If the recorded codebase version differs from ``V21``.
    """
    found_version = load_info(local_path).get("codebase_version", "unknown")
    if found_version == V21:
        return
    # Anything other than v2.1 cannot be handled by this converter.
    raise ValueError(
        f"Local dataset has codebase version '{found_version}', expected '{V21}'. "
        "This script is specifically for converting v2.1 datasets to v3.0."
    )
def convert_tasks(root, new_root): def convert_tasks(root, new_root):
logging.info(f"Converting tasks from {root} to {new_root}") logging.info(f"Converting tasks from {root} to {new_root}")
tasks, _ = legacy_load_tasks(root) tasks, _ = legacy_load_tasks(root)
@@ -289,7 +300,7 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
episodes_metadata = [] episodes_metadata = []
for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"): for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"):
ep_size_in_mb = get_video_size_in_mb(ep_path) ep_size_in_mb = get_file_size_in_mb(ep_path)
ep_duration_in_s = get_video_duration_in_s(ep_path) ep_duration_in_s = get_video_duration_in_s(ep_path)
# Check if adding this episode would exceed the limit # Check if adding this episode would exceed the limit
@@ -297,7 +308,8 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
# Size limit would be exceeded, save current accumulation WITHOUT this episode # Size limit would be exceeded, save current accumulation WITHOUT this episode
concatenate_video_files( concatenate_video_files(
paths_to_cat, paths_to_cat,
new_root / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx), new_root
/ DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
) )
# Update episodes metadata for the file we just saved # Update episodes metadata for the file we just saved
@@ -332,7 +344,8 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
if paths_to_cat: if paths_to_cat:
concatenate_video_files( concatenate_video_files(
paths_to_cat, paths_to_cat,
new_root / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx), new_root
/ DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
) )
# Update episodes metadata for the final file # Update episodes metadata for the final file
@@ -344,7 +357,9 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
return episodes_metadata return episodes_metadata
def generate_episode_metadata_dict(episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None): def generate_episode_metadata_dict(
episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None
):
num_episodes = len(episodes_metadata) num_episodes = len(episodes_metadata)
episodes_legacy_metadata_vals = list(episodes_legacy_metadata.values()) episodes_legacy_metadata_vals = list(episodes_legacy_metadata.values())
episodes_stats_vals = list(episodes_stats.values()) episodes_stats_vals = list(episodes_stats.values())
@@ -402,13 +417,13 @@ def convert_episodes_metadata(root, new_root, episodes_metadata, episodes_video_
def convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb): def convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb):
info = load_info(root) info = load_info(root)
info["codebase_version"] = "v3.0" info["codebase_version"] = V30
del info["total_chunks"] del info["total_chunks"]
del info["total_videos"] del info["total_videos"]
info["data_files_size_in_mb"] = data_file_size_in_mb info["data_files_size_in_mb"] = data_file_size_in_mb
info["video_files_size_in_mb"] = video_file_size_in_mb info["video_files_size_in_mb"] = video_file_size_in_mb
info["data_path"] = DEFAULT_DATA_PATH info["data_path"] = DEFAULT_DATA_PATH
info["video_path"] = DEFAULT_VIDEO_PATH info["video_path"] = DEFAULT_VIDEO_PATH if info["video_path"] is not None else None
info["fps"] = int(info["fps"]) info["fps"] = int(info["fps"])
logging.info(f"Converting info from {root} to {new_root}") logging.info(f"Converting info from {root} to {new_root}")
for key in info["features"]: for key in info["features"]:
@@ -424,16 +439,36 @@ def convert_dataset(
branch: str | None = None, branch: str | None = None,
data_file_size_in_mb: int | None = None, data_file_size_in_mb: int | None = None,
video_file_size_in_mb: int | None = None, video_file_size_in_mb: int | None = None,
root: str | Path | None = None,
push_to_hub: bool = True,
force_conversion: bool = False,
): ):
root = HF_LEROBOT_HOME / repo_id
old_root = HF_LEROBOT_HOME / f"{repo_id}_old"
new_root = HF_LEROBOT_HOME / f"{repo_id}_v30"
if data_file_size_in_mb is None: if data_file_size_in_mb is None:
data_file_size_in_mb = DEFAULT_DATA_FILE_SIZE_IN_MB data_file_size_in_mb = DEFAULT_DATA_FILE_SIZE_IN_MB
if video_file_size_in_mb is None: if video_file_size_in_mb is None:
video_file_size_in_mb = DEFAULT_VIDEO_FILE_SIZE_IN_MB video_file_size_in_mb = DEFAULT_VIDEO_FILE_SIZE_IN_MB
# First check if the dataset already has a v3.0 version
if root is None and not force_conversion:
try:
print("Trying to download v3.0 version of the dataset from the hub...")
snapshot_download(repo_id, repo_type="dataset", revision=V30, local_dir=HF_LEROBOT_HOME / repo_id)
return
except Exception:
print("Dataset does not have an uploaded v3.0 version. Continuing with conversion.")
# Set root based on whether local dataset path is provided
use_local_dataset = False
root = HF_LEROBOT_HOME / repo_id if root is None else Path(root) / repo_id
if root.exists():
validate_local_dataset_version(root)
use_local_dataset = True
print(f"Using local dataset at {root}")
old_root = root.parent / f"{root.name}_old"
new_root = root.parent / f"{root.name}_v30"
# Handle old_root cleanup if both old_root and root exist
if old_root.is_dir() and root.is_dir(): if old_root.is_dir() and root.is_dir():
shutil.rmtree(str(root)) shutil.rmtree(str(root))
shutil.move(str(old_root), str(root)) shutil.move(str(old_root), str(root))
@@ -441,12 +476,13 @@ def convert_dataset(
if new_root.is_dir(): if new_root.is_dir():
shutil.rmtree(new_root) shutil.rmtree(new_root)
snapshot_download( if not use_local_dataset:
repo_id, snapshot_download(
repo_type="dataset", repo_id,
revision=V21, repo_type="dataset",
local_dir=root, revision=V21,
) local_dir=root,
)
convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb) convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb)
convert_tasks(root, new_root) convert_tasks(root, new_root)
@@ -457,21 +493,22 @@ def convert_dataset(
shutil.move(str(root), str(old_root)) shutil.move(str(root), str(old_root))
shutil.move(str(new_root), str(root)) shutil.move(str(new_root), str(root))
hub_api = HfApi() if push_to_hub:
try: hub_api = HfApi()
hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, repo_type="dataset") try:
except HTTPError as e: hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, repo_type="dataset")
print(f"tag={CODEBASE_VERSION} probably doesn't exist. Skipping exception ({e})") except HTTPError as e:
pass print(f"tag={CODEBASE_VERSION} probably doesn't exist. Skipping exception ({e})")
hub_api.delete_files( pass
delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"], hub_api.delete_files(
repo_id=repo_id, delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
revision=branch, repo_id=repo_id,
repo_type="dataset", revision=branch,
) repo_type="dataset",
hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset") )
hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
LeRobotDataset(repo_id).push_to_hub() LeRobotDataset(repo_id).push_to_hub()
if __name__ == "__main__": if __name__ == "__main__":
@@ -502,6 +539,23 @@ if __name__ == "__main__":
default=None, default=None,
help="File size in MB. Defaults to 100 for data and 500 for videos.", help="File size in MB. Defaults to 100 for data and 500 for videos.",
) )
parser.add_argument(
"--root",
type=str,
default=None,
help="Local directory to use for downloading/writing the dataset.",
)
parser.add_argument(
"--push-to-hub",
type=lambda input: input.lower() == "true",
default=True,
help="Push the converted dataset to the hub.",
)
parser.add_argument(
"--force-conversion",
action="store_true",
help="Force conversion even if the dataset already has a v3.0 version.",
)
args = parser.parse_args() args = parser.parse_args()
convert_dataset(**vars(args)) convert_dataset(**vars(args))
+9 -1
View File
@@ -2,7 +2,15 @@
## Get started ## Get started
1. Install v3.0 lerobot 1. Downgrade datasets:
```bash
pip install "datasets<4.0.0"
```
> `datasets` must be downgraded first, because version `4.0.0` introduces `List` and `Column`.
2. Install v3.0 lerobot
```bash ```bash
git clone https://github.com/huggingface/lerobot.git git clone https://github.com/huggingface/lerobot.git
@@ -22,7 +22,6 @@ import logging
import math import math
import shutil import shutil
import subprocess import subprocess
import sys
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
from typing import Any, Iterable from typing import Any, Iterable
@@ -31,6 +30,7 @@ import jsonlines
import numpy as np import numpy as np
import pyarrow.parquet as pq import pyarrow.parquet as pq
import tqdm import tqdm
from datasets import Dataset
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from lerobot.datasets.utils import ( from lerobot.datasets.utils import (
DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_SIZE,
@@ -52,10 +52,10 @@ from lerobot.utils.utils import init_logging
V21 = "v2.1" V21 = "v2.1"
V30 = "v3.0" V30 = "v3.0"
LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{chunk_index:03d}/episode_{episode_index:06d}.parquet" LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{chunk_index:03d}/{video_key}/episode_{episode_index:06d}.mp4" LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
MIN_VIDEO_DURATION = 1e-6 MIN_VIDEO_DURATION = 1e-6
LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "q01", "q99") LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "count")
def _to_serializable(value: Any) -> Any: def _to_serializable(value: Any) -> Any:
@@ -181,15 +181,15 @@ def convert_data(root: Path, new_root: Path, episode_records: list[dict[str, Any
f"episode_index={episode_index}, length={length}" f"episode_index={episode_index}, length={length}"
) )
episode_table = table.slice(start, length) episode_table = table.slice(start, length).to_pandas()
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
dest_path = new_root / LEGACY_DATA_PATH_TEMPLATE.format( dest_path = new_root / LEGACY_DATA_PATH_TEMPLATE.format(
chunk_index=dest_chunk, episode_chunk=dest_chunk,
episode_index=episode_index, episode_index=episode_index,
) )
dest_path.parent.mkdir(parents=True, exist_ok=True) dest_path.parent.mkdir(parents=True, exist_ok=True)
pq.write_table(episode_table, dest_path) Dataset.from_pandas(episode_table).to_parquet(dest_path)
def _group_episodes_by_video_file( def _group_episodes_by_video_file(
@@ -365,7 +365,7 @@ def convert_videos(root: Path, new_root: Path, episode_records: list[dict[str, A
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
dest_path = new_root / LEGACY_VIDEO_PATH_TEMPLATE.format( dest_path = new_root / LEGACY_VIDEO_PATH_TEMPLATE.format(
chunk_index=dest_chunk, episode_chunk=dest_chunk,
video_key=video_key, video_key=video_key,
episode_index=episode_index, episode_index=episode_index,
) )