mirror of
https://github.com/Tavish9/any4lerobot.git
synced 2026-05-22 09:29:44 +00:00
🐛 fix dataset version convert (#75)
* fix v30_to_v21 * sync v21_to_v30 with official
This commit is contained in:
@@ -58,9 +58,9 @@ from lerobot.datasets.utils import (
|
|||||||
LEGACY_TASKS_PATH,
|
LEGACY_TASKS_PATH,
|
||||||
cast_stats_to_numpy,
|
cast_stats_to_numpy,
|
||||||
flatten_dict,
|
flatten_dict,
|
||||||
|
get_file_size_in_mb,
|
||||||
get_parquet_file_size_in_mb,
|
get_parquet_file_size_in_mb,
|
||||||
get_parquet_num_frames,
|
get_parquet_num_frames,
|
||||||
get_video_size_in_mb,
|
|
||||||
load_info,
|
load_info,
|
||||||
update_chunk_file_indices,
|
update_chunk_file_indices,
|
||||||
write_episodes,
|
write_episodes,
|
||||||
@@ -74,7 +74,7 @@ from lerobot.utils.utils import init_logging
|
|||||||
from requests import HTTPError
|
from requests import HTTPError
|
||||||
|
|
||||||
V21 = "v2.1"
|
V21 = "v2.1"
|
||||||
|
V30 = "v3.0"
|
||||||
|
|
||||||
"""
|
"""
|
||||||
-------------------------
|
-------------------------
|
||||||
@@ -88,7 +88,7 @@ OLD
|
|||||||
videos/chunk-000/CAMERA/episode_000000.mp4
|
videos/chunk-000/CAMERA/episode_000000.mp4
|
||||||
|
|
||||||
NEW
|
NEW
|
||||||
videos/chunk-000/file_000.mp4
|
videos/CAMERA/chunk-000/file_000.mp4
|
||||||
-------------------------
|
-------------------------
|
||||||
OLD
|
OLD
|
||||||
episodes.jsonl
|
episodes.jsonl
|
||||||
@@ -144,6 +144,17 @@ def legacy_load_tasks(local_dir: Path) -> tuple[dict, dict]:
|
|||||||
return tasks, task_to_task_index
|
return tasks, task_to_task_index
|
||||||
|
|
||||||
|
|
||||||
|
def validate_local_dataset_version(local_path: Path) -> None:
    """Validate that the local dataset has the expected v2.1 version.

    Args:
        local_path: Root directory of the local dataset; passed to ``load_info``.

    Raises:
        ValueError: If the ``codebase_version`` entry of the dataset info
            differs from ``V21``. A missing entry is reported as ``'unknown'``.
    """
    info = load_info(local_path)
    # Fall back to "unknown" so a missing key yields a readable error message
    # instead of a KeyError.
    dataset_version = info.get("codebase_version", "unknown")
    if dataset_version != V21:
        raise ValueError(
            f"Local dataset has codebase version '{dataset_version}', expected '{V21}'. "
            "This script is specifically for converting v2.1 datasets to v3.0."
        )
|
||||||
|
|
||||||
|
|
||||||
def convert_tasks(root, new_root):
|
def convert_tasks(root, new_root):
|
||||||
logging.info(f"Converting tasks from {root} to {new_root}")
|
logging.info(f"Converting tasks from {root} to {new_root}")
|
||||||
tasks, _ = legacy_load_tasks(root)
|
tasks, _ = legacy_load_tasks(root)
|
||||||
@@ -289,7 +300,7 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
|
|||||||
episodes_metadata = []
|
episodes_metadata = []
|
||||||
|
|
||||||
for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"):
|
for ep_path in tqdm.tqdm(ep_paths, desc=f"convert videos of {video_key}"):
|
||||||
ep_size_in_mb = get_video_size_in_mb(ep_path)
|
ep_size_in_mb = get_file_size_in_mb(ep_path)
|
||||||
ep_duration_in_s = get_video_duration_in_s(ep_path)
|
ep_duration_in_s = get_video_duration_in_s(ep_path)
|
||||||
|
|
||||||
# Check if adding this episode would exceed the limit
|
# Check if adding this episode would exceed the limit
|
||||||
@@ -297,7 +308,8 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
|
|||||||
# Size limit would be exceeded, save current accumulation WITHOUT this episode
|
# Size limit would be exceeded, save current accumulation WITHOUT this episode
|
||||||
concatenate_video_files(
|
concatenate_video_files(
|
||||||
paths_to_cat,
|
paths_to_cat,
|
||||||
new_root / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
|
new_root
|
||||||
|
/ DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update episodes metadata for the file we just saved
|
# Update episodes metadata for the file we just saved
|
||||||
@@ -332,7 +344,8 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
|
|||||||
if paths_to_cat:
|
if paths_to_cat:
|
||||||
concatenate_video_files(
|
concatenate_video_files(
|
||||||
paths_to_cat,
|
paths_to_cat,
|
||||||
new_root / DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
|
new_root
|
||||||
|
/ DEFAULT_VIDEO_PATH.format(video_key=video_key, chunk_index=chunk_idx, file_index=file_idx),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update episodes metadata for the final file
|
# Update episodes metadata for the final file
|
||||||
@@ -344,7 +357,9 @@ def convert_videos_of_camera(root: Path, new_root: Path, video_key: str, video_f
|
|||||||
return episodes_metadata
|
return episodes_metadata
|
||||||
|
|
||||||
|
|
||||||
def generate_episode_metadata_dict(episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None):
|
def generate_episode_metadata_dict(
|
||||||
|
episodes_legacy_metadata, episodes_metadata, episodes_stats, episodes_videos=None
|
||||||
|
):
|
||||||
num_episodes = len(episodes_metadata)
|
num_episodes = len(episodes_metadata)
|
||||||
episodes_legacy_metadata_vals = list(episodes_legacy_metadata.values())
|
episodes_legacy_metadata_vals = list(episodes_legacy_metadata.values())
|
||||||
episodes_stats_vals = list(episodes_stats.values())
|
episodes_stats_vals = list(episodes_stats.values())
|
||||||
@@ -402,13 +417,13 @@ def convert_episodes_metadata(root, new_root, episodes_metadata, episodes_video_
|
|||||||
|
|
||||||
def convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb):
|
def convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb):
|
||||||
info = load_info(root)
|
info = load_info(root)
|
||||||
info["codebase_version"] = "v3.0"
|
info["codebase_version"] = V30
|
||||||
del info["total_chunks"]
|
del info["total_chunks"]
|
||||||
del info["total_videos"]
|
del info["total_videos"]
|
||||||
info["data_files_size_in_mb"] = data_file_size_in_mb
|
info["data_files_size_in_mb"] = data_file_size_in_mb
|
||||||
info["video_files_size_in_mb"] = video_file_size_in_mb
|
info["video_files_size_in_mb"] = video_file_size_in_mb
|
||||||
info["data_path"] = DEFAULT_DATA_PATH
|
info["data_path"] = DEFAULT_DATA_PATH
|
||||||
info["video_path"] = DEFAULT_VIDEO_PATH
|
info["video_path"] = DEFAULT_VIDEO_PATH if info["video_path"] is not None else None
|
||||||
info["fps"] = int(info["fps"])
|
info["fps"] = int(info["fps"])
|
||||||
logging.info(f"Converting info from {root} to {new_root}")
|
logging.info(f"Converting info from {root} to {new_root}")
|
||||||
for key in info["features"]:
|
for key in info["features"]:
|
||||||
@@ -424,16 +439,36 @@ def convert_dataset(
|
|||||||
branch: str | None = None,
|
branch: str | None = None,
|
||||||
data_file_size_in_mb: int | None = None,
|
data_file_size_in_mb: int | None = None,
|
||||||
video_file_size_in_mb: int | None = None,
|
video_file_size_in_mb: int | None = None,
|
||||||
|
root: str | Path | None = None,
|
||||||
|
push_to_hub: bool = True,
|
||||||
|
force_conversion: bool = False,
|
||||||
):
|
):
|
||||||
root = HF_LEROBOT_HOME / repo_id
|
|
||||||
old_root = HF_LEROBOT_HOME / f"{repo_id}_old"
|
|
||||||
new_root = HF_LEROBOT_HOME / f"{repo_id}_v30"
|
|
||||||
|
|
||||||
if data_file_size_in_mb is None:
|
if data_file_size_in_mb is None:
|
||||||
data_file_size_in_mb = DEFAULT_DATA_FILE_SIZE_IN_MB
|
data_file_size_in_mb = DEFAULT_DATA_FILE_SIZE_IN_MB
|
||||||
if video_file_size_in_mb is None:
|
if video_file_size_in_mb is None:
|
||||||
video_file_size_in_mb = DEFAULT_VIDEO_FILE_SIZE_IN_MB
|
video_file_size_in_mb = DEFAULT_VIDEO_FILE_SIZE_IN_MB
|
||||||
|
|
||||||
|
# First check if the dataset already has a v3.0 version
|
||||||
|
if root is None and not force_conversion:
|
||||||
|
try:
|
||||||
|
print("Trying to download v3.0 version of the dataset from the hub...")
|
||||||
|
snapshot_download(repo_id, repo_type="dataset", revision=V30, local_dir=HF_LEROBOT_HOME / repo_id)
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
print("Dataset does not have an uploaded v3.0 version. Continuing with conversion.")
|
||||||
|
|
||||||
|
# Set root based on whether local dataset path is provided
|
||||||
|
use_local_dataset = False
|
||||||
|
root = HF_LEROBOT_HOME / repo_id if root is None else Path(root) / repo_id
|
||||||
|
if root.exists():
|
||||||
|
validate_local_dataset_version(root)
|
||||||
|
use_local_dataset = True
|
||||||
|
print(f"Using local dataset at {root}")
|
||||||
|
|
||||||
|
old_root = root.parent / f"{root.name}_old"
|
||||||
|
new_root = root.parent / f"{root.name}_v30"
|
||||||
|
|
||||||
|
# Handle old_root cleanup if both old_root and root exist
|
||||||
if old_root.is_dir() and root.is_dir():
|
if old_root.is_dir() and root.is_dir():
|
||||||
shutil.rmtree(str(root))
|
shutil.rmtree(str(root))
|
||||||
shutil.move(str(old_root), str(root))
|
shutil.move(str(old_root), str(root))
|
||||||
@@ -441,12 +476,13 @@ def convert_dataset(
|
|||||||
if new_root.is_dir():
|
if new_root.is_dir():
|
||||||
shutil.rmtree(new_root)
|
shutil.rmtree(new_root)
|
||||||
|
|
||||||
snapshot_download(
|
if not use_local_dataset:
|
||||||
repo_id,
|
snapshot_download(
|
||||||
repo_type="dataset",
|
repo_id,
|
||||||
revision=V21,
|
repo_type="dataset",
|
||||||
local_dir=root,
|
revision=V21,
|
||||||
)
|
local_dir=root,
|
||||||
|
)
|
||||||
|
|
||||||
convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb)
|
convert_info(root, new_root, data_file_size_in_mb, video_file_size_in_mb)
|
||||||
convert_tasks(root, new_root)
|
convert_tasks(root, new_root)
|
||||||
@@ -457,21 +493,22 @@ def convert_dataset(
|
|||||||
shutil.move(str(root), str(old_root))
|
shutil.move(str(root), str(old_root))
|
||||||
shutil.move(str(new_root), str(root))
|
shutil.move(str(new_root), str(root))
|
||||||
|
|
||||||
hub_api = HfApi()
|
if push_to_hub:
|
||||||
try:
|
hub_api = HfApi()
|
||||||
hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, repo_type="dataset")
|
try:
|
||||||
except HTTPError as e:
|
hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, repo_type="dataset")
|
||||||
print(f"tag={CODEBASE_VERSION} probably doesn't exist. Skipping exception ({e})")
|
except HTTPError as e:
|
||||||
pass
|
print(f"tag={CODEBASE_VERSION} probably doesn't exist. Skipping exception ({e})")
|
||||||
hub_api.delete_files(
|
pass
|
||||||
delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
|
hub_api.delete_files(
|
||||||
repo_id=repo_id,
|
delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
|
||||||
revision=branch,
|
repo_id=repo_id,
|
||||||
repo_type="dataset",
|
revision=branch,
|
||||||
)
|
repo_type="dataset",
|
||||||
hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
|
)
|
||||||
|
hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
|
||||||
|
|
||||||
LeRobotDataset(repo_id).push_to_hub()
|
LeRobotDataset(repo_id).push_to_hub()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@@ -502,6 +539,23 @@ if __name__ == "__main__":
|
|||||||
default=None,
|
default=None,
|
||||||
help="File size in MB. Defaults to 100 for data and 500 for videos.",
|
help="File size in MB. Defaults to 100 for data and 500 for videos.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--root",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Local directory to use for downloading/writing the dataset.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--push-to-hub",
|
||||||
|
type=lambda input: input.lower() == "true",
|
||||||
|
default=True,
|
||||||
|
help="Push the converted dataset to the hub.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--force-conversion",
|
||||||
|
action="store_true",
|
||||||
|
help="Force conversion even if the dataset already has a v3.0 version.",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
convert_dataset(**vars(args))
|
convert_dataset(**vars(args))
|
||||||
|
|||||||
@@ -2,7 +2,15 @@
|
|||||||
|
|
||||||
## Get started
|
## Get started
|
||||||
|
|
||||||
1. Install v3.0 lerobot
|
1. Downgrade the `datasets` package:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install "datasets<4.0.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
> The `datasets` package must be downgraded first because version `4.0.0` introduces `List` and `Column`.
|
||||||
|
|
||||||
|
2. Install v3.0 lerobot
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/huggingface/lerobot.git
|
git clone https://github.com/huggingface/lerobot.git
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ import logging
|
|||||||
import math
|
import math
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Iterable
|
from typing import Any, Iterable
|
||||||
@@ -31,6 +30,7 @@ import jsonlines
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
import tqdm
|
import tqdm
|
||||||
|
from datasets import Dataset
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
from lerobot.datasets.utils import (
|
from lerobot.datasets.utils import (
|
||||||
DEFAULT_CHUNK_SIZE,
|
DEFAULT_CHUNK_SIZE,
|
||||||
@@ -52,10 +52,10 @@ from lerobot.utils.utils import init_logging
|
|||||||
V21 = "v2.1"
|
V21 = "v2.1"
|
||||||
V30 = "v3.0"
|
V30 = "v3.0"
|
||||||
|
|
||||||
LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{chunk_index:03d}/episode_{episode_index:06d}.parquet"
|
LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
|
||||||
LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{chunk_index:03d}/{video_key}/episode_{episode_index:06d}.mp4"
|
LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
|
||||||
MIN_VIDEO_DURATION = 1e-6
|
MIN_VIDEO_DURATION = 1e-6
|
||||||
LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "q01", "q99")
|
LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "count")
|
||||||
|
|
||||||
|
|
||||||
def _to_serializable(value: Any) -> Any:
|
def _to_serializable(value: Any) -> Any:
|
||||||
@@ -181,15 +181,15 @@ def convert_data(root: Path, new_root: Path, episode_records: list[dict[str, Any
|
|||||||
f"episode_index={episode_index}, length={length}"
|
f"episode_index={episode_index}, length={length}"
|
||||||
)
|
)
|
||||||
|
|
||||||
episode_table = table.slice(start, length)
|
episode_table = table.slice(start, length).to_pandas()
|
||||||
|
|
||||||
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
|
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
|
||||||
dest_path = new_root / LEGACY_DATA_PATH_TEMPLATE.format(
|
dest_path = new_root / LEGACY_DATA_PATH_TEMPLATE.format(
|
||||||
chunk_index=dest_chunk,
|
episode_chunk=dest_chunk,
|
||||||
episode_index=episode_index,
|
episode_index=episode_index,
|
||||||
)
|
)
|
||||||
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
pq.write_table(episode_table, dest_path)
|
Dataset.from_pandas(episode_table).to_parquet(dest_path)
|
||||||
|
|
||||||
|
|
||||||
def _group_episodes_by_video_file(
|
def _group_episodes_by_video_file(
|
||||||
@@ -365,7 +365,7 @@ def convert_videos(root: Path, new_root: Path, episode_records: list[dict[str, A
|
|||||||
|
|
||||||
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
|
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
|
||||||
dest_path = new_root / LEGACY_VIDEO_PATH_TEMPLATE.format(
|
dest_path = new_root / LEGACY_VIDEO_PATH_TEMPLATE.format(
|
||||||
chunk_index=dest_chunk,
|
episode_chunk=dest_chunk,
|
||||||
video_key=video_key,
|
video_key=video_key,
|
||||||
episode_index=episode_index,
|
episode_index=episode_index,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user