🐛 fix dataset version convert (#75)

* fix v30_to_v21

* sync v21_to_v30 with official
This commit is contained in:
Qizhi Chen
2025-12-03 12:53:59 +08:00
committed by GitHub
parent 01d1df3920
commit 97c278f339
3 changed files with 104 additions and 42 deletions
+9 -1
View File
@@ -2,7 +2,15 @@
## Get started
1. Install v3.0 lerobot
1. Downgrade the `datasets` package:
```bash
pip install "datasets<4.0.0"
```
> You need to downgrade `datasets` first, since version `4.0.0` introduces `List` and `Column`.
2. Install v3.0 lerobot
```bash
git clone https://github.com/huggingface/lerobot.git
@@ -22,7 +22,6 @@ import logging
import math
import shutil
import subprocess
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Iterable
@@ -31,6 +30,7 @@ import jsonlines
import numpy as np
import pyarrow.parquet as pq
import tqdm
from datasets import Dataset
from huggingface_hub import snapshot_download
from lerobot.datasets.utils import (
DEFAULT_CHUNK_SIZE,
@@ -52,10 +52,10 @@ from lerobot.utils.utils import init_logging
V21 = "v2.1"
V30 = "v3.0"
LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{chunk_index:03d}/episode_{episode_index:06d}.parquet"
LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{chunk_index:03d}/{video_key}/episode_{episode_index:06d}.mp4"
LEGACY_DATA_PATH_TEMPLATE = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
LEGACY_VIDEO_PATH_TEMPLATE = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
MIN_VIDEO_DURATION = 1e-6
LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "q01", "q99")
LEGACY_STATS_KEYS = ("mean", "std", "min", "max", "count")
def _to_serializable(value: Any) -> Any:
@@ -181,15 +181,15 @@ def convert_data(root: Path, new_root: Path, episode_records: list[dict[str, Any
f"episode_index={episode_index}, length={length}"
)
episode_table = table.slice(start, length)
episode_table = table.slice(start, length).to_pandas()
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
dest_path = new_root / LEGACY_DATA_PATH_TEMPLATE.format(
chunk_index=dest_chunk,
episode_chunk=dest_chunk,
episode_index=episode_index,
)
dest_path.parent.mkdir(parents=True, exist_ok=True)
pq.write_table(episode_table, dest_path)
Dataset.from_pandas(episode_table).to_parquet(dest_path)
def _group_episodes_by_video_file(
@@ -365,7 +365,7 @@ def convert_videos(root: Path, new_root: Path, episode_records: list[dict[str, A
dest_chunk = episode_index // DEFAULT_CHUNK_SIZE
dest_path = new_root / LEGACY_VIDEO_PATH_TEMPLATE.format(
chunk_index=dest_chunk,
episode_chunk=dest_chunk,
video_key=video_key,
episode_index=episode_index,
)