chore(dataset v2.0): drop support for dataset v2.0 format

2026-06-19 01:07:18 +00:00 · 2025-09-01 21:31:46 +02:00
parent adad3698e1
commit 0a30636fc6
6 changed files with 4 additions and 372 deletions
@@ -70,10 +70,8 @@ class CompatibilityError(Exception): ...

 class BackwardCompatibilityError(CompatibilityError):
    def __init__(self, repo_id: str, version: packaging.version.Version):
-        if version.major == 3:
-            message = V30_MESSAGE.format(repo_id=repo_id, version=version)
-        elif version.major == 2:
-            message = V2_MESSAGE.format(repo_id=repo_id, version=version)
+        if version.major == 2 and version.minor == 1:
+            message = V30_MESSAGE.format(repo_id=repo_id, version=version)
        else:
            raise NotImplementedError(
                "Contact the maintainer on [Discord](https://discord.com/invite/s3KuuzsPFb)."
@@ -39,7 +39,7 @@ from torchvision import transforms

 from lerobot.configs.types import FeatureType, PolicyFeature
 from lerobot.datasets.backward_compatibility import (
-    V21_MESSAGE,
+    FUTURE_MESSAGE,
    BackwardCompatibilityError,
    ForwardCompatibilityError,
 )
@@ -343,7 +343,7 @@ def check_version_compatibility(
    if v_check.major < v_current.major and enforce_breaking_major:
        raise BackwardCompatibilityError(repo_id, v_check)
    elif v_check.minor < v_current.minor:
-        logging.warning(V21_MESSAGE.format(repo_id=repo_id, version=v_check))
+        logging.warning(FUTURE_MESSAGE.format(repo_id=repo_id, version=v_check))


 def get_repo_versions(repo_id: str) -> list[packaging.version.Version]:
@@ -1,87 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import traceback
-from pathlib import Path
-
-from datasets import get_dataset_config_info
-from huggingface_hub import HfApi
-
-from lerobot import available_datasets
-from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
-from lerobot.datasets.utils import INFO_PATH, write_info
-from lerobot.datasets.v21.convert_dataset_v20_to_v21 import V20, SuppressWarnings
-
-LOCAL_DIR = Path("data/")
-
-hub_api = HfApi()
-
-
-def fix_dataset(repo_id: str) -> str:
-    if not hub_api.revision_exists(repo_id, V20, repo_type="dataset"):
-        return f"{repo_id}: skipped (not in {V20})."
-
-    dataset_info = get_dataset_config_info(repo_id, "default")
-    with SuppressWarnings():
-        lerobot_metadata = LeRobotDatasetMetadata(repo_id, revision=V20, force_cache_sync=True)
-
-    meta_features = {key for key, ft in lerobot_metadata.features.items() if ft["dtype"] != "video"}
-    parquet_features = set(dataset_info.features)
-
-    diff_parquet_meta = parquet_features - meta_features
-    diff_meta_parquet = meta_features - parquet_features
-
-    if diff_parquet_meta:
-        raise ValueError(f"In parquet not in info.json: {parquet_features - meta_features}")
-
-    if not diff_meta_parquet:
-        return f"{repo_id}: skipped (no diff)"
-
-    if diff_meta_parquet:
-        logging.warning(f"In info.json not in parquet: {meta_features - parquet_features}")
-        assert diff_meta_parquet == {"language_instruction"}
-        lerobot_metadata.features.pop("language_instruction")
-        write_info(lerobot_metadata.info, lerobot_metadata.root)
-        commit_info = hub_api.upload_file(
-            path_or_fileobj=lerobot_metadata.root / INFO_PATH,
-            path_in_repo=INFO_PATH,
-            repo_id=repo_id,
-            repo_type="dataset",
-            revision=V20,
-            commit_message="Remove 'language_instruction'",
-            create_pr=True,
-        )
-        return f"{repo_id}: success - PR: {commit_info.pr_url}"
-
-
-def batch_fix():
-    status = {}
-    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
-    logfile = LOCAL_DIR / "fix_features_v20.txt"
-    for num, repo_id in enumerate(available_datasets):
-        print(f"\nConverting {repo_id} ({num}/{len(available_datasets)})")
-        print("---------------------------------------------------------")
-        try:
-            status = fix_dataset(repo_id)
-        except Exception:
-            status = f"{repo_id}: failed\n    {traceback.format_exc()}"
-
-        logging.info(status)
-        with open(logfile, "a") as file:
-            file.write(status + "\n")
-
-
-if __name__ == "__main__":
-    batch_fix()
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This script is for internal use to convert all datasets under the 'lerobot' hub user account to v2.1.
-"""
-
-import traceback
-from pathlib import Path
-
-from huggingface_hub import HfApi
-
-from lerobot import available_datasets
-from lerobot.datasets.v21.convert_dataset_v20_to_v21 import V21, convert_dataset
-
-LOCAL_DIR = Path("data/")
-
-
-def batch_convert():
-    status = {}
-    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
-    logfile = LOCAL_DIR / "conversion_log_v21.txt"
-    hub_api = HfApi()
-    for num, repo_id in enumerate(available_datasets):
-        print(f"\nConverting {repo_id} ({num}/{len(available_datasets)})")
-        print("---------------------------------------------------------")
-        try:
-            if hub_api.revision_exists(repo_id, V21, repo_type="dataset"):
-                status = f"{repo_id}: success (already in {V21})."
-            else:
-                convert_dataset(repo_id)
-                status = f"{repo_id}: success."
-        except Exception:
-            status = f"{repo_id}: failed\n    {traceback.format_exc()}"
-
-        with open(logfile, "a") as file:
-            file.write(status + "\n")
-
-
-if __name__ == "__main__":
-    batch_convert()
@@ -1,111 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This script will help you convert any LeRobot dataset already pushed to the hub from codebase version 2.0 to
-2.1. It will:
-
- Generate per-episodes stats and writes them in `episodes_stats.jsonl`
- Check consistency between these new stats and the old ones.
- Remove the deprecated `stats.json`.
- Update codebase_version in `info.json`.
- Push this new version to the hub on the 'main' branch and tags it with "v2.1".
-
-Usage:
-
-```bash
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 \
-    --repo-id=aliberts/koch_tutorial
-```
-
-"""
-
-import argparse
-import logging
-
-from huggingface_hub import HfApi
-
-from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
-from lerobot.datasets.utils import STATS_PATH, load_stats, write_info
-from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats
-
-V20 = "v2.0"
-V21 = "v2.1"
-
-
-class SuppressWarnings:
-    def __enter__(self):
-        self.previous_level = logging.getLogger().getEffectiveLevel()
-        logging.getLogger().setLevel(logging.ERROR)
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        logging.getLogger().setLevel(self.previous_level)
-
-
-def convert_dataset(
-    repo_id: str,
-    branch: str | None = None,
-    num_workers: int = 4,
-):
-    with SuppressWarnings():
-        dataset = LeRobotDataset(repo_id, revision=V20, force_cache_sync=True)
-
-    convert_stats(dataset, num_workers=num_workers)
-    ref_stats = load_stats(dataset.root)
-    check_aggregate_stats(dataset, ref_stats)
-
-    dataset.meta.info["codebase_version"] = CODEBASE_VERSION
-    write_info(dataset.meta.info, dataset.root)
-
-    dataset.push_to_hub(branch=branch, tag_version=False, allow_patterns="meta/")
-
-    # delete old stats.json file
-    if (dataset.root / STATS_PATH).is_file:
-        (dataset.root / STATS_PATH).unlink()
-
-    hub_api = HfApi()
-    if hub_api.file_exists(
-        repo_id=dataset.repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset"
-    ):
-        hub_api.delete_file(
-            path_in_repo=STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
-        )
-
-    hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--repo-id",
-        type=str,
-        required=True,
-        help="Repository identifier on Hugging Face: a community or a user name `/` the name of the dataset "
-        "(e.g. `lerobot/pusht`, `cadene/aloha_sim_insertion_human`).",
-    )
-    parser.add_argument(
-        "--branch",
-        type=str,
-        default=None,
-        help="Repo branch to push your dataset. Defaults to the main branch.",
-    )
-    parser.add_argument(
-        "--num-workers",
-        type=int,
-        default=4,
-        help="Number of workers for parallelizing stats compute. Defaults to 4.",
-    )
-
-    args = parser.parse_args()
-    convert_dataset(**vars(args))
@@ -1,114 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from pathlib import Path
-
-import jsonlines
-import numpy as np
-from tqdm import tqdm
-
-from lerobot.datasets.compute_stats import aggregate_stats, get_feature_stats, sample_indices
-from lerobot.datasets.lerobot_dataset import LeRobotDataset
-from lerobot.datasets.utils import LEGACY_EPISODES_STATS_PATH, serialize_dict
-
-
-def append_jsonlines(data: dict, fpath: Path) -> None:
-    fpath.parent.mkdir(exist_ok=True, parents=True)
-    with jsonlines.open(fpath, "a") as writer:
-        writer.write(data)
-
-
-def legacy_write_episode_stats(episode_index: int, episode_stats: dict, local_dir: Path):
-    # We wrap episode_stats in a dictionary since `episode_stats["episode_index"]`
-    # is a dictionary of stats and not an integer.
-    episode_stats = {"episode_index": episode_index, "stats": serialize_dict(episode_stats)}
-    append_jsonlines(episode_stats, local_dir / LEGACY_EPISODES_STATS_PATH)
-
-
-def sample_episode_video_frames(dataset: LeRobotDataset, episode_index: int, ft_key: str) -> np.ndarray:
-    ep_len = dataset.meta.episodes[episode_index]["length"]
-    sampled_indices = sample_indices(ep_len)
-    query_timestamps = dataset._get_query_timestamps(0.0, {ft_key: sampled_indices})
-    video_frames = dataset._query_videos(query_timestamps, episode_index)
-    return video_frames[ft_key].numpy()
-
-
-def convert_episode_stats(dataset: LeRobotDataset, ep_idx: int):
-    ep_start_idx = dataset.episode_data_index["from"][ep_idx]
-    ep_end_idx = dataset.episode_data_index["to"][ep_idx]
-    ep_data = dataset.hf_dataset.select(range(ep_start_idx, ep_end_idx))
-
-    ep_stats = {}
-    for key, ft in dataset.features.items():
-        if ft["dtype"] == "video":
-            # We sample only for videos
-            ep_ft_data = sample_episode_video_frames(dataset, ep_idx, key)
-        else:
-            ep_ft_data = np.array(ep_data[key])
-
-        axes_to_reduce = (0, 2, 3) if ft["dtype"] in ["image", "video"] else 0
-        keepdims = True if ft["dtype"] in ["image", "video"] else ep_ft_data.ndim == 1
-        ep_stats[key] = get_feature_stats(ep_ft_data, axis=axes_to_reduce, keepdims=keepdims)
-
-        if ft["dtype"] in ["image", "video"]:  # remove batch dim
-            ep_stats[key] = {
-                k: v if k == "count" else np.squeeze(v, axis=0) for k, v in ep_stats[key].items()
-            }
-
-    dataset.meta.episodes_stats[ep_idx] = ep_stats
-
-
-def convert_stats(dataset: LeRobotDataset, num_workers: int = 0):
-    assert dataset.episodes is None
-    print("Computing episodes stats")
-    total_episodes = dataset.meta.total_episodes
-    if num_workers > 0:
-        with ThreadPoolExecutor(max_workers=num_workers) as executor:
-            futures = {
-                executor.submit(convert_episode_stats, dataset, ep_idx): ep_idx
-                for ep_idx in range(total_episodes)
-            }
-            for future in tqdm(as_completed(futures), total=total_episodes):
-                future.result()
-    else:
-        for ep_idx in tqdm(range(total_episodes)):
-            convert_episode_stats(dataset, ep_idx)
-
-    for ep_idx in tqdm(range(total_episodes)):
-        legacy_write_episode_stats(ep_idx, dataset.meta.episodes_stats[ep_idx], dataset.root)
-
-
-def check_aggregate_stats(
-    dataset: LeRobotDataset,
-    reference_stats: dict[str, dict[str, np.ndarray]],
-    video_rtol_atol: tuple[float] = (1e-2, 1e-2),
-    default_rtol_atol: tuple[float] = (5e-6, 6e-5),
-):
-    """Verifies that the aggregated stats from episodes_stats are close to reference stats."""
-    agg_stats = aggregate_stats(list(dataset.meta.episodes_stats.values()))
-    for key, ft in dataset.features.items():
-        # These values might need some fine-tuning
-        if ft["dtype"] == "video":
-            # to account for image sub-sampling
-            rtol, atol = video_rtol_atol
-        else:
-            rtol, atol = default_rtol_atol
-
-        for stat, val in agg_stats[key].items():
-            if key in reference_stats and stat in reference_stats[key]:
-                err_msg = f"feature='{key}' stats='{stat}'"
-                np.testing.assert_allclose(
-                    val, reference_stats[key][stat], rtol=rtol, atol=atol, err_msg=err_msg
-                )