fix(re-encoding): removing inconsistent re-encoding option in lerobot_edit_dataset

This commit is contained in:
CarolinePascal
2026-05-13 16:22:09 +02:00
parent f0830972f0
commit cf6f92ac74
4 changed files with 76 additions and 39 deletions
+33
View File
@@ -16,6 +16,8 @@
"""Video encoder configurations.""" """Video encoder configurations."""
from __future__ import annotations
import logging import logging
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any from typing import Any
@@ -35,6 +37,9 @@ HW_VIDEO_CODECS = [
"h264_qsv", # Intel Quick Sync "h264_qsv", # Intel Quick Sync
] ]
VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS}) VALID_VIDEO_CODECS: frozenset[str] = frozenset({"h264", "hevc", "libsvtav1", "auto", *HW_VIDEO_CODECS})
# Aliases mapping legacy / stream-derived codec names (e.g. "av1") to the encoder names in VALID_VIDEO_CODECS.
VIDEO_CODECS_ALIASES: dict[str, str] = {"av1": "libsvtav1"}
LIBSVTAV1_DEFAULT_PRESET: int = 12 LIBSVTAV1_DEFAULT_PRESET: int = 12
@@ -88,6 +93,30 @@ class VideoEncoderConfig:
self.preset = LIBSVTAV1_DEFAULT_PRESET self.preset = LIBSVTAV1_DEFAULT_PRESET
self.validate() self.validate()
@classmethod
def from_video_info(cls, video_info: dict | None) -> VideoEncoderConfig:
    """Build a :class:`VideoEncoderConfig` from a video feature's ``info`` block.

    Keys that are absent from ``video_info``, or stored as ``None``, are not
    forwarded to the constructor, so the matching fields keep their class
    defaults.
    """
    info = video_info if video_info else {}

    # Keys whose persisted name differs from the config field they populate.
    renamed_keys = {"video.codec": "vcodec", "video.pix_fmt": "pix_fmt"}
    overrides: dict[str, Any] = {
        target: info[source] for source, target in renamed_keys.items() if info.get(source) is not None
    }

    for name in VIDEO_ENCODER_INFO_FIELD_NAMES:
        stored = info.get(f"video.{name}")
        # ``extra_options`` is persisted as ``{}`` after merges with disagreeing
        # sources, so an empty value is handled like a missing one.
        if stored is None or (name == "extra_options" and not stored):
            continue
        overrides[name] = stored

    return cls(**overrides)
def detect_available_encoders(self, encoders: list[str] | str) -> list[str]: def detect_available_encoders(self, encoders: list[str] | str) -> list[str]:
"""Return the subset of available encoders based on the specified video backend. """Return the subset of available encoders based on the specified video backend.
@@ -116,7 +145,11 @@ class VideoEncoderConfig:
For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the For ``"auto"``, the first hardware encoder in the preference list that is available is chosen; if none are available, ``libsvtav1`` is used. If the
resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``. resolved codec (explicit or after auto-selection) is not available, raises ``ValueError``.
Stream-derived canonical codec names listed in :data:`VIDEO_CODECS_ALIASES` are
rewritten to their corresponding encoder name (e.g. ``"av1"`` → ``"libsvtav1"``).
""" """
self.vcodec = VIDEO_CODECS_ALIASES.get(self.vcodec, self.vcodec)
if self.vcodec not in VALID_VIDEO_CODECS: if self.vcodec not in VALID_VIDEO_CODECS:
raise ValueError(f"Invalid vcodec '{self.vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}") raise ValueError(f"Invalid vcodec '{self.vcodec}'. Must be one of: {sorted(VALID_VIDEO_CODECS)}")
if self.vcodec == "auto": if self.vcodec == "auto":
+20 -19
View File
@@ -96,17 +96,19 @@ def delete_episodes(
episode_indices: list[int], episode_indices: list[int],
output_dir: str | Path | None = None, output_dir: str | Path | None = None,
repo_id: str | None = None, repo_id: str | None = None,
camera_encoder: VideoEncoderConfig | None = None,
) -> LeRobotDataset: ) -> LeRobotDataset:
"""Delete episodes from a LeRobotDataset and create a new dataset. """Delete episodes from a LeRobotDataset and create a new dataset.
Video segments that need re-encoding (because the source file mixes kept and
deleted episodes) are re-encoded with the source dataset's existing encoder
settings — read back from ``meta/info.json`` — so the output dataset stays
consistent with its own metadata.
Args: Args:
dataset: The source LeRobotDataset. dataset: The source LeRobotDataset.
episode_indices: List of episode indices to delete. episode_indices: List of episode indices to delete.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
camera_encoder: Video encoder settings used when re-encoding video segments
(``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
""" """
if not episode_indices: if not episode_indices:
raise ValueError("No episodes to delete") raise ValueError("No episodes to delete")
@@ -139,7 +141,7 @@ def delete_episodes(
video_metadata = None video_metadata = None
if dataset.meta.video_keys: if dataset.meta.video_keys:
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder) video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping)
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
@@ -161,17 +163,19 @@ def split_dataset(
dataset: LeRobotDataset, dataset: LeRobotDataset,
splits: dict[str, float | list[int]], splits: dict[str, float | list[int]],
output_dir: str | Path | None = None, output_dir: str | Path | None = None,
camera_encoder: VideoEncoderConfig | None = None,
) -> dict[str, LeRobotDataset]: ) -> dict[str, LeRobotDataset]:
"""Split a LeRobotDataset into multiple smaller datasets. """Split a LeRobotDataset into multiple smaller datasets.
Video segments that need re-encoding (because the source file mixes episodes
that fall into different splits) are re-encoded with the source dataset's
existing encoder settings — read back from ``meta/info.json`` — so each
output split stays consistent with its own metadata.
Args: Args:
dataset: The source LeRobotDataset to split. dataset: The source LeRobotDataset to split.
splits: Either a dict mapping split names to episode indices, or a dict mapping splits: Either a dict mapping split names to episode indices, or a dict mapping
split names to fractions (must sum to <= 1.0). split names to fractions (must sum to <= 1.0).
output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
camera_encoder: Video encoder settings used when re-encoding video segments
(``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
Examples: Examples:
Split by specific episodes Split by specific episodes
@@ -232,7 +236,7 @@ def split_dataset(
video_metadata = None video_metadata = None
if dataset.meta.video_keys: if dataset.meta.video_keys:
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder) video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping)
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
@@ -588,7 +592,7 @@ def _keep_episodes_from_video_with_av(
output_path: Path, output_path: Path,
episodes_to_keep: list[tuple[int, int]], episodes_to_keep: list[tuple[int, int]],
fps: float, fps: float,
camera_encoder: VideoEncoderConfig | None = None, camera_encoder: VideoEncoderConfig,
) -> None: ) -> None:
"""Keep only specified episodes from a video file using PyAV. """Keep only specified episodes from a video file using PyAV.
@@ -602,11 +606,8 @@ def _keep_episodes_from_video_with_av(
Ranges are half-open intervals: [start_frame, end_frame), where start_frame Ranges are half-open intervals: [start_frame, end_frame), where start_frame
is inclusive and end_frame is exclusive. is inclusive and end_frame is exclusive.
fps: Frame rate of the video. fps: Frame rate of the video.
camera_encoder: Video encoder settings camera_encoder: Video encoder settings used to re-encode the kept frames.
(``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
""" """
if camera_encoder is None:
camera_encoder = camera_encoder_defaults()
from fractions import Fraction from fractions import Fraction
import av import av
@@ -699,26 +700,23 @@ def _copy_and_reindex_videos(
src_dataset: LeRobotDataset, src_dataset: LeRobotDataset,
dst_meta: LeRobotDatasetMetadata, dst_meta: LeRobotDatasetMetadata,
episode_mapping: dict[int, int], episode_mapping: dict[int, int],
camera_encoder: VideoEncoderConfig | None = None,
) -> dict[int, dict]: ) -> dict[int, dict]:
"""Copy and filter video files, only re-encoding files with deleted episodes. """Copy and filter video files, only re-encoding files with deleted episodes.
For video files that only contain kept episodes, we copy them directly. For video files that only contain kept episodes, we copy them directly.
For files with mixed kept/deleted episodes, we use PyAV filters to efficiently For files with mixed kept/deleted episodes, we use PyAV filters to efficiently
re-encode only the desired segments. re-encode only the desired segments. The encoder used for re-encoding is
derived per video key from the source dataset's ``meta/info.json`` so the
destination metadata keeps describing the videos accurately.
Args: Args:
src_dataset: Source dataset to copy from src_dataset: Source dataset to copy from
dst_meta: Destination metadata object dst_meta: Destination metadata object
episode_mapping: Mapping from old episode indices to new indices episode_mapping: Mapping from old episode indices to new indices
camera_encoder: Video encoder settings used when re-encoding segments
(``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
Returns: Returns:
dict mapping episode index to its video metadata (chunk_index, file_index, timestamps) dict mapping episode index to its video metadata (chunk_index, file_index, timestamps)
""" """
if camera_encoder is None:
camera_encoder = camera_encoder_defaults()
if src_dataset.meta.episodes is None: if src_dataset.meta.episodes is None:
src_dataset.meta.episodes = load_episodes(src_dataset.meta.root) src_dataset.meta.episodes = load_episodes(src_dataset.meta.root)
@@ -726,6 +724,9 @@ def _copy_and_reindex_videos(
for video_key in src_dataset.meta.video_keys: for video_key in src_dataset.meta.video_keys:
logging.info(f"Processing videos for {video_key}") logging.info(f"Processing videos for {video_key}")
camera_encoder = VideoEncoderConfig.from_video_info(
src_dataset.meta.info.features.get(video_key, {}).get("info")
)
if dst_meta.video_path is None: if dst_meta.video_path is None:
raise ValueError("Destination metadata has no video_path defined") raise ValueError("Destination metadata has no video_path defined")
@@ -49,14 +49,6 @@ Delete episodes and save to a new dataset at a specific path and with a new repo
--operation.type delete_episodes \ --operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]" --operation.episode_indices "[0, 2, 5]"
Delete episodes and re-encode video segments with h264:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]" \
--operation.camera_encoder.vcodec h264 \
--operation.camera_encoder.crf 23
Split dataset by fractions (pusht_train, pusht_val): Split dataset by fractions (pusht_train, pusht_val):
lerobot-edit-dataset \ lerobot-edit-dataset \
--repo_id lerobot/pusht \ --repo_id lerobot/pusht \
@@ -82,14 +74,6 @@ Split into more than two splits:
--operation.type split \ --operation.type split \
--operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}' --operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}'
Split dataset and re-encode video segments with h264:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type split \
--operation.splits '{"train": 0.8, "val": 0.2}' \
--operation.camera_encoder.vcodec h264 \
--operation.camera_encoder.crf 23
Merge multiple datasets: Merge multiple datasets:
lerobot-edit-dataset \ lerobot-edit-dataset \
--new_repo_id lerobot/pusht_merged \ --new_repo_id lerobot/pusht_merged \
@@ -234,14 +218,12 @@ class OperationConfig(draccus.ChoiceRegistry, abc.ABC):
@dataclass @dataclass
class DeleteEpisodesConfig(OperationConfig): class DeleteEpisodesConfig(OperationConfig):
episode_indices: list[int] | None = None episode_indices: list[int] | None = None
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
@OperationConfig.register_subclass("split") @OperationConfig.register_subclass("split")
@dataclass @dataclass
class SplitConfig(OperationConfig): class SplitConfig(OperationConfig):
splits: dict[str, float | list[int]] | None = None splits: dict[str, float | list[int]] | None = None
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
@OperationConfig.register_subclass("merge") @OperationConfig.register_subclass("merge")
@@ -370,7 +352,6 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
episode_indices=cfg.operation.episode_indices, episode_indices=cfg.operation.episode_indices,
output_dir=output_dir, output_dir=output_dir,
repo_id=output_repo_id, repo_id=output_repo_id,
camera_encoder=cfg.operation.camera_encoder,
) )
logging.info(f"Dataset saved to {output_dir}") logging.info(f"Dataset saved to {output_dir}")
@@ -402,7 +383,6 @@ def handle_split(cfg: EditDatasetConfig) -> None:
dataset, dataset,
splits=cfg.operation.splits, splits=cfg.operation.splits,
output_dir=cfg.new_root, output_dir=cfg.new_root,
camera_encoder=cfg.operation.camera_encoder,
) )
for split_name, split_ds in split_datasets.items(): for split_name, split_ds in split_datasets.items():
+23
View File
@@ -36,6 +36,7 @@ from lerobot.datasets.video_utils import (
encode_video_frames, encode_video_frames,
get_video_info, get_video_info,
) )
from tests.fixtures.constants import DUMMY_VIDEO_INFO
# Per-codec skip markers — validation tests only fire when the codec is available # Per-codec skip markers — validation tests only fire when the codec is available
@@ -570,3 +571,25 @@ class TestEncoderConfigPersistence:
dataset.finalize() dataset.finalize()
assert _read_feature_info(dataset) == first_info assert _read_feature_info(dataset) == first_info
class TestFromVideoInfo:
    """Checks that ``VideoEncoderConfig.from_video_info`` rebuilds an encoder
    config out of the ``video.*`` keys persisted in a dataset's ``info.json``.
    """

    @require_libsvtav1
    def test_reconstructs_from_dummy_video_info(self):
        rebuilt = VideoEncoderConfig.from_video_info(DUMMY_VIDEO_INFO)

        # The canonical stream codec ``"av1"`` must come back as the encoder name.
        assert rebuilt.vcodec == "libsvtav1"

        # Fields expected to equal the value stored under their ``video.<name>`` key.
        for attr in ("pix_fmt", "g", "crf", "preset", "fast_decode", "video_backend"):
            assert getattr(rebuilt, attr) == DUMMY_VIDEO_INFO[f"video.{attr}"]

        # A ``{}`` placeholder (typical after a merge with disagreeing sources)
        # must not leak into the reconstructed config.
        assert rebuilt.extra_options == VideoEncoderConfig().extra_options