feat(VideoEncoderConfig): propagating the VideoEncoderConfig in the codebase

This commit is contained in:
CarolinePascal
2026-04-22 21:18:29 +02:00
parent 04bfa75a0a
commit 3942061e22
5 changed files with 157 additions and 114 deletions
+34 -49
View File
@@ -62,7 +62,7 @@ from .utils import (
DEFAULT_EPISODES_PATH,
update_chunk_file_indices,
)
from .video_utils import encode_video_frames, get_video_info
from .video_utils import VideoEncoderConfig, encode_video_frames, get_video_info
def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict:
@@ -92,6 +92,7 @@ def delete_episodes(
episode_indices: list[int],
output_dir: str | Path | None = None,
repo_id: str | None = None,
camera_encoder_config: VideoEncoderConfig | None = None,
) -> LeRobotDataset:
"""Delete episodes from a LeRobotDataset and create a new dataset.
@@ -100,6 +101,7 @@ def delete_episodes(
episode_indices: List of episode indices to delete.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
camera_encoder_config: Video encoder settings used when re-encoding video segments (default: :class:`VideoEncoderConfig()`).
"""
if not episode_indices:
raise ValueError("No episodes to delete")
@@ -132,7 +134,7 @@ def delete_episodes(
video_metadata = None
if dataset.meta.video_keys:
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping)
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder_config)
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
@@ -154,6 +156,7 @@ def split_dataset(
dataset: LeRobotDataset,
splits: dict[str, float | list[int]],
output_dir: str | Path | None = None,
camera_encoder_config: VideoEncoderConfig | None = None,
) -> dict[str, LeRobotDataset]:
"""Split a LeRobotDataset into multiple smaller datasets.
@@ -162,6 +165,7 @@ def split_dataset(
splits: Either a dict mapping split names to episode indices, or a dict mapping
split names to fractions (must sum to <= 1.0).
output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
camera_encoder_config: Video encoder settings used when re-encoding video segments (default: :class:`VideoEncoderConfig()`).
Examples:
Split by specific episodes
@@ -222,7 +226,9 @@ def split_dataset(
video_metadata = None
if dataset.meta.video_keys:
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping)
video_metadata = _copy_and_reindex_videos(
dataset, new_meta, episode_mapping, camera_encoder_config
)
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
@@ -578,8 +584,7 @@ def _keep_episodes_from_video_with_av(
output_path: Path,
episodes_to_keep: list[tuple[int, int]],
fps: float,
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
camera_encoder_config: VideoEncoderConfig | None = None,
) -> None:
"""Keep only specified episodes from a video file using PyAV.
@@ -593,9 +598,10 @@ def _keep_episodes_from_video_with_av(
Ranges are half-open intervals: [start_frame, end_frame), where start_frame
is inclusive and end_frame is exclusive.
fps: Frame rate of the video.
vcodec: Video codec to use for encoding.
pix_fmt: Pixel format for output video.
camera_encoder_config: Video encoder settings (default: :class:`VideoEncoderConfig()`).
"""
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
from fractions import Fraction
import av
@@ -619,12 +625,12 @@ def _keep_episodes_from_video_with_av(
# Convert fps to Fraction for PyAV compatibility.
fps_fraction = Fraction(fps).limit_denominator(1000)
v_out = out.add_stream(vcodec, rate=fps_fraction)
v_out = out.add_stream(camera_encoder_config.vcodec, rate=fps_fraction)
# PyAV type stubs don't distinguish video streams from audio/subtitle streams.
v_out.width = v_in.codec_context.width
v_out.height = v_in.codec_context.height
v_out.pix_fmt = pix_fmt
v_out.pix_fmt = camera_encoder_config.pix_fmt
# Set time_base to match the frame rate for proper timestamp handling.
v_out.time_base = Fraction(1, int(fps))
@@ -687,8 +693,7 @@ def _copy_and_reindex_videos(
src_dataset: LeRobotDataset,
dst_meta: LeRobotDatasetMetadata,
episode_mapping: dict[int, int],
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
camera_encoder_config: VideoEncoderConfig | None = None,
) -> dict[int, dict]:
"""Copy and filter video files, only re-encoding files with deleted episodes.
@@ -700,10 +705,13 @@ def _copy_and_reindex_videos(
src_dataset: Source dataset to copy from
dst_meta: Destination metadata object
episode_mapping: Mapping from old episode indices to new indices
camera_encoder_config: Video encoder settings used when re-encoding segments (default: :class:`VideoEncoderConfig()`).
Returns:
dict mapping episode index to its video metadata (chunk_index, file_index, timestamps)
"""
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
if src_dataset.meta.episodes is None:
src_dataset.meta.episodes = load_episodes(src_dataset.meta.root)
@@ -792,8 +800,7 @@ def _copy_and_reindex_videos(
dst_video_path,
episodes_to_keep_ranges,
src_dataset.meta.fps,
vcodec,
pix_fmt,
camera_encoder_config,
)
cumulative_ts = 0.0
@@ -1264,11 +1271,7 @@ def _estimate_frame_size_via_calibration(
episode_indices: list[int],
temp_dir: Path,
fps: int,
vcodec: str,
pix_fmt: str,
g: int,
crf: int,
fast_decode: int,
camera_encoder_config: VideoEncoderConfig,
num_calibration_frames: int = 30,
) -> float:
"""Estimate MB per frame by encoding a small calibration sample.
@@ -1282,11 +1285,7 @@ def _estimate_frame_size_via_calibration(
episode_indices: List of episode indices being processed.
temp_dir: Temporary directory for calibration files.
fps: Frames per second for video encoding.
vcodec: Video codec (libsvtav1, h264, hevc).
pix_fmt: Pixel format (yuv420p, etc.).
g: GOP size (group of pictures).
crf: Constant Rate Factor (quality).
fast_decode: Fast decode tuning parameter.
camera_encoder_config: Video encoder settings used for calibration encoding.
num_calibration_frames: Number of frames to use for calibration (default: 30).
Returns:
@@ -1322,11 +1321,7 @@ def _estimate_frame_size_via_calibration(
imgs_dir=calibration_dir,
video_path=calibration_video_path,
fps=fps,
vcodec=vcodec,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
camera_encoder_config=camera_encoder_config,
overwrite=True,
)
@@ -1644,11 +1639,7 @@ def convert_image_to_video_dataset(
dataset: LeRobotDataset,
output_dir: Path | None = None,
repo_id: str | None = None,
vcodec: str = "libsvtav1",
pix_fmt: str = "yuv420p",
g: int = 2,
crf: int = 30,
fast_decode: int = 0,
camera_encoder_config: VideoEncoderConfig | None = None,
episode_indices: list[int] | None = None,
num_workers: int = 4,
max_episodes_per_batch: int | None = None,
@@ -1663,11 +1654,7 @@ def convert_image_to_video_dataset(
dataset: The source LeRobot dataset with images
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
vcodec: Video codec (default: libsvtav1)
pix_fmt: Pixel format (default: yuv420p)
g: Group of pictures size (default: 2)
crf: Constant rate factor (default: 30)
fast_decode: Fast decode tuning (default: 0)
camera_encoder_config: Video encoder settings (default: :class:`VideoEncoderConfig()`).
episode_indices: List of episode indices to convert (None = all episodes)
num_workers: Number of threads for parallel processing (default: 4)
max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
@@ -1676,6 +1663,9 @@ def convert_image_to_video_dataset(
Returns:
New LeRobotDataset with images encoded as videos
"""
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
# Check that it's an image dataset
if len(dataset.meta.video_keys) > 0:
raise ValueError(
@@ -1699,7 +1689,10 @@ def convert_image_to_video_dataset(
logging.info(
f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}"
)
logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}")
logging.info(
f"Video codec: {camera_encoder_config.vcodec}, pixel format: {camera_encoder_config.pix_fmt}, "
f"GOP: {camera_encoder_config.g}, CRF: {camera_encoder_config.crf}"
)
# Create new features dict, converting image features to video features
new_features = {}
@@ -1769,11 +1762,7 @@ def convert_image_to_video_dataset(
episode_indices=episode_indices,
temp_dir=temp_dir,
fps=fps,
vcodec=vcodec,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
camera_encoder_config=camera_encoder_config,
)
logging.info(f"Processing camera: {img_key}")
@@ -1815,11 +1804,7 @@ def convert_image_to_video_dataset(
imgs_dir=imgs_dir,
video_path=video_path,
fps=fps,
vcodec=vcodec,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
camera_encoder_config=camera_encoder_config,
overwrite=True,
)
+21 -9
View File
@@ -52,6 +52,7 @@ from .utils import (
)
from .video_utils import (
StreamingVideoEncoder,
VideoEncoderConfig,
concatenate_video_files,
encode_video_frames,
get_video_duration_in_s,
@@ -65,14 +66,19 @@ def _encode_video_worker(
episode_index: int,
root: Path,
fps: int,
vcodec: str = "libsvtav1",
camera_encoder_config: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
img_dir = (root / fpath).parent
encode_video_frames(
img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads
img_dir,
temp_path,
fps,
camera_encoder_config=camera_encoder_config,
encoder_threads=encoder_threads,
overwrite=True,
)
shutil.rmtree(img_dir)
return temp_path
@@ -89,20 +95,21 @@ class DatasetWriter:
self,
meta: LeRobotDatasetMetadata,
root: Path,
vcodec: str,
camera_encoder_config: VideoEncoderConfig,
encoder_threads: int | None,
batch_encoding_size: int,
streaming_encoder: StreamingVideoEncoder | None = None,
initial_frames: int = 0,
):
"""Initialize the writer with metadata, codec, and encoding config.
"""Initialize the writer with metadata, codec, and encoder config.
Args:
meta: Dataset metadata instance (used for feature schema, chunk
settings, and episode persistence).
root: Local dataset root directory.
vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``).
encoder_threads: Threads per encoder instance. ``None`` for auto.
camera_encoder_config: Video encoder settings applied to all cameras.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos.
streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder`
@@ -111,7 +118,7 @@ class DatasetWriter:
"""
self._meta = meta
self._root = root
self._vcodec = vcodec
self._camera_encoder_config = camera_encoder_config
self._encoder_threads = encoder_threads
self._batch_encoding_size = batch_encoding_size
self._streaming_encoder = streaming_encoder
@@ -284,7 +291,7 @@ class DatasetWriter:
episode_index,
self._root,
self._meta.fps,
self._vcodec,
self._camera_encoder_config,
self._encoder_threads,
): video_key
for video_key in self._meta.video_keys
@@ -564,7 +571,12 @@ class DatasetWriter:
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
"""Use ffmpeg to convert frames stored as png into mp4 videos."""
return _encode_video_worker(
video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads
video_key,
episode_index,
self._root,
self._meta.fps,
self._camera_encoder_config,
self._encoder_threads,
)
def close_writer(self) -> None:
+48 -40
View File
@@ -37,7 +37,7 @@ from .utils import (
from .video_utils import (
StreamingVideoEncoder,
get_safe_default_video_backend,
resolve_vcodec,
VideoEncoderConfig,
)
logger = logging.getLogger(__name__)
@@ -58,10 +58,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None,
return_uint8: bool = False,
batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
camera_encoder_config: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
):
"""
2 modes are available for instantiating this class, depending on 2 different use cases:
@@ -177,16 +177,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc',
'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'.
Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder.
camera_encoder_config (VideoEncoderConfig | None, optional): Video encoder settings for cameras
(codec, quality, etc.). Defaults to
:class:`~lerobot.datasets.video_utils.VideoEncoderConfig` defaults when ``None``.
encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
codec decide.
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False.
encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using
streaming encoding. Defaults to 30 (~1s at 30fps).
encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the
codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for
libsvtav1 and 'threads' for h264/hevc.
Note:
Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to
@@ -205,7 +204,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
self._video_backend = video_backend if video_backend else get_safe_default_video_backend()
self._return_uint8 = return_uint8
self._batch_encoding_size = batch_encoding_size
self._vcodec = resolve_vcodec(vcodec)
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
self._camera_encoder_config = camera_encoder_config
self._encoder_threads = encoder_threads
if self._requested_root is not None:
@@ -251,13 +252,16 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = None
if streaming_encoding and len(self.meta.video_keys) > 0:
streaming_enc = self._build_streaming_encoder(
self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads
self.meta.fps,
self._camera_encoder_config,
self._encoder_threads,
encoder_queue_maxsize,
)
self.writer = DatasetWriter(
meta=self.meta,
root=self.root,
vcodec=self._vcodec,
encoder_threads=encoder_threads,
camera_encoder_config=self._camera_encoder_config,
encoder_threads=self._encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
initial_frames=self.meta.total_frames,
@@ -298,19 +302,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
@staticmethod
def _build_streaming_encoder(
fps: int,
vcodec: str,
encoder_queue_maxsize: int,
camera_encoder_config: VideoEncoderConfig,
encoder_threads: int | None,
encoder_queue_maxsize: int,
) -> StreamingVideoEncoder:
return StreamingVideoEncoder(
fps=fps,
vcodec=vcodec,
pix_fmt="yuv420p",
g=2,
crf=30,
preset=None,
queue_maxsize=encoder_queue_maxsize,
camera_encoder_config=camera_encoder_config,
encoder_threads=encoder_threads,
queue_maxsize=encoder_queue_maxsize,
)
# ── Metadata properties ───────────────────────────────────────────
@@ -624,8 +624,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
image_writer_processes: int = 0,
image_writer_threads: int = 0,
video_backend: str | None = None,
camera_encoder_config: VideoEncoderConfig | None = None,
batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
metadata_buffer_size: int = 10,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
@@ -656,20 +656,23 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend (used when reading back).
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos. ``1`` means encode immediately.
vcodec: Video codec for encoding. Options include ``'libsvtav1'``,
``'h264'``, ``'hevc'``, ``'auto'``.
camera_encoder_config: Video encoder settings for cameras; defaults
match :class:`~lerobot.datasets.video_utils.VideoEncoderConfig`
when ``None``.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
metadata_buffer_size: Number of episode metadata records to buffer
before flushing to parquet.
streaming_encoding: If ``True``, encode video frames in real-time
during capture instead of writing images first.
encoder_queue_maxsize: Max buffered frames per camera when using
streaming encoding.
encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns:
A new :class:`LeRobotDataset` in write mode.
"""
vcodec = resolve_vcodec(vcodec)
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
obj = cls.__new__(cls)
obj.meta = LeRobotDatasetMetadata.create(
repo_id=repo_id,
@@ -693,20 +696,21 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend()
obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size
obj._vcodec = vcodec
obj._camera_encoder_config = camera_encoder_config
obj._encoder_threads = encoder_threads
# Reader is lazily created on first access (write-only mode)
obj.reader = None
# Create writer
streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads)
streaming_enc = cls._build_streaming_encoder(
fps, camera_encoder_config, encoder_threads, encoder_queue_maxsize
)
obj.writer = DatasetWriter(
meta=obj.meta,
root=obj.root,
vcodec=vcodec,
camera_encoder_config=camera_encoder_config,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
@@ -729,12 +733,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
force_cache_sync: bool = False,
video_backend: str | None = None,
batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
camera_encoder_config: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
image_writer_processes: int = 0,
image_writer_threads: int = 0,
streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
) -> "LeRobotDataset":
"""Resume recording on an existing dataset.
@@ -757,13 +761,16 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend for reading back data.
batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos.
vcodec: Video codec for encoding.
camera_encoder_config: Video encoder settings for cameras; defaults
match :class:`~lerobot.datasets.video_utils.VideoEncoderConfig`
when ``None``.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
image_writer_processes: Subprocesses for async image writing.
image_writer_threads: Threads for async image writing.
streaming_encoding: If ``True``, encode video in real-time during
capture.
encoder_queue_maxsize: Max buffered frames per camera for streaming.
encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns:
A :class:`LeRobotDataset` in write mode, ready to append episodes.
@@ -774,7 +781,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
"Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt "
"the shared cache. Please provide a local directory path."
)
vcodec = resolve_vcodec(vcodec)
obj = cls.__new__(cls)
obj.repo_id = repo_id
obj._requested_root = Path(root)
@@ -786,8 +792,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj._video_backend = video_backend if video_backend else get_safe_default_video_backend()
obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size
obj._vcodec = vcodec
obj._encoder_threads = encoder_threads
if obj._requested_root is not None:
obj._requested_root.mkdir(exist_ok=True, parents=True)
@@ -796,21 +800,25 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.meta = LeRobotDatasetMetadata(
obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync
)
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
obj._camera_encoder_config = camera_encoder_config
obj._encoder_threads = encoder_threads
obj.root = obj.meta.root
# Reader is lazily created on first access (write-only mode)
obj.reader = None
# Create writer for appending
streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder(
obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads
obj.meta.fps, camera_encoder_config, encoder_threads, encoder_queue_maxsize
)
obj.writer = DatasetWriter(
meta=obj.meta,
root=obj.root,
vcodec=vcodec,
camera_encoder_config=camera_encoder_config,
encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc,
+26 -11
View File
@@ -49,6 +49,14 @@ Delete episodes and save to a new dataset at a specific path and with a new repo
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]"
Delete episodes and re-encode video segments with h264:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]" \
--operation.camera_encoder_config.vcodec h264 \
--operation.camera_encoder_config.crf 23
Split dataset by fractions (pusht_train, pusht_val):
lerobot-edit-dataset \
--repo_id lerobot/pusht \
@@ -74,6 +82,14 @@ Split into more than two splits:
--operation.type split \
--operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}'
Split dataset and re-encode video segments with h264:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type split \
--operation.splits '{"train": 0.8, "val": 0.2}' \
--operation.camera_encoder_config.vcodec h264 \
--operation.camera_encoder_config.crf 23
Merge multiple datasets:
lerobot-edit-dataset \
--new_repo_id lerobot/pusht_merged \
@@ -187,7 +203,7 @@ import abc
import logging
import shutil
import sys
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
import draccus
@@ -195,6 +211,8 @@ import draccus
from lerobot.configs import parser
from lerobot.datasets import (
LeRobotDataset,
VideoEncoderConfig,
camera_encoder_defaults,
convert_image_to_video_dataset,
delete_episodes,
merge_datasets,
@@ -218,12 +236,14 @@ class OperationConfig(draccus.ChoiceRegistry, abc.ABC):
@dataclass
class DeleteEpisodesConfig(OperationConfig):
episode_indices: list[int] | None = None
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
@OperationConfig.register_subclass("split")
@dataclass
class SplitConfig(OperationConfig):
splits: dict[str, float | list[int]] | None = None
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
@OperationConfig.register_subclass("merge")
@@ -250,11 +270,7 @@ class ModifyTasksConfig(OperationConfig):
@dataclass
class ConvertImageToVideoConfig(OperationConfig):
output_dir: str | None = None
vcodec: str = "libsvtav1"
pix_fmt: str = "yuv420p"
g: int = 2
crf: int = 30
fast_decode: int = 0
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
episode_indices: list[int] | None = None
num_workers: int = 4
max_episodes_per_batch: int | None = None
@@ -356,6 +372,7 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
episode_indices=cfg.operation.episode_indices,
output_dir=output_dir,
repo_id=output_repo_id,
camera_encoder_config=cfg.operation.camera_encoder_config,
)
logging.info(f"Dataset saved to {output_dir}")
@@ -387,6 +404,7 @@ def handle_split(cfg: EditDatasetConfig) -> None:
dataset,
splits=cfg.operation.splits,
output_dir=cfg.new_root,
camera_encoder_config=cfg.operation.camera_encoder_config,
)
for split_name, split_ds in split_datasets.items():
@@ -557,11 +575,8 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
dataset=dataset,
output_dir=output_dir,
repo_id=output_repo_id,
vcodec=getattr(cfg.operation, "vcodec", "libsvtav1"),
pix_fmt=getattr(cfg.operation, "pix_fmt", "yuv420p"),
g=getattr(cfg.operation, "g", 2),
crf=getattr(cfg.operation, "crf", 30),
fast_decode=getattr(cfg.operation, "fast_decode", 0),
camera_encoder_config=getattr(cfg.operation, "camera_encoder_config", None)
or camera_encoder_defaults(),
episode_indices=getattr(cfg.operation, "episode_indices", None),
num_workers=getattr(cfg.operation, "num_workers", 4),
max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None),
+28 -5
View File
@@ -63,6 +63,27 @@ lerobot-record \\
--dataset.streaming_encoding=true \\
--dataset.encoder_threads=2
```
Example recording with custom video encoding parameters:
```shell
lerobot-record \\
--robot.type=so100_follower \\
--robot.port=/dev/tty.usbmodem58760431541 \\
--robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \\
--robot.id=black \\
--teleop.type=so100_leader \\
--teleop.port=/dev/tty.usbmodem58760431551 \\
--teleop.id=blue \\
--dataset.repo_id=<my_username>/<my_dataset_name> \\
--dataset.num_episodes=2 \\
--dataset.single_task="Grab the cube" \\
--dataset.streaming_encoding=true \\
--dataset.encoder_threads=2 \\
--dataset.camera_encoder_config.vcodec=h264 \\
--dataset.camera_encoder_config.preset=fast \\
    --dataset.camera_encoder_config.extra_options='{"tune": "film", "profile:v": "high", "bf": 2}' \\
--display_data=true
```
"""
import logging
@@ -84,8 +105,10 @@ from lerobot.configs import parser
from lerobot.configs.dataset import DatasetRecordConfig
from lerobot.datasets import (
LeRobotDataset,
VideoEncoderConfig,
VideoEncodingManager,
aggregate_pipeline_dataset_features,
camera_encoder_defaults,
create_initial_features,
safe_stop_image_writer,
)
@@ -377,10 +400,10 @@ def record(
cfg.dataset.repo_id,
root=cfg.dataset.root,
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
vcodec=cfg.dataset.vcodec,
camera_encoder_config=cfg.dataset.camera_encoder_config,
encoder_threads=cfg.dataset.encoder_threads,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,
image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras
if num_cameras > 0
@@ -406,10 +429,10 @@ def record(
image_writer_processes=cfg.dataset.num_image_writer_processes,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
batch_encoding_size=cfg.dataset.video_encoding_batch_size,
vcodec=cfg.dataset.vcodec,
camera_encoder_config=cfg.dataset.camera_encoder_config,
encoder_threads=cfg.dataset.encoder_threads,
streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,
)
robot.connect()
@@ -420,7 +443,7 @@ def record(
if not cfg.dataset.streaming_encoding:
logging.info(
"Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
"Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder_config.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
)
with VideoEncodingManager(dataset):