feat(VideoEncoderConfig): propagate VideoEncoderConfig through the codebase

This commit is contained in:
CarolinePascal
2026-04-22 21:18:29 +02:00
parent 479e444517
commit 38ff579bcc
6 changed files with 165 additions and 118 deletions
+8 -4
View File
@@ -39,6 +39,7 @@ from tqdm import tqdm
from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.lerobot_dataset import LeRobotDataset
from lerobot.datasets.video_utils import ( from lerobot.datasets.video_utils import (
VideoEncoderConfig,
decode_video_frames, decode_video_frames,
encode_video_frames, encode_video_frames,
) )
@@ -251,10 +252,13 @@ def benchmark_encoding_decoding(
imgs_dir=imgs_dir, imgs_dir=imgs_dir,
video_path=video_path, video_path=video_path,
fps=fps, fps=fps,
vcodec=encoding_cfg["vcodec"], camera_encoder_config=VideoEncoderConfig(
pix_fmt=encoding_cfg["pix_fmt"], vcodec=encoding_cfg["vcodec"],
g=encoding_cfg.get("g"), pix_fmt=encoding_cfg["pix_fmt"],
crf=encoding_cfg.get("crf"), g=encoding_cfg.get("g"),
crf=encoding_cfg.get("crf"),
preset=encoding_cfg.get("preset"),
),
# fast_decode=encoding_cfg.get("fastdecode"), # fast_decode=encoding_cfg.get("fastdecode"),
overwrite=True, overwrite=True,
) )
+34 -49
View File
@@ -62,7 +62,7 @@ from .utils import (
DEFAULT_EPISODES_PATH, DEFAULT_EPISODES_PATH,
update_chunk_file_indices, update_chunk_file_indices,
) )
from .video_utils import encode_video_frames, get_video_info from .video_utils import VideoEncoderConfig, encode_video_frames, get_video_info
def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict: def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict:
@@ -92,6 +92,7 @@ def delete_episodes(
episode_indices: list[int], episode_indices: list[int],
output_dir: str | Path | None = None, output_dir: str | Path | None = None,
repo_id: str | None = None, repo_id: str | None = None,
camera_encoder_config: VideoEncoderConfig | None = None,
) -> LeRobotDataset: ) -> LeRobotDataset:
"""Delete episodes from a LeRobotDataset and create a new dataset. """Delete episodes from a LeRobotDataset and create a new dataset.
@@ -100,6 +101,7 @@ def delete_episodes(
episode_indices: List of episode indices to delete. episode_indices: List of episode indices to delete.
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
camera_encoder_config: Video encoder settings used when re-encoding video segments. When ``None``, a default :class:`VideoEncoderConfig` is used.
""" """
if not episode_indices: if not episode_indices:
raise ValueError("No episodes to delete") raise ValueError("No episodes to delete")
@@ -132,7 +134,7 @@ def delete_episodes(
video_metadata = None video_metadata = None
if dataset.meta.video_keys: if dataset.meta.video_keys:
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping) video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder_config)
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
@@ -154,6 +156,7 @@ def split_dataset(
dataset: LeRobotDataset, dataset: LeRobotDataset,
splits: dict[str, float | list[int]], splits: dict[str, float | list[int]],
output_dir: str | Path | None = None, output_dir: str | Path | None = None,
camera_encoder_config: VideoEncoderConfig | None = None,
) -> dict[str, LeRobotDataset]: ) -> dict[str, LeRobotDataset]:
"""Split a LeRobotDataset into multiple smaller datasets. """Split a LeRobotDataset into multiple smaller datasets.
@@ -162,6 +165,7 @@ def split_dataset(
splits: Either a dict mapping split names to episode indices, or a dict mapping splits: Either a dict mapping split names to episode indices, or a dict mapping
split names to fractions (must sum to <= 1.0). split names to fractions (must sum to <= 1.0).
output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id.
camera_encoder_config: Video encoder settings used when re-encoding video segments. When ``None``, a default :class:`VideoEncoderConfig` is used.
Examples: Examples:
Split by specific episodes Split by specific episodes
@@ -222,7 +226,9 @@ def split_dataset(
video_metadata = None video_metadata = None
if dataset.meta.video_keys: if dataset.meta.video_keys:
video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping) video_metadata = _copy_and_reindex_videos(
dataset, new_meta, episode_mapping, camera_encoder_config
)
data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping)
@@ -578,8 +584,7 @@ def _keep_episodes_from_video_with_av(
output_path: Path, output_path: Path,
episodes_to_keep: list[tuple[int, int]], episodes_to_keep: list[tuple[int, int]],
fps: float, fps: float,
vcodec: str = "libsvtav1", camera_encoder_config: VideoEncoderConfig | None = None,
pix_fmt: str = "yuv420p",
) -> None: ) -> None:
"""Keep only specified episodes from a video file using PyAV. """Keep only specified episodes from a video file using PyAV.
@@ -593,9 +598,10 @@ def _keep_episodes_from_video_with_av(
Ranges are half-open intervals: [start_frame, end_frame), where start_frame Ranges are half-open intervals: [start_frame, end_frame), where start_frame
is inclusive and end_frame is exclusive. is inclusive and end_frame is exclusive.
fps: Frame rate of the video. fps: Frame rate of the video.
vcodec: Video codec to use for encoding. camera_encoder_config: Video encoder settings. When ``None``, a default :class:`VideoEncoderConfig` is used.
pix_fmt: Pixel format for output video.
""" """
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
from fractions import Fraction from fractions import Fraction
import av import av
@@ -619,12 +625,12 @@ def _keep_episodes_from_video_with_av(
# Convert fps to Fraction for PyAV compatibility. # Convert fps to Fraction for PyAV compatibility.
fps_fraction = Fraction(fps).limit_denominator(1000) fps_fraction = Fraction(fps).limit_denominator(1000)
v_out = out.add_stream(vcodec, rate=fps_fraction) v_out = out.add_stream(camera_encoder_config.vcodec, rate=fps_fraction)
# PyAV type stubs don't distinguish video streams from audio/subtitle streams. # PyAV type stubs don't distinguish video streams from audio/subtitle streams.
v_out.width = v_in.codec_context.width v_out.width = v_in.codec_context.width
v_out.height = v_in.codec_context.height v_out.height = v_in.codec_context.height
v_out.pix_fmt = pix_fmt v_out.pix_fmt = camera_encoder_config.pix_fmt
# Set time_base to match the frame rate for proper timestamp handling. # Set time_base to match the frame rate for proper timestamp handling.
v_out.time_base = Fraction(1, int(fps)) v_out.time_base = Fraction(1, int(fps))
@@ -687,8 +693,7 @@ def _copy_and_reindex_videos(
src_dataset: LeRobotDataset, src_dataset: LeRobotDataset,
dst_meta: LeRobotDatasetMetadata, dst_meta: LeRobotDatasetMetadata,
episode_mapping: dict[int, int], episode_mapping: dict[int, int],
vcodec: str = "libsvtav1", camera_encoder_config: VideoEncoderConfig | None = None,
pix_fmt: str = "yuv420p",
) -> dict[int, dict]: ) -> dict[int, dict]:
"""Copy and filter video files, only re-encoding files with deleted episodes. """Copy and filter video files, only re-encoding files with deleted episodes.
@@ -700,10 +705,13 @@ def _copy_and_reindex_videos(
src_dataset: Source dataset to copy from src_dataset: Source dataset to copy from
dst_meta: Destination metadata object dst_meta: Destination metadata object
episode_mapping: Mapping from old episode indices to new indices episode_mapping: Mapping from old episode indices to new indices
camera_encoder_config: Video encoder settings used when re-encoding segments. When ``None``, a default :class:`VideoEncoderConfig` is used.
Returns: Returns:
dict mapping episode index to its video metadata (chunk_index, file_index, timestamps) dict mapping episode index to its video metadata (chunk_index, file_index, timestamps)
""" """
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
if src_dataset.meta.episodes is None: if src_dataset.meta.episodes is None:
src_dataset.meta.episodes = load_episodes(src_dataset.meta.root) src_dataset.meta.episodes = load_episodes(src_dataset.meta.root)
@@ -792,8 +800,7 @@ def _copy_and_reindex_videos(
dst_video_path, dst_video_path,
episodes_to_keep_ranges, episodes_to_keep_ranges,
src_dataset.meta.fps, src_dataset.meta.fps,
vcodec, camera_encoder_config,
pix_fmt,
) )
cumulative_ts = 0.0 cumulative_ts = 0.0
@@ -1264,11 +1271,7 @@ def _estimate_frame_size_via_calibration(
episode_indices: list[int], episode_indices: list[int],
temp_dir: Path, temp_dir: Path,
fps: int, fps: int,
vcodec: str, camera_encoder_config: VideoEncoderConfig,
pix_fmt: str,
g: int,
crf: int,
fast_decode: int,
num_calibration_frames: int = 30, num_calibration_frames: int = 30,
) -> float: ) -> float:
"""Estimate MB per frame by encoding a small calibration sample. """Estimate MB per frame by encoding a small calibration sample.
@@ -1282,11 +1285,7 @@ def _estimate_frame_size_via_calibration(
episode_indices: List of episode indices being processed. episode_indices: List of episode indices being processed.
temp_dir: Temporary directory for calibration files. temp_dir: Temporary directory for calibration files.
fps: Frames per second for video encoding. fps: Frames per second for video encoding.
vcodec: Video codec (libsvtav1, h264, hevc). camera_encoder_config: Video encoder settings used for calibration encoding.
pix_fmt: Pixel format (yuv420p, etc.).
g: GOP size (group of pictures).
crf: Constant Rate Factor (quality).
fast_decode: Fast decode tuning parameter.
num_calibration_frames: Number of frames to use for calibration (default: 30). num_calibration_frames: Number of frames to use for calibration (default: 30).
Returns: Returns:
@@ -1322,11 +1321,7 @@ def _estimate_frame_size_via_calibration(
imgs_dir=calibration_dir, imgs_dir=calibration_dir,
video_path=calibration_video_path, video_path=calibration_video_path,
fps=fps, fps=fps,
vcodec=vcodec, camera_encoder_config=camera_encoder_config,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
overwrite=True, overwrite=True,
) )
@@ -1644,11 +1639,7 @@ def convert_image_to_video_dataset(
dataset: LeRobotDataset, dataset: LeRobotDataset,
output_dir: Path | None = None, output_dir: Path | None = None,
repo_id: str | None = None, repo_id: str | None = None,
vcodec: str = "libsvtav1", camera_encoder_config: VideoEncoderConfig | None = None,
pix_fmt: str = "yuv420p",
g: int = 2,
crf: int = 30,
fast_decode: int = 0,
episode_indices: list[int] | None = None, episode_indices: list[int] | None = None,
num_workers: int = 4, num_workers: int = 4,
max_episodes_per_batch: int | None = None, max_episodes_per_batch: int | None = None,
@@ -1663,11 +1654,7 @@ def convert_image_to_video_dataset(
dataset: The source LeRobot dataset with images dataset: The source LeRobot dataset with images
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
vcodec: Video codec (default: libsvtav1) camera_encoder_config: Video encoder settings. When ``None``, a default :class:`VideoEncoderConfig` is used.
pix_fmt: Pixel format (default: yuv420p)
g: Group of pictures size (default: 2)
crf: Constant rate factor (default: 30)
fast_decode: Fast decode tuning (default: 0)
episode_indices: List of episode indices to convert (None = all episodes) episode_indices: List of episode indices to convert (None = all episodes)
num_workers: Number of threads for parallel processing (default: 4) num_workers: Number of threads for parallel processing (default: 4)
max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit) max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
@@ -1676,6 +1663,9 @@ def convert_image_to_video_dataset(
Returns: Returns:
New LeRobotDataset with images encoded as videos New LeRobotDataset with images encoded as videos
""" """
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
# Check that it's an image dataset # Check that it's an image dataset
if len(dataset.meta.video_keys) > 0: if len(dataset.meta.video_keys) > 0:
raise ValueError( raise ValueError(
@@ -1699,7 +1689,10 @@ def convert_image_to_video_dataset(
logging.info( logging.info(
f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}" f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}"
) )
logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}") logging.info(
f"Video codec: {camera_encoder_config.vcodec}, pixel format: {camera_encoder_config.pix_fmt}, "
f"GOP: {camera_encoder_config.g}, CRF: {camera_encoder_config.crf}"
)
# Create new features dict, converting image features to video features # Create new features dict, converting image features to video features
new_features = {} new_features = {}
@@ -1769,11 +1762,7 @@ def convert_image_to_video_dataset(
episode_indices=episode_indices, episode_indices=episode_indices,
temp_dir=temp_dir, temp_dir=temp_dir,
fps=fps, fps=fps,
vcodec=vcodec, camera_encoder_config=camera_encoder_config,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
) )
logging.info(f"Processing camera: {img_key}") logging.info(f"Processing camera: {img_key}")
@@ -1815,11 +1804,7 @@ def convert_image_to_video_dataset(
imgs_dir=imgs_dir, imgs_dir=imgs_dir,
video_path=video_path, video_path=video_path,
fps=fps, fps=fps,
vcodec=vcodec, camera_encoder_config=camera_encoder_config,
pix_fmt=pix_fmt,
g=g,
crf=crf,
fast_decode=fast_decode,
overwrite=True, overwrite=True,
) )
+21 -9
View File
@@ -52,6 +52,7 @@ from .utils import (
) )
from .video_utils import ( from .video_utils import (
StreamingVideoEncoder, StreamingVideoEncoder,
VideoEncoderConfig,
concatenate_video_files, concatenate_video_files,
encode_video_frames, encode_video_frames,
get_video_duration_in_s, get_video_duration_in_s,
@@ -65,14 +66,19 @@ def _encode_video_worker(
episode_index: int, episode_index: int,
root: Path, root: Path,
fps: int, fps: int,
vcodec: str = "libsvtav1", camera_encoder_config: VideoEncoderConfig | None = None,
encoder_threads: int | None = None, encoder_threads: int | None = None,
) -> Path: ) -> Path:
temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4"
fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0) fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0)
img_dir = (root / fpath).parent img_dir = (root / fpath).parent
encode_video_frames( encode_video_frames(
img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads img_dir,
temp_path,
fps,
camera_encoder_config=camera_encoder_config,
encoder_threads=encoder_threads,
overwrite=True,
) )
shutil.rmtree(img_dir) shutil.rmtree(img_dir)
return temp_path return temp_path
@@ -89,20 +95,21 @@ class DatasetWriter:
self, self,
meta: LeRobotDatasetMetadata, meta: LeRobotDatasetMetadata,
root: Path, root: Path,
vcodec: str, camera_encoder_config: VideoEncoderConfig,
encoder_threads: int | None, encoder_threads: int | None,
batch_encoding_size: int, batch_encoding_size: int,
streaming_encoder: StreamingVideoEncoder | None = None, streaming_encoder: StreamingVideoEncoder | None = None,
initial_frames: int = 0, initial_frames: int = 0,
): ):
"""Initialize the writer with metadata, codec, and encoding config. """Initialize the writer with metadata, codec, and encoder config.
Args: Args:
meta: Dataset metadata instance (used for feature schema, chunk meta: Dataset metadata instance (used for feature schema, chunk
settings, and episode persistence). settings, and episode persistence).
root: Local dataset root directory. root: Local dataset root directory.
vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``). camera_encoder_config: Video encoder settings applied to all cameras.
encoder_threads: Threads per encoder instance. ``None`` for auto. encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
batch_encoding_size: Number of episodes to accumulate before batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos. batch-encoding videos.
streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder` streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder`
@@ -111,7 +118,7 @@ class DatasetWriter:
""" """
self._meta = meta self._meta = meta
self._root = root self._root = root
self._vcodec = vcodec self._camera_encoder_config = camera_encoder_config
self._encoder_threads = encoder_threads self._encoder_threads = encoder_threads
self._batch_encoding_size = batch_encoding_size self._batch_encoding_size = batch_encoding_size
self._streaming_encoder = streaming_encoder self._streaming_encoder = streaming_encoder
@@ -284,7 +291,7 @@ class DatasetWriter:
episode_index, episode_index,
self._root, self._root,
self._meta.fps, self._meta.fps,
self._vcodec, self._camera_encoder_config,
self._encoder_threads, self._encoder_threads,
): video_key ): video_key
for video_key in self._meta.video_keys for video_key in self._meta.video_keys
@@ -564,7 +571,12 @@ class DatasetWriter:
def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path: def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path:
"""Use ffmpeg to convert frames stored as png into mp4 videos.""" """Use ffmpeg to convert frames stored as png into mp4 videos."""
return _encode_video_worker( return _encode_video_worker(
video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads video_key,
episode_index,
self._root,
self._meta.fps,
self._camera_encoder_config,
self._encoder_threads,
) )
def close_writer(self) -> None: def close_writer(self) -> None:
+48 -40
View File
@@ -37,7 +37,7 @@ from .utils import (
from .video_utils import ( from .video_utils import (
StreamingVideoEncoder, StreamingVideoEncoder,
get_safe_default_video_backend, get_safe_default_video_backend,
resolve_vcodec, VideoEncoderConfig,
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -58,10 +58,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: str | None = None, video_backend: str | None = None,
return_uint8: bool = False, return_uint8: bool = False,
batch_encoding_size: int = 1, batch_encoding_size: int = 1,
vcodec: str = "libsvtav1", camera_encoder_config: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
streaming_encoding: bool = False, streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30, encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
): ):
""" """
2 modes are available for instantiating this class, depending on 2 different use cases: 2 modes are available for instantiating this class, depending on 2 different use cases:
@@ -177,16 +177,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision. You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos. batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1. Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc', camera_encoder_config (VideoEncoderConfig | None, optional): Video encoder settings for cameras
'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'. (codec, quality, etc.). When ``None``, a default
Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder. :class:`~lerobot.datasets.video_utils.VideoEncoderConfig` is used.
encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the
codec decide.
streaming_encoding (bool, optional): If True, encode video frames in real-time during capture streaming_encoding (bool, optional): If True, encode video frames in real-time during capture
instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False. instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False.
encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using
streaming encoding. Defaults to 30 (~1s at 30fps). streaming encoding. Defaults to 30 (~1s at 30fps).
encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the
codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for
libsvtav1 and 'threads' for h264/hevc.
Note: Note:
Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to
@@ -205,7 +204,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
self._video_backend = video_backend if video_backend else get_safe_default_video_backend() self._video_backend = video_backend if video_backend else get_safe_default_video_backend()
self._return_uint8 = return_uint8 self._return_uint8 = return_uint8
self._batch_encoding_size = batch_encoding_size self._batch_encoding_size = batch_encoding_size
self._vcodec = resolve_vcodec(vcodec) if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
self._camera_encoder_config = camera_encoder_config
self._encoder_threads = encoder_threads self._encoder_threads = encoder_threads
if self._requested_root is not None: if self._requested_root is not None:
@@ -251,13 +252,16 @@ class LeRobotDataset(torch.utils.data.Dataset):
streaming_enc = None streaming_enc = None
if streaming_encoding and len(self.meta.video_keys) > 0: if streaming_encoding and len(self.meta.video_keys) > 0:
streaming_enc = self._build_streaming_encoder( streaming_enc = self._build_streaming_encoder(
self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads self.meta.fps,
self._camera_encoder_config,
self._encoder_threads,
encoder_queue_maxsize,
) )
self.writer = DatasetWriter( self.writer = DatasetWriter(
meta=self.meta, meta=self.meta,
root=self.root, root=self.root,
vcodec=self._vcodec, camera_encoder_config=self._camera_encoder_config,
encoder_threads=encoder_threads, encoder_threads=self._encoder_threads,
batch_encoding_size=batch_encoding_size, batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc, streaming_encoder=streaming_enc,
initial_frames=self.meta.total_frames, initial_frames=self.meta.total_frames,
@@ -298,19 +302,15 @@ class LeRobotDataset(torch.utils.data.Dataset):
@staticmethod @staticmethod
def _build_streaming_encoder( def _build_streaming_encoder(
fps: int, fps: int,
vcodec: str, camera_encoder_config: VideoEncoderConfig,
encoder_queue_maxsize: int,
encoder_threads: int | None, encoder_threads: int | None,
encoder_queue_maxsize: int,
) -> StreamingVideoEncoder: ) -> StreamingVideoEncoder:
return StreamingVideoEncoder( return StreamingVideoEncoder(
fps=fps, fps=fps,
vcodec=vcodec, camera_encoder_config=camera_encoder_config,
pix_fmt="yuv420p",
g=2,
crf=30,
preset=None,
queue_maxsize=encoder_queue_maxsize,
encoder_threads=encoder_threads, encoder_threads=encoder_threads,
queue_maxsize=encoder_queue_maxsize,
) )
# ── Metadata properties ─────────────────────────────────────────── # ── Metadata properties ───────────────────────────────────────────
@@ -624,8 +624,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
image_writer_processes: int = 0, image_writer_processes: int = 0,
image_writer_threads: int = 0, image_writer_threads: int = 0,
video_backend: str | None = None, video_backend: str | None = None,
camera_encoder_config: VideoEncoderConfig | None = None,
batch_encoding_size: int = 1, batch_encoding_size: int = 1,
vcodec: str = "libsvtav1",
metadata_buffer_size: int = 10, metadata_buffer_size: int = 10,
streaming_encoding: bool = False, streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30, encoder_queue_maxsize: int = 30,
@@ -656,20 +656,23 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend (used when reading back). video_backend: Video decoding backend (used when reading back).
batch_encoding_size: Number of episodes to accumulate before batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos. ``1`` means encode immediately. batch-encoding videos. ``1`` means encode immediately.
vcodec: Video codec for encoding. Options include ``'libsvtav1'``, camera_encoder_config: Video encoder settings for cameras; defaults
``'h264'``, ``'hevc'``, ``'auto'``. match :class:`~lerobot.datasets.video_utils.VideoEncoderConfig`
when ``None``.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
metadata_buffer_size: Number of episode metadata records to buffer metadata_buffer_size: Number of episode metadata records to buffer
before flushing to parquet. before flushing to parquet.
streaming_encoding: If ``True``, encode video frames in real-time streaming_encoding: If ``True``, encode video frames in real-time
during capture instead of writing images first. during capture instead of writing images first.
encoder_queue_maxsize: Max buffered frames per camera when using encoder_queue_maxsize: Max buffered frames per camera when using
streaming encoding. streaming encoding.
encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns: Returns:
A new :class:`LeRobotDataset` in write mode. A new :class:`LeRobotDataset` in write mode.
""" """
vcodec = resolve_vcodec(vcodec) if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
obj = cls.__new__(cls) obj = cls.__new__(cls)
obj.meta = LeRobotDatasetMetadata.create( obj.meta = LeRobotDatasetMetadata.create(
repo_id=repo_id, repo_id=repo_id,
@@ -693,20 +696,21 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend() obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend()
obj._return_uint8 = False obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size obj._batch_encoding_size = batch_encoding_size
obj._vcodec = vcodec obj._camera_encoder_config = camera_encoder_config
obj._encoder_threads = encoder_threads obj._encoder_threads = encoder_threads
# Reader is lazily created on first access (write-only mode) # Reader is lazily created on first access (write-only mode)
obj.reader = None obj.reader = None
# Create writer
streaming_enc = None streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0: if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads) streaming_enc = cls._build_streaming_encoder(
fps, camera_encoder_config, encoder_threads, encoder_queue_maxsize
)
obj.writer = DatasetWriter( obj.writer = DatasetWriter(
meta=obj.meta, meta=obj.meta,
root=obj.root, root=obj.root,
vcodec=vcodec, camera_encoder_config=camera_encoder_config,
encoder_threads=encoder_threads, encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size, batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc, streaming_encoder=streaming_enc,
@@ -729,12 +733,12 @@ class LeRobotDataset(torch.utils.data.Dataset):
force_cache_sync: bool = False, force_cache_sync: bool = False,
video_backend: str | None = None, video_backend: str | None = None,
batch_encoding_size: int = 1, batch_encoding_size: int = 1,
vcodec: str = "libsvtav1", camera_encoder_config: VideoEncoderConfig | None = None,
encoder_threads: int | None = None,
image_writer_processes: int = 0, image_writer_processes: int = 0,
image_writer_threads: int = 0, image_writer_threads: int = 0,
streaming_encoding: bool = False, streaming_encoding: bool = False,
encoder_queue_maxsize: int = 30, encoder_queue_maxsize: int = 30,
encoder_threads: int | None = None,
) -> "LeRobotDataset": ) -> "LeRobotDataset":
"""Resume recording on an existing dataset. """Resume recording on an existing dataset.
@@ -757,13 +761,16 @@ class LeRobotDataset(torch.utils.data.Dataset):
video_backend: Video decoding backend for reading back data. video_backend: Video decoding backend for reading back data.
batch_encoding_size: Number of episodes to accumulate before batch_encoding_size: Number of episodes to accumulate before
batch-encoding videos. batch-encoding videos.
vcodec: Video codec for encoding. camera_encoder_config: Video encoder settings for cameras; defaults
match :class:`~lerobot.datasets.video_utils.VideoEncoderConfig`
when ``None``.
encoder_threads: Number of encoder threads (global). ``None``
lets the codec decide.
image_writer_processes: Subprocesses for async image writing. image_writer_processes: Subprocesses for async image writing.
image_writer_threads: Threads for async image writing. image_writer_threads: Threads for async image writing.
streaming_encoding: If ``True``, encode video in real-time during streaming_encoding: If ``True``, encode video in real-time during
capture. capture.
encoder_queue_maxsize: Max buffered frames per camera for streaming. encoder_queue_maxsize: Max buffered frames per camera for streaming.
encoder_threads: Threads per encoder instance. ``None`` for auto.
Returns: Returns:
A :class:`LeRobotDataset` in write mode, ready to append episodes. A :class:`LeRobotDataset` in write mode, ready to append episodes.
@@ -774,7 +781,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
"Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt " "Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt "
"the shared cache. Please provide a local directory path." "the shared cache. Please provide a local directory path."
) )
vcodec = resolve_vcodec(vcodec)
obj = cls.__new__(cls) obj = cls.__new__(cls)
obj.repo_id = repo_id obj.repo_id = repo_id
obj._requested_root = Path(root) obj._requested_root = Path(root)
@@ -786,8 +792,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj._video_backend = video_backend if video_backend else get_safe_default_video_backend() obj._video_backend = video_backend if video_backend else get_safe_default_video_backend()
obj._return_uint8 = False obj._return_uint8 = False
obj._batch_encoding_size = batch_encoding_size obj._batch_encoding_size = batch_encoding_size
obj._vcodec = vcodec
obj._encoder_threads = encoder_threads
if obj._requested_root is not None: if obj._requested_root is not None:
obj._requested_root.mkdir(exist_ok=True, parents=True) obj._requested_root.mkdir(exist_ok=True, parents=True)
@@ -796,21 +800,25 @@ class LeRobotDataset(torch.utils.data.Dataset):
obj.meta = LeRobotDatasetMetadata( obj.meta = LeRobotDatasetMetadata(
obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync
) )
if camera_encoder_config is None:
camera_encoder_config = VideoEncoderConfig()
obj._camera_encoder_config = camera_encoder_config
obj._encoder_threads = encoder_threads
obj.root = obj.meta.root obj.root = obj.meta.root
# Reader is lazily created on first access (write-only mode) # Reader is lazily created on first access (write-only mode)
obj.reader = None obj.reader = None
# Create writer for appending
streaming_enc = None streaming_enc = None
if streaming_encoding and len(obj.meta.video_keys) > 0: if streaming_encoding and len(obj.meta.video_keys) > 0:
streaming_enc = cls._build_streaming_encoder( streaming_enc = cls._build_streaming_encoder(
obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads obj.meta.fps, camera_encoder_config, encoder_threads, encoder_queue_maxsize
) )
obj.writer = DatasetWriter( obj.writer = DatasetWriter(
meta=obj.meta, meta=obj.meta,
root=obj.root, root=obj.root,
vcodec=vcodec, camera_encoder_config=camera_encoder_config,
encoder_threads=encoder_threads, encoder_threads=encoder_threads,
batch_encoding_size=batch_encoding_size, batch_encoding_size=batch_encoding_size,
streaming_encoder=streaming_enc, streaming_encoder=streaming_enc,
+26 -11
View File
@@ -49,6 +49,14 @@ Delete episodes and save to a new dataset at a specific path and with a new repo
--operation.type delete_episodes \ --operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]" --operation.episode_indices "[0, 2, 5]"
Delete episodes and re-encode video segments with h264:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type delete_episodes \
--operation.episode_indices "[0, 2, 5]" \
--operation.camera_encoder_config.vcodec h264 \
--operation.camera_encoder_config.crf 23
Split dataset by fractions (pusht_train, pusht_val): Split dataset by fractions (pusht_train, pusht_val):
lerobot-edit-dataset \ lerobot-edit-dataset \
--repo_id lerobot/pusht \ --repo_id lerobot/pusht \
@@ -74,6 +82,14 @@ Split into more than two splits:
--operation.type split \ --operation.type split \
--operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}' --operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}'
Split dataset and re-encode video segments with h264:
lerobot-edit-dataset \
--repo_id lerobot/pusht \
--operation.type split \
--operation.splits '{"train": 0.8, "val": 0.2}' \
--operation.camera_encoder_config.vcodec h264 \
--operation.camera_encoder_config.crf 23
Merge multiple datasets: Merge multiple datasets:
lerobot-edit-dataset \ lerobot-edit-dataset \
--new_repo_id lerobot/pusht_merged \ --new_repo_id lerobot/pusht_merged \
@@ -187,7 +203,7 @@ import abc
import logging import logging
import shutil import shutil
import sys import sys
from dataclasses import dataclass from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
import draccus import draccus
@@ -195,6 +211,8 @@ import draccus
from lerobot.configs import parser from lerobot.configs import parser
from lerobot.datasets import ( from lerobot.datasets import (
LeRobotDataset, LeRobotDataset,
VideoEncoderConfig,
camera_encoder_defaults,
convert_image_to_video_dataset, convert_image_to_video_dataset,
delete_episodes, delete_episodes,
merge_datasets, merge_datasets,
@@ -218,12 +236,14 @@ class OperationConfig(draccus.ChoiceRegistry, abc.ABC):
@dataclass @dataclass
class DeleteEpisodesConfig(OperationConfig): class DeleteEpisodesConfig(OperationConfig):
episode_indices: list[int] | None = None episode_indices: list[int] | None = None
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
@OperationConfig.register_subclass("split") @OperationConfig.register_subclass("split")
@dataclass @dataclass
class SplitConfig(OperationConfig): class SplitConfig(OperationConfig):
splits: dict[str, float | list[int]] | None = None splits: dict[str, float | list[int]] | None = None
camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
@OperationConfig.register_subclass("merge") @OperationConfig.register_subclass("merge")
@@ -250,11 +270,7 @@ class ModifyTasksConfig(OperationConfig):
@dataclass @dataclass
class ConvertImageToVideoConfig(OperationConfig): class ConvertImageToVideoConfig(OperationConfig):
output_dir: str | None = None output_dir: str | None = None
vcodec: str = "libsvtav1" camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
pix_fmt: str = "yuv420p"
g: int = 2
crf: int = 30
fast_decode: int = 0
episode_indices: list[int] | None = None episode_indices: list[int] | None = None
num_workers: int = 4 num_workers: int = 4
max_episodes_per_batch: int | None = None max_episodes_per_batch: int | None = None
@@ -356,6 +372,7 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None:
episode_indices=cfg.operation.episode_indices, episode_indices=cfg.operation.episode_indices,
output_dir=output_dir, output_dir=output_dir,
repo_id=output_repo_id, repo_id=output_repo_id,
camera_encoder_config=cfg.operation.camera_encoder_config,
) )
logging.info(f"Dataset saved to {output_dir}") logging.info(f"Dataset saved to {output_dir}")
@@ -387,6 +404,7 @@ def handle_split(cfg: EditDatasetConfig) -> None:
dataset, dataset,
splits=cfg.operation.splits, splits=cfg.operation.splits,
output_dir=cfg.new_root, output_dir=cfg.new_root,
camera_encoder_config=cfg.operation.camera_encoder_config,
) )
for split_name, split_ds in split_datasets.items(): for split_name, split_ds in split_datasets.items():
@@ -557,11 +575,8 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
dataset=dataset, dataset=dataset,
output_dir=output_dir, output_dir=output_dir,
repo_id=output_repo_id, repo_id=output_repo_id,
vcodec=getattr(cfg.operation, "vcodec", "libsvtav1"), camera_encoder_config=getattr(cfg.operation, "camera_encoder_config", None)
pix_fmt=getattr(cfg.operation, "pix_fmt", "yuv420p"), or camera_encoder_defaults(),
g=getattr(cfg.operation, "g", 2),
crf=getattr(cfg.operation, "crf", 30),
fast_decode=getattr(cfg.operation, "fast_decode", 0),
episode_indices=getattr(cfg.operation, "episode_indices", None), episode_indices=getattr(cfg.operation, "episode_indices", None),
num_workers=getattr(cfg.operation, "num_workers", 4), num_workers=getattr(cfg.operation, "num_workers", 4),
max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None), max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None),
+28 -5
View File
@@ -63,6 +63,27 @@ lerobot-record \\
--dataset.streaming_encoding=true \\ --dataset.streaming_encoding=true \\
--dataset.encoder_threads=2 --dataset.encoder_threads=2
``` ```
Example recording with custom video encoding parameters:
```shell
lerobot-record \\
--robot.type=so100_follower \\
--robot.port=/dev/tty.usbmodem58760431541 \\
--robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \\
--robot.id=black \\
--teleop.type=so100_leader \\
--teleop.port=/dev/tty.usbmodem58760431551 \\
--teleop.id=blue \\
--dataset.repo_id=<my_username>/<my_dataset_name> \\
--dataset.num_episodes=2 \\
--dataset.single_task="Grab the cube" \\
--dataset.streaming_encoding=true \\
--dataset.encoder_threads=2 \\
--dataset.camera_encoder_config.vcodec=h264 \\
--dataset.camera_encoder_config.preset=fast \\
--dataset.camera_encoder_config.extra_options='{"tune": "film", "profile:v": "high", "bf": 2}' \\
--display_data=true
```
""" """
import logging import logging
@@ -84,8 +105,10 @@ from lerobot.configs import parser
from lerobot.configs.dataset import DatasetRecordConfig from lerobot.configs.dataset import DatasetRecordConfig
from lerobot.datasets import ( from lerobot.datasets import (
LeRobotDataset, LeRobotDataset,
VideoEncoderConfig,
VideoEncodingManager, VideoEncodingManager,
aggregate_pipeline_dataset_features, aggregate_pipeline_dataset_features,
camera_encoder_defaults,
create_initial_features, create_initial_features,
safe_stop_image_writer, safe_stop_image_writer,
) )
@@ -377,10 +400,10 @@ def record(
cfg.dataset.repo_id, cfg.dataset.repo_id,
root=cfg.dataset.root, root=cfg.dataset.root,
batch_encoding_size=cfg.dataset.video_encoding_batch_size, batch_encoding_size=cfg.dataset.video_encoding_batch_size,
vcodec=cfg.dataset.vcodec, camera_encoder_config=cfg.dataset.camera_encoder_config,
encoder_threads=cfg.dataset.encoder_threads,
streaming_encoding=cfg.dataset.streaming_encoding, streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,
image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0, image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras
if num_cameras > 0 if num_cameras > 0
@@ -406,10 +429,10 @@ def record(
image_writer_processes=cfg.dataset.num_image_writer_processes, image_writer_processes=cfg.dataset.num_image_writer_processes,
image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras), image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras),
batch_encoding_size=cfg.dataset.video_encoding_batch_size, batch_encoding_size=cfg.dataset.video_encoding_batch_size,
vcodec=cfg.dataset.vcodec, camera_encoder_config=cfg.dataset.camera_encoder_config,
encoder_threads=cfg.dataset.encoder_threads,
streaming_encoding=cfg.dataset.streaming_encoding, streaming_encoding=cfg.dataset.streaming_encoding,
encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize,
encoder_threads=cfg.dataset.encoder_threads,
) )
robot.connect() robot.connect()
@@ -420,7 +443,7 @@ def record(
if not cfg.dataset.streaming_encoding: if not cfg.dataset.streaming_encoding:
logging.info( logging.info(
"Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder_config.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding"
) )
with VideoEncodingManager(dataset): with VideoEncodingManager(dataset):