diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py index 064a84b48..4ec766735 100644 --- a/benchmarks/video/run_video_benchmark.py +++ b/benchmarks/video/run_video_benchmark.py @@ -39,6 +39,7 @@ from tqdm import tqdm from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.video_utils import ( + VideoEncoderConfig, decode_video_frames, encode_video_frames, ) @@ -251,10 +252,13 @@ def benchmark_encoding_decoding( imgs_dir=imgs_dir, video_path=video_path, fps=fps, - vcodec=encoding_cfg["vcodec"], - pix_fmt=encoding_cfg["pix_fmt"], - g=encoding_cfg.get("g"), - crf=encoding_cfg.get("crf"), + camera_encoder_config=VideoEncoderConfig( + vcodec=encoding_cfg["vcodec"], + pix_fmt=encoding_cfg["pix_fmt"], + g=encoding_cfg.get("g"), + crf=encoding_cfg.get("crf"), + preset=encoding_cfg.get("preset"), + ), # fast_decode=encoding_cfg.get("fastdecode"), overwrite=True, ) diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index 46dd9bff2..dd162f150 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -62,7 +62,7 @@ from .utils import ( DEFAULT_EPISODES_PATH, update_chunk_file_indices, ) -from .video_utils import encode_video_frames, get_video_info +from .video_utils import VideoEncoderConfig, encode_video_frames, get_video_info def _load_episode_with_stats(src_dataset: LeRobotDataset, episode_idx: int) -> dict: @@ -92,6 +92,7 @@ def delete_episodes( episode_indices: list[int], output_dir: str | Path | None = None, repo_id: str | None = None, + camera_encoder_config: VideoEncoderConfig | None = None, ) -> LeRobotDataset: """Delete episodes from a LeRobotDataset and create a new dataset. @@ -100,6 +101,7 @@ def delete_episodes( episode_indices: List of episode indices to delete. output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. 
Equivalent to new_root in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. + camera_encoder_config: Video encoder settings used when re-encoding video segments (default: :class:`VideoEncoderConfig()`). """ if not episode_indices: raise ValueError("No episodes to delete") @@ -132,7 +134,7 @@ def delete_episodes( video_metadata = None if dataset.meta.video_keys: - video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping) + video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping, camera_encoder_config) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) @@ -154,6 +156,7 @@ def split_dataset( dataset: LeRobotDataset, splits: dict[str, float | list[int]], output_dir: str | Path | None = None, + camera_encoder_config: VideoEncoderConfig | None = None, ) -> dict[str, LeRobotDataset]: """Split a LeRobotDataset into multiple smaller datasets. @@ -162,6 +165,7 @@ def split_dataset( splits: Either a dict mapping split names to episode indices, or a dict mapping split names to fractions (must sum to <= 1.0). output_dir: Root directory where the split datasets will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. + camera_encoder_config: Video encoder settings used when re-encoding video segments (default: :class:`VideoEncoderConfig()`). 
Examples: Split by specific episodes @@ -222,7 +226,9 @@ def split_dataset( video_metadata = None if dataset.meta.video_keys: - video_metadata = _copy_and_reindex_videos(dataset, new_meta, episode_mapping) + video_metadata = _copy_and_reindex_videos( + dataset, new_meta, episode_mapping, camera_encoder_config + ) data_metadata = _copy_and_reindex_data(dataset, new_meta, episode_mapping) @@ -578,8 +584,7 @@ def _keep_episodes_from_video_with_av( output_path: Path, episodes_to_keep: list[tuple[int, int]], fps: float, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", + camera_encoder_config: VideoEncoderConfig | None = None, ) -> None: """Keep only specified episodes from a video file using PyAV. @@ -593,9 +598,10 @@ def _keep_episodes_from_video_with_av( Ranges are half-open intervals: [start_frame, end_frame), where start_frame is inclusive and end_frame is exclusive. fps: Frame rate of the video. - vcodec: Video codec to use for encoding. - pix_fmt: Pixel format for output video. + camera_encoder_config: Video encoder settings (default: :class:`VideoEncoderConfig()`). """ + if camera_encoder_config is None: + camera_encoder_config = VideoEncoderConfig() from fractions import Fraction import av @@ -619,12 +625,12 @@ def _keep_episodes_from_video_with_av( # Convert fps to Fraction for PyAV compatibility. fps_fraction = Fraction(fps).limit_denominator(1000) - v_out = out.add_stream(vcodec, rate=fps_fraction) + v_out = out.add_stream(camera_encoder_config.vcodec, rate=fps_fraction) # PyAV type stubs don't distinguish video streams from audio/subtitle streams. v_out.width = v_in.codec_context.width v_out.height = v_in.codec_context.height - v_out.pix_fmt = pix_fmt + v_out.pix_fmt = camera_encoder_config.pix_fmt # Set time_base to match the frame rate for proper timestamp handling. 
v_out.time_base = Fraction(1, int(fps)) @@ -687,8 +693,7 @@ def _copy_and_reindex_videos( src_dataset: LeRobotDataset, dst_meta: LeRobotDatasetMetadata, episode_mapping: dict[int, int], - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", + camera_encoder_config: VideoEncoderConfig | None = None, ) -> dict[int, dict]: """Copy and filter video files, only re-encoding files with deleted episodes. @@ -700,10 +705,13 @@ def _copy_and_reindex_videos( src_dataset: Source dataset to copy from dst_meta: Destination metadata object episode_mapping: Mapping from old episode indices to new indices + camera_encoder_config: Video encoder settings used when re-encoding segments (default: :class:`VideoEncoderConfig()`). Returns: dict mapping episode index to its video metadata (chunk_index, file_index, timestamps) """ + if camera_encoder_config is None: + camera_encoder_config = VideoEncoderConfig() if src_dataset.meta.episodes is None: src_dataset.meta.episodes = load_episodes(src_dataset.meta.root) @@ -792,8 +800,7 @@ def _copy_and_reindex_videos( dst_video_path, episodes_to_keep_ranges, src_dataset.meta.fps, - vcodec, - pix_fmt, + camera_encoder_config, ) cumulative_ts = 0.0 @@ -1264,11 +1271,7 @@ def _estimate_frame_size_via_calibration( episode_indices: list[int], temp_dir: Path, fps: int, - vcodec: str, - pix_fmt: str, - g: int, - crf: int, - fast_decode: int, + camera_encoder_config: VideoEncoderConfig, num_calibration_frames: int = 30, ) -> float: """Estimate MB per frame by encoding a small calibration sample. @@ -1282,11 +1285,7 @@ def _estimate_frame_size_via_calibration( episode_indices: List of episode indices being processed. temp_dir: Temporary directory for calibration files. fps: Frames per second for video encoding. - vcodec: Video codec (libsvtav1, h264, hevc). - pix_fmt: Pixel format (yuv420p, etc.). - g: GOP size (group of pictures). - crf: Constant Rate Factor (quality). - fast_decode: Fast decode tuning parameter. 
+ camera_encoder_config: Video encoder settings used for calibration encoding. num_calibration_frames: Number of frames to use for calibration (default: 30). Returns: @@ -1322,11 +1321,7 @@ def _estimate_frame_size_via_calibration( imgs_dir=calibration_dir, video_path=calibration_video_path, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder_config=camera_encoder_config, overwrite=True, ) @@ -1644,11 +1639,7 @@ def convert_image_to_video_dataset( dataset: LeRobotDataset, output_dir: Path | None = None, repo_id: str | None = None, - vcodec: str = "libsvtav1", - pix_fmt: str = "yuv420p", - g: int = 2, - crf: int = 30, - fast_decode: int = 0, + camera_encoder_config: VideoEncoderConfig | None = None, episode_indices: list[int] | None = None, num_workers: int = 4, max_episodes_per_batch: int | None = None, @@ -1663,11 +1654,7 @@ def convert_image_to_video_dataset( dataset: The source LeRobot dataset with images output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. - vcodec: Video codec (default: libsvtav1) - pix_fmt: Pixel format (default: yuv420p) - g: Group of pictures size (default: 2) - crf: Constant rate factor (default: 30) - fast_decode: Fast decode tuning (default: 0) + camera_encoder_config: Video encoder settings (default: :class:`VideoEncoderConfig()`). 
episode_indices: List of episode indices to convert (None = all episodes) num_workers: Number of threads for parallel processing (default: 4) max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit) @@ -1676,6 +1663,9 @@ def convert_image_to_video_dataset( Returns: New LeRobotDataset with images encoded as videos """ + if camera_encoder_config is None: + camera_encoder_config = VideoEncoderConfig() + # Check that it's an image dataset if len(dataset.meta.video_keys) > 0: raise ValueError( @@ -1699,7 +1689,10 @@ def convert_image_to_video_dataset( logging.info( f"Converting {len(episode_indices)} episodes with {len(img_keys)} cameras from {dataset.repo_id}" ) - logging.info(f"Video codec: {vcodec}, pixel format: {pix_fmt}, GOP: {g}, CRF: {crf}") + logging.info( + f"Video codec: {camera_encoder_config.vcodec}, pixel format: {camera_encoder_config.pix_fmt}, " + f"GOP: {camera_encoder_config.g}, CRF: {camera_encoder_config.crf}" + ) # Create new features dict, converting image features to video features new_features = {} @@ -1769,11 +1762,7 @@ def convert_image_to_video_dataset( episode_indices=episode_indices, temp_dir=temp_dir, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder_config=camera_encoder_config, ) logging.info(f"Processing camera: {img_key}") @@ -1815,11 +1804,7 @@ def convert_image_to_video_dataset( imgs_dir=imgs_dir, video_path=video_path, fps=fps, - vcodec=vcodec, - pix_fmt=pix_fmt, - g=g, - crf=crf, - fast_decode=fast_decode, + camera_encoder_config=camera_encoder_config, overwrite=True, ) diff --git a/src/lerobot/datasets/dataset_writer.py b/src/lerobot/datasets/dataset_writer.py index cf306a86a..060e7bc48 100644 --- a/src/lerobot/datasets/dataset_writer.py +++ b/src/lerobot/datasets/dataset_writer.py @@ -52,6 +52,7 @@ from .utils import ( ) from .video_utils import ( StreamingVideoEncoder, + VideoEncoderConfig, concatenate_video_files, 
encode_video_frames, get_video_duration_in_s, @@ -65,14 +66,19 @@ def _encode_video_worker( episode_index: int, root: Path, fps: int, - vcodec: str = "libsvtav1", + camera_encoder_config: VideoEncoderConfig | None = None, encoder_threads: int | None = None, ) -> Path: temp_path = Path(tempfile.mkdtemp(dir=root)) / f"{video_key}_{episode_index:03d}.mp4" fpath = DEFAULT_IMAGE_PATH.format(image_key=video_key, episode_index=episode_index, frame_index=0) img_dir = (root / fpath).parent encode_video_frames( - img_dir, temp_path, fps, vcodec=vcodec, overwrite=True, encoder_threads=encoder_threads + img_dir, + temp_path, + fps, + camera_encoder_config=camera_encoder_config, + encoder_threads=encoder_threads, + overwrite=True, ) shutil.rmtree(img_dir) return temp_path @@ -89,20 +95,21 @@ class DatasetWriter: self, meta: LeRobotDatasetMetadata, root: Path, - vcodec: str, + camera_encoder_config: VideoEncoderConfig, encoder_threads: int | None, batch_encoding_size: int, streaming_encoder: StreamingVideoEncoder | None = None, initial_frames: int = 0, ): - """Initialize the writer with metadata, codec, and encoding config. + """Initialize the writer with metadata, codec, and encoder config. Args: meta: Dataset metadata instance (used for feature schema, chunk settings, and episode persistence). root: Local dataset root directory. - vcodec: Video codec for encoding (e.g. ``'libsvtav1'``, ``'h264'``). - encoder_threads: Threads per encoder instance. ``None`` for auto. + camera_encoder_config: Video encoder settings applied to all cameras. + encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. 
streaming_encoder: Optional pre-built :class:`StreamingVideoEncoder` @@ -111,7 +118,7 @@ class DatasetWriter: """ self._meta = meta self._root = root - self._vcodec = vcodec + self._camera_encoder_config = camera_encoder_config self._encoder_threads = encoder_threads self._batch_encoding_size = batch_encoding_size self._streaming_encoder = streaming_encoder @@ -284,7 +291,7 @@ class DatasetWriter: episode_index, self._root, self._meta.fps, - self._vcodec, + self._camera_encoder_config, self._encoder_threads, ): video_key for video_key in self._meta.video_keys @@ -564,7 +571,12 @@ class DatasetWriter: def _encode_temporary_episode_video(self, video_key: str, episode_index: int) -> Path: """Use ffmpeg to convert frames stored as png into mp4 videos.""" return _encode_video_worker( - video_key, episode_index, self._root, self._meta.fps, self._vcodec, self._encoder_threads + video_key, + episode_index, + self._root, + self._meta.fps, + self._camera_encoder_config, + self._encoder_threads, ) def close_writer(self) -> None: diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 077e1efd1..79efa330b 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -37,7 +37,7 @@ from .utils import ( from .video_utils import ( StreamingVideoEncoder, get_safe_default_video_backend, - resolve_vcodec, + VideoEncoderConfig, ) logger = logging.getLogger(__name__) @@ -58,10 +58,10 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: str | None = None, return_uint8: bool = False, batch_encoding_size: int = 1, - vcodec: str = "libsvtav1", + camera_encoder_config: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, - encoder_threads: int | None = None, ): """ 2 modes are available for instantiating this class, depending on 2 different use cases: @@ -177,16 +177,15 @@ class 
LeRobotDataset(torch.utils.data.Dataset): You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision. batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos. Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1. - vcodec (str, optional): Video codec for encoding videos during recording. Options: 'h264', 'hevc', - 'libsvtav1', 'auto', or hardware-specific codecs like 'h264_videotoolbox', 'h264_nvenc'. - Defaults to 'libsvtav1'. Use 'auto' to auto-detect the best available hardware encoder. + camera_encoder_config (VideoEncoderConfig | None, optional): Video encoder settings for cameras + (codec, quality, etc.). Defaults to + :class:`~lerobot.datasets.video_utils.VideoEncoderConfig` defaults when ``None``. + encoder_threads (int | None, optional): Number of encoder threads (global). ``None`` lets the + codec decide. streaming_encoding (bool, optional): If True, encode video frames in real-time during capture instead of writing PNG images first. This makes save_episode() near-instant. Defaults to False. encoder_queue_maxsize (int, optional): Maximum number of frames to buffer per camera when using streaming encoding. Defaults to 30 (~1s at 30fps). - encoder_threads (int | None, optional): Number of threads per encoder instance. None lets the - codec auto-detect (default). Lower values reduce CPU usage per encoder. Maps to 'lp' (via svtav1-params) for - libsvtav1 and 'threads' for h264/hevc. 
Note: Write-mode parameters (``streaming_encoding``, ``batch_encoding_size``) passed to @@ -205,7 +204,9 @@ class LeRobotDataset(torch.utils.data.Dataset): self._video_backend = video_backend if video_backend else get_safe_default_video_backend() self._return_uint8 = return_uint8 self._batch_encoding_size = batch_encoding_size - self._vcodec = resolve_vcodec(vcodec) + if camera_encoder_config is None: + camera_encoder_config = VideoEncoderConfig() + self._camera_encoder_config = camera_encoder_config self._encoder_threads = encoder_threads if self._requested_root is not None: @@ -251,13 +252,16 @@ class LeRobotDataset(torch.utils.data.Dataset): streaming_enc = None if streaming_encoding and len(self.meta.video_keys) > 0: streaming_enc = self._build_streaming_encoder( - self.meta.fps, self._vcodec, encoder_queue_maxsize, encoder_threads + self.meta.fps, + self._camera_encoder_config, + self._encoder_threads, + encoder_queue_maxsize, ) self.writer = DatasetWriter( meta=self.meta, root=self.root, - vcodec=self._vcodec, - encoder_threads=encoder_threads, + camera_encoder_config=self._camera_encoder_config, + encoder_threads=self._encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, initial_frames=self.meta.total_frames, @@ -298,19 +302,15 @@ class LeRobotDataset(torch.utils.data.Dataset): @staticmethod def _build_streaming_encoder( fps: int, - vcodec: str, - encoder_queue_maxsize: int, + camera_encoder_config: VideoEncoderConfig, encoder_threads: int | None, + encoder_queue_maxsize: int, ) -> StreamingVideoEncoder: return StreamingVideoEncoder( fps=fps, - vcodec=vcodec, - pix_fmt="yuv420p", - g=2, - crf=30, - preset=None, - queue_maxsize=encoder_queue_maxsize, + camera_encoder_config=camera_encoder_config, encoder_threads=encoder_threads, + queue_maxsize=encoder_queue_maxsize, ) # ── Metadata properties ─────────────────────────────────────────── @@ -624,8 +624,8 @@ class LeRobotDataset(torch.utils.data.Dataset): 
image_writer_processes: int = 0, image_writer_threads: int = 0, video_backend: str | None = None, + camera_encoder_config: VideoEncoderConfig | None = None, batch_encoding_size: int = 1, - vcodec: str = "libsvtav1", metadata_buffer_size: int = 10, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, @@ -656,20 +656,23 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend (used when reading back). batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. ``1`` means encode immediately. - vcodec: Video codec for encoding. Options include ``'libsvtav1'``, - ``'h264'``, ``'hevc'``, ``'auto'``. + camera_encoder_config: Video encoder settings for cameras; defaults + match :class:`~lerobot.datasets.video_utils.VideoEncoderConfig` + when ``None``. + encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. metadata_buffer_size: Number of episode metadata records to buffer before flushing to parquet. streaming_encoding: If ``True``, encode video frames in real-time during capture instead of writing images first. encoder_queue_maxsize: Max buffered frames per camera when using streaming encoding. - encoder_threads: Threads per encoder instance. ``None`` for auto. Returns: A new :class:`LeRobotDataset` in write mode. 
""" - vcodec = resolve_vcodec(vcodec) + if camera_encoder_config is None: + camera_encoder_config = VideoEncoderConfig() obj = cls.__new__(cls) obj.meta = LeRobotDatasetMetadata.create( repo_id=repo_id, @@ -693,20 +696,21 @@ class LeRobotDataset(torch.utils.data.Dataset): obj._video_backend = video_backend if video_backend is not None else get_safe_default_video_backend() obj._return_uint8 = False obj._batch_encoding_size = batch_encoding_size - obj._vcodec = vcodec + obj._camera_encoder_config = camera_encoder_config obj._encoder_threads = encoder_threads # Reader is lazily created on first access (write-only mode) obj.reader = None - # Create writer streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: - streaming_enc = cls._build_streaming_encoder(fps, vcodec, encoder_queue_maxsize, encoder_threads) + streaming_enc = cls._build_streaming_encoder( + fps, camera_encoder_config, encoder_threads, encoder_queue_maxsize + ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - vcodec=vcodec, + camera_encoder_config=camera_encoder_config, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, @@ -729,12 +733,12 @@ class LeRobotDataset(torch.utils.data.Dataset): force_cache_sync: bool = False, video_backend: str | None = None, batch_encoding_size: int = 1, - vcodec: str = "libsvtav1", + camera_encoder_config: VideoEncoderConfig | None = None, + encoder_threads: int | None = None, image_writer_processes: int = 0, image_writer_threads: int = 0, streaming_encoding: bool = False, encoder_queue_maxsize: int = 30, - encoder_threads: int | None = None, ) -> "LeRobotDataset": """Resume recording on an existing dataset. @@ -757,13 +761,16 @@ class LeRobotDataset(torch.utils.data.Dataset): video_backend: Video decoding backend for reading back data. batch_encoding_size: Number of episodes to accumulate before batch-encoding videos. - vcodec: Video codec for encoding. 
+ camera_encoder_config: Video encoder settings for cameras; defaults + match :class:`~lerobot.datasets.video_utils.VideoEncoderConfig` + when ``None``. + encoder_threads: Number of encoder threads (global). ``None`` + lets the codec decide. image_writer_processes: Subprocesses for async image writing. image_writer_threads: Threads for async image writing. streaming_encoding: If ``True``, encode video in real-time during capture. encoder_queue_maxsize: Max buffered frames per camera for streaming. - encoder_threads: Threads per encoder instance. ``None`` for auto. Returns: A :class:`LeRobotDataset` in write mode, ready to append episodes. @@ -774,7 +781,6 @@ class LeRobotDataset(torch.utils.data.Dataset): "Writing into the revision-safe Hub snapshot cache (used when root=None) would corrupt " "the shared cache. Please provide a local directory path." ) - vcodec = resolve_vcodec(vcodec) obj = cls.__new__(cls) obj.repo_id = repo_id obj._requested_root = Path(root) @@ -786,8 +792,6 @@ class LeRobotDataset(torch.utils.data.Dataset): obj._video_backend = video_backend if video_backend else get_safe_default_video_backend() obj._return_uint8 = False obj._batch_encoding_size = batch_encoding_size - obj._vcodec = vcodec - obj._encoder_threads = encoder_threads if obj._requested_root is not None: obj._requested_root.mkdir(exist_ok=True, parents=True) @@ -796,21 +800,25 @@ class LeRobotDataset(torch.utils.data.Dataset): obj.meta = LeRobotDatasetMetadata( obj.repo_id, obj._requested_root, obj.revision, force_cache_sync=force_cache_sync ) + + if camera_encoder_config is None: + camera_encoder_config = VideoEncoderConfig() + obj._camera_encoder_config = camera_encoder_config + obj._encoder_threads = encoder_threads obj.root = obj.meta.root # Reader is lazily created on first access (write-only mode) obj.reader = None - # Create writer for appending streaming_enc = None if streaming_encoding and len(obj.meta.video_keys) > 0: streaming_enc = cls._build_streaming_encoder( - 
obj.meta.fps, vcodec, encoder_queue_maxsize, encoder_threads + obj.meta.fps, camera_encoder_config, encoder_threads, encoder_queue_maxsize ) obj.writer = DatasetWriter( meta=obj.meta, root=obj.root, - vcodec=vcodec, + camera_encoder_config=camera_encoder_config, encoder_threads=encoder_threads, batch_encoding_size=batch_encoding_size, streaming_encoder=streaming_enc, diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index a708d37a3..adf3abf6b 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -49,6 +49,14 @@ Delete episodes and save to a new dataset at a specific path and with a new repo --operation.type delete_episodes \ --operation.episode_indices "[0, 2, 5]" +Delete episodes and re-encode video segments with h264: + lerobot-edit-dataset \ + --repo_id lerobot/pusht \ + --operation.type delete_episodes \ + --operation.episode_indices "[0, 2, 5]" \ + --operation.camera_encoder_config.vcodec h264 \ + --operation.camera_encoder_config.crf 23 + Split dataset by fractions (pusht_train, pusht_val): lerobot-edit-dataset \ --repo_id lerobot/pusht \ @@ -74,6 +82,14 @@ Split into more than two splits: --operation.type split \ --operation.splits '{"train": 0.6, "val": 0.2, "test": 0.2}' +Split dataset and re-encode video segments with h264: + lerobot-edit-dataset \ + --repo_id lerobot/pusht \ + --operation.type split \ + --operation.splits '{"train": 0.8, "val": 0.2}' \ + --operation.camera_encoder_config.vcodec h264 \ + --operation.camera_encoder_config.crf 23 + Merge multiple datasets: lerobot-edit-dataset \ --new_repo_id lerobot/pusht_merged \ @@ -187,7 +203,7 @@ import abc import logging import shutil import sys -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path import draccus @@ -195,6 +211,8 @@ import draccus from lerobot.configs import parser from lerobot.datasets import ( LeRobotDataset, + 
VideoEncoderConfig, + camera_encoder_defaults, convert_image_to_video_dataset, delete_episodes, merge_datasets, @@ -218,12 +236,14 @@ class OperationConfig(draccus.ChoiceRegistry, abc.ABC): @dataclass class DeleteEpisodesConfig(OperationConfig): episode_indices: list[int] | None = None + camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) @OperationConfig.register_subclass("split") @dataclass class SplitConfig(OperationConfig): splits: dict[str, float | list[int]] | None = None + camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) @OperationConfig.register_subclass("merge") @@ -250,11 +270,7 @@ class ModifyTasksConfig(OperationConfig): @dataclass class ConvertImageToVideoConfig(OperationConfig): output_dir: str | None = None - vcodec: str = "libsvtav1" - pix_fmt: str = "yuv420p" - g: int = 2 - crf: int = 30 - fast_decode: int = 0 + camera_encoder_config: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) episode_indices: list[int] | None = None num_workers: int = 4 max_episodes_per_batch: int | None = None @@ -356,6 +372,7 @@ def handle_delete_episodes(cfg: EditDatasetConfig) -> None: episode_indices=cfg.operation.episode_indices, output_dir=output_dir, repo_id=output_repo_id, + camera_encoder_config=cfg.operation.camera_encoder_config, ) logging.info(f"Dataset saved to {output_dir}") @@ -387,6 +404,7 @@ def handle_split(cfg: EditDatasetConfig) -> None: dataset, splits=cfg.operation.splits, output_dir=cfg.new_root, + camera_encoder_config=cfg.operation.camera_encoder_config, ) for split_name, split_ds in split_datasets.items(): @@ -557,11 +575,8 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: dataset=dataset, output_dir=output_dir, repo_id=output_repo_id, - vcodec=getattr(cfg.operation, "vcodec", "libsvtav1"), - pix_fmt=getattr(cfg.operation, "pix_fmt", "yuv420p"), - g=getattr(cfg.operation, "g", 2), - crf=getattr(cfg.operation, "crf", 30), - 
fast_decode=getattr(cfg.operation, "fast_decode", 0), + camera_encoder_config=getattr(cfg.operation, "camera_encoder_config", None) + or camera_encoder_defaults(), episode_indices=getattr(cfg.operation, "episode_indices", None), num_workers=getattr(cfg.operation, "num_workers", 4), max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None), diff --git a/src/lerobot/scripts/lerobot_record.py b/src/lerobot/scripts/lerobot_record.py index 129696bd3..94ee2a0d9 100644 --- a/src/lerobot/scripts/lerobot_record.py +++ b/src/lerobot/scripts/lerobot_record.py @@ -63,6 +63,27 @@ lerobot-record \\ --dataset.streaming_encoding=true \\ --dataset.encoder_threads=2 ``` + +Example recording with custom video encoding parameters: +```shell +lerobot-record \\ + --robot.type=so100_follower \\ + --robot.port=/dev/tty.usbmodem58760431541 \\ + --robot.cameras="{laptop: {type: opencv, index_or_path: 0, width: 640, height: 480, fps: 30}}" \\ + --robot.id=black \\ + --teleop.type=so100_leader \\ + --teleop.port=/dev/tty.usbmodem58760431551 \\ + --teleop.id=blue \\ + --dataset.repo_id=/ \\ + --dataset.num_episodes=2 \\ + --dataset.single_task="Grab the cube" \\ + --dataset.streaming_encoding=true \\ + --dataset.encoder_threads=2 \\ + --dataset.camera_encoder_config.vcodec=h264 \\ + --dataset.camera_encoder_config.preset=fast \\ + --dataset.camera_encoder_config.extra_options='{"tune": "film", "profile:v": "high", "bf": 2}' \\ + --display_data=true +``` """ import logging @@ -84,8 +105,10 @@ from lerobot.configs import parser from lerobot.configs.dataset import DatasetRecordConfig from lerobot.datasets import ( LeRobotDataset, + VideoEncoderConfig, VideoEncodingManager, aggregate_pipeline_dataset_features, + camera_encoder_defaults, create_initial_features, safe_stop_image_writer, ) @@ -377,10 +400,10 @@ def record( cfg.dataset.repo_id, root=cfg.dataset.root, batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, +
camera_encoder_config=cfg.dataset.camera_encoder_config, + encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, - encoder_threads=cfg.dataset.encoder_threads, image_writer_processes=cfg.dataset.num_image_writer_processes if num_cameras > 0 else 0, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * num_cameras if num_cameras > 0 @@ -406,10 +429,10 @@ def record( image_writer_processes=cfg.dataset.num_image_writer_processes, image_writer_threads=cfg.dataset.num_image_writer_threads_per_camera * len(robot.cameras), batch_encoding_size=cfg.dataset.video_encoding_batch_size, - vcodec=cfg.dataset.vcodec, + camera_encoder_config=cfg.dataset.camera_encoder_config, + encoder_threads=cfg.dataset.encoder_threads, streaming_encoding=cfg.dataset.streaming_encoding, encoder_queue_maxsize=cfg.dataset.encoder_queue_maxsize, - encoder_threads=cfg.dataset.encoder_threads, ) robot.connect() @@ -420,7 +443,7 @@ def record( if not cfg.dataset.streaming_encoding: logging.info( - "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" + "Streaming encoding is disabled. If you have capable hardware, consider enabling it for way faster episode saving. --dataset.streaming_encoding=true --dataset.encoder_threads=2 # --dataset.camera_encoder_config.vcodec=auto. More info in the documentation: https://huggingface.co/docs/lerobot/streaming_video_encoding" ) with VideoEncodingManager(dataset):