From a694e327740e369340fcc3e2b2a7484fd2b8e644 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Mon, 15 Jun 2026 14:31:42 +0200 Subject: [PATCH] feat(dataset tools): adding missing docstrings and features for depth fill support in dataset editing tools --- src/lerobot/datasets/dataset_tools.py | 29 ++++++------- src/lerobot/scripts/lerobot_edit_dataset.py | 33 ++++++++++++++- tests/datasets/test_dataset_tools.py | 26 +++++++++++- tests/scripts/test_edit_dataset_parsing.py | 45 +++++++++++++++++++++ 4 files changed, 116 insertions(+), 17 deletions(-) diff --git a/src/lerobot/datasets/dataset_tools.py b/src/lerobot/datasets/dataset_tools.py index c64c4c29c..5170c0137 100644 --- a/src/lerobot/datasets/dataset_tools.py +++ b/src/lerobot/datasets/dataset_tools.py @@ -608,7 +608,7 @@ def _keep_episodes_from_video_with_av( output_path: Path, episodes_to_keep: list[tuple[int, int]], fps: float, - camera_encoder: VideoEncoderConfig, + video_encoder: VideoEncoderConfig, ) -> None: """Keep only specified episodes from a video file using PyAV. @@ -622,7 +622,7 @@ def _keep_episodes_from_video_with_av( Ranges are half-open intervals: [start_frame, end_frame), where start_frame is inclusive and end_frame is exclusive. fps: Frame rate of the video. - camera_encoder: Video encoder settings used to re-encode the kept frames. + video_encoder: Video encoder settings used to re-encode the kept frames. """ from fractions import Fraction @@ -647,13 +647,13 @@ def _keep_episodes_from_video_with_av( # Convert fps to Fraction for PyAV compatibility. fps_fraction = Fraction(fps).limit_denominator(1000) - codec_options = camera_encoder.get_codec_options(as_strings=True) - v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options) + codec_options = video_encoder.get_codec_options(as_strings=True) + v_out = out.add_stream(video_encoder.vcodec, rate=fps_fraction, options=codec_options) # PyAV type stubs don't distinguish video streams from audio/subtitle streams. 
v_out.width = v_in.codec_context.width v_out.height = v_in.codec_context.height - v_out.pix_fmt = camera_encoder.pix_fmt + v_out.pix_fmt = video_encoder.pix_fmt # Set time_base to match the frame rate for proper timestamp handling. v_out.time_base = Fraction(1, int(fps)) @@ -1670,7 +1670,7 @@ def convert_image_to_video_dataset( output_dir: Path | None = None, repo_id: str | None = None, camera_encoder: VideoEncoderConfig | None = None, - depth_encoder: VideoEncoderConfig | None = None, + depth_encoder: DepthEncoderConfig | None = None, episode_indices: list[int] | None = None, num_workers: int = 4, max_episodes_per_batch: int | None = None, @@ -1685,8 +1685,11 @@ def convert_image_to_video_dataset( dataset: The source LeRobot dataset with images output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig. repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig. - camera_encoder: Video encoder settings + camera_encoder: Video encoder settings applied to RGB cameras (``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`). + depth_encoder: Video encoder settings applied to depth-map cameras, including + the quantization parameters persisted to the dataset metadata + (``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`). 
episode_indices: List of episode indices to convert (None = all episodes) num_workers: Number of threads for parallel processing (default: 4) max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit) @@ -1879,13 +1882,11 @@ def convert_image_to_video_dataset( # Update video info for all image keys (now videos) # We need to manually set video info since update_video_info() checks video_keys first for img_key in img_keys: - if not new_meta.features[img_key].get("info", None): - video_path = new_meta.root / new_meta.video_path.format( - video_key=img_key, chunk_index=0, file_index=0 - ) - new_meta.info.features[img_key]["info"] = get_video_info( - video_path, video_encoder=camera_encoder - ) + target_encoder = depth_encoder if img_key in dataset.meta.depth_keys else camera_encoder + video_path = new_meta.root / new_meta.video_path.format( + video_key=img_key, chunk_index=0, file_index=0 + ) + new_meta.info.features[img_key]["info"] = get_video_info(video_path, video_encoder=target_encoder) write_info(new_meta.info, new_meta.root) diff --git a/src/lerobot/scripts/lerobot_edit_dataset.py b/src/lerobot/scripts/lerobot_edit_dataset.py index eaadf47de..ab0be4a41 100644 --- a/src/lerobot/scripts/lerobot_edit_dataset.py +++ b/src/lerobot/scripts/lerobot_edit_dataset.py @@ -133,6 +133,15 @@ Convert image dataset to video format and save locally: --new_root /path/to/output/pusht_video \ --operation.type convert_image_to_video +Convert image dataset (with depth maps) to video format, customizing the depth encoder: + lerobot-edit-dataset \ + --repo_id lerobot/pusht_image \ + --new_root /path/to/output/pusht_video \ + --operation.type convert_image_to_video \ + --operation.depth_encoder.depth_min 0.01 \ + --operation.depth_encoder.depth_max 10.0 \ + --operation.depth_encoder.use_log true + Convert image dataset to video format and save with new repo_id: lerobot-edit-dataset \ --repo_id lerobot/pusht_image \ @@ -211,6 +220,13 @@ Re-encode 
videos in-place (overwrites original dataset): --operation.camera_encoder.vcodec h264 \ --operation.overwrite true +Re-encode both RGB and depth videos in a dataset (depth quantization params are preserved): + lerobot-edit-dataset \ + --repo_id lerobot/pusht_depth \ + --operation.type reencode_videos \ + --operation.camera_encoder.vcodec libx264 \ + --operation.depth_encoder.vcodec ffv1 + Using JSON config file: lerobot-edit-dataset \ --config_path path/to/edit_config.json @@ -225,7 +241,13 @@ from pathlib import Path import draccus -from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser +from lerobot.configs import ( + DepthEncoderConfig, + VideoEncoderConfig, + camera_encoder_defaults, + depth_encoder_defaults, + parser, +) from lerobot.datasets import ( LeRobotDataset, convert_image_to_video_dataset, @@ -288,6 +310,7 @@ class ModifyTasksConfig(OperationConfig): class ConvertImageToVideoConfig(OperationConfig): output_dir: str | None = None camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) + depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults) episode_indices: list[int] | None = None num_workers: int = 4 max_episodes_per_batch: int | None = None @@ -309,6 +332,7 @@ class RecomputeStatsConfig(OperationConfig): @dataclass class ReencodeVideosConfig(OperationConfig): camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults) + depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults) num_workers: int = 0 encoder_threads: int | None = None overwrite: bool = False @@ -602,6 +626,7 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None: output_dir=output_dir, repo_id=output_repo_id, camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(), + depth_encoder=getattr(cfg.operation, "depth_encoder", None) or depth_encoder_defaults(), episode_indices=getattr(cfg.operation, "episode_indices", 
None), num_workers=getattr(cfg.operation, "num_workers", 4), max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None), @@ -719,10 +744,14 @@ def handle_reencode_videos(cfg: EditDatasetConfig) -> None: shutil.copytree(input_root, output_root) dataset = LeRobotDataset(output_repo_id, root=output_root) - logging.info(f"Re-encoding videos in {output_repo_id} with {cfg.operation.camera_encoder}") + logging.info( + f"Re-encoding videos in {output_repo_id} with RGB encoder {cfg.operation.camera_encoder} " + f"and depth encoder {cfg.operation.depth_encoder}" + ) reencode_dataset( dataset, camera_encoder=cfg.operation.camera_encoder, + depth_encoder=cfg.operation.depth_encoder, encoder_threads=cfg.operation.encoder_threads, num_workers=cfg.operation.num_workers, ) diff --git a/tests/datasets/test_dataset_tools.py b/tests/datasets/test_dataset_tools.py index 0633ec900..440ea3bb8 100644 --- a/tests/datasets/test_dataset_tools.py +++ b/tests/datasets/test_dataset_tools.py @@ -1380,12 +1380,24 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa mock_get_safe_version.return_value = "v3.0" mock_snapshot_download.return_value = str(output_dir) + # Use non-default quantization params so the persisted metadata must + # come from the depth encoder (not RGB encoder defaults). 
+ depth_encoder = DepthEncoderConfig( + vcodec="hevc", + pix_fmt="gray12le", + g=2, + crf=30, + depth_min=0.05, + depth_max=8.0, + shift=2.0, + use_log=False, + ) video_dataset = convert_image_to_video_dataset( dataset=source_dataset, output_dir=output_dir, repo_id="dummy/depth_video", camera_encoder=VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30), - depth_encoder=DepthEncoderConfig(vcodec="hevc", pix_fmt="gray12le", g=2, crf=30), + depth_encoder=depth_encoder, num_workers=1, ) @@ -1398,6 +1410,18 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa depth_path = video_dataset.root / video_dataset.meta.get_video_file_path(0, "observation.images.depth") assert depth_path.exists(), f"Depth video file should exist: {depth_path}" + # The persisted depth-video metadata must carry the depth quantization params + # from the depth encoder (so frames dequantize correctly on read), and the RGB + # camera must not be marked as a depth map. + persisted_info = load_info(video_dataset.root) + depth_info = persisted_info.features["observation.images.depth"]["info"] + assert depth_info["is_depth_map"] is True + assert DepthEncoderConfig.from_video_info(depth_info) == depth_encoder + + cam_info = persisted_info.features["observation.images.cam"]["info"] + assert cam_info.get("is_depth_map") is False + assert "video.codec" in cam_info + # ─── reencode_dataset ───────────────────────────────────────────────── diff --git a/tests/scripts/test_edit_dataset_parsing.py b/tests/scripts/test_edit_dataset_parsing.py index c90cffb38..4dfad81f9 100644 --- a/tests/scripts/test_edit_dataset_parsing.py +++ b/tests/scripts/test_edit_dataset_parsing.py @@ -27,6 +27,7 @@ from lerobot.scripts.lerobot_edit_dataset import ( MergeConfig, ModifyTasksConfig, OperationConfig, + ReencodeVideosConfig, RemoveFeatureConfig, SplitConfig, _validate_config, @@ -103,3 +104,47 @@ class TestOperationTypeParsing: ) resolved_name = 
OperationConfig.get_choice_name(type(cfg.operation)) assert resolved_name == type_name + + +class TestDepthEncoderParsing: + """Test that the depth encoder is exposed and parsed for video operations.""" + + def test_reencode_has_default_depth_encoder(self): + cfg = parse_cfg(["--repo_id", "test/repo", "--operation.type", "reencode_videos"]) + assert isinstance(cfg.operation, ReencodeVideosConfig) + # A depth encoder is configured by default so depth videos are re-encoded too. + assert cfg.operation.depth_encoder is not None + assert hasattr(cfg.operation.depth_encoder, "depth_min") + + def test_reencode_parses_depth_encoder_overrides(self): + cfg = parse_cfg( + [ + "--repo_id", + "test/repo", + "--operation.type", + "reencode_videos", + "--operation.depth_encoder.vcodec", + "ffv1", + "--operation.depth_encoder.depth_max", + "12.0", + "--operation.depth_encoder.use_log", + "false", + ] + ) + assert cfg.operation.depth_encoder.vcodec == "ffv1" + assert cfg.operation.depth_encoder.depth_max == 12.0 + assert cfg.operation.depth_encoder.use_log is False + + def test_convert_image_to_video_parses_depth_encoder_overrides(self): + cfg = parse_cfg( + [ + "--repo_id", + "test/repo", + "--operation.type", + "convert_image_to_video", + "--operation.depth_encoder.depth_min", + "0.05", + ] + ) + assert isinstance(cfg.operation, ConvertImageToVideoConfig) + assert cfg.operation.depth_encoder.depth_min == 0.05