feat(dataset tools): adding missing docstrings and features for depth fill support in dataset editing tools

This commit is contained in:
CarolinePascal
2026-06-15 14:31:42 +02:00
parent 655338abf3
commit a694e32774
4 changed files with 116 additions and 17 deletions
+15 -14
View File
@@ -608,7 +608,7 @@ def _keep_episodes_from_video_with_av(
output_path: Path,
episodes_to_keep: list[tuple[int, int]],
fps: float,
camera_encoder: VideoEncoderConfig,
video_encoder: VideoEncoderConfig,
) -> None:
"""Keep only specified episodes from a video file using PyAV.
@@ -622,7 +622,7 @@ def _keep_episodes_from_video_with_av(
Ranges are half-open intervals: [start_frame, end_frame), where start_frame
is inclusive and end_frame is exclusive.
fps: Frame rate of the video.
camera_encoder: Video encoder settings used to re-encode the kept frames.
video_encoder: Video encoder settings used to re-encode the kept frames.
"""
from fractions import Fraction
@@ -647,13 +647,13 @@ def _keep_episodes_from_video_with_av(
# Convert fps to Fraction for PyAV compatibility.
fps_fraction = Fraction(fps).limit_denominator(1000)
codec_options = camera_encoder.get_codec_options(as_strings=True)
v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options)
codec_options = video_encoder.get_codec_options(as_strings=True)
v_out = out.add_stream(video_encoder.vcodec, rate=fps_fraction, options=codec_options)
# PyAV type stubs don't distinguish video streams from audio/subtitle streams.
v_out.width = v_in.codec_context.width
v_out.height = v_in.codec_context.height
v_out.pix_fmt = camera_encoder.pix_fmt
v_out.pix_fmt = video_encoder.pix_fmt
# Set time_base to match the frame rate for proper timestamp handling.
v_out.time_base = Fraction(1, int(fps))
@@ -1670,7 +1670,7 @@ def convert_image_to_video_dataset(
output_dir: Path | None = None,
repo_id: str | None = None,
camera_encoder: VideoEncoderConfig | None = None,
depth_encoder: VideoEncoderConfig | None = None,
depth_encoder: DepthEncoderConfig | None = None,
episode_indices: list[int] | None = None,
num_workers: int = 4,
max_episodes_per_batch: int | None = None,
@@ -1685,8 +1685,11 @@ def convert_image_to_video_dataset(
dataset: The source LeRobot dataset with images
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
camera_encoder: Video encoder settings
camera_encoder: Video encoder settings applied to RGB cameras
(``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
depth_encoder: Video encoder settings applied to depth-map cameras, including
the quantization parameters persisted to the dataset metadata
(``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`).
episode_indices: List of episode indices to convert (None = all episodes)
num_workers: Number of threads for parallel processing (default: 4)
max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
@@ -1879,13 +1882,11 @@ def convert_image_to_video_dataset(
# Update video info for all image keys (now videos)
# We need to manually set video info since update_video_info() checks video_keys first
for img_key in img_keys:
if not new_meta.features[img_key].get("info", None):
video_path = new_meta.root / new_meta.video_path.format(
video_key=img_key, chunk_index=0, file_index=0
)
new_meta.info.features[img_key]["info"] = get_video_info(
video_path, video_encoder=camera_encoder
)
target_encoder = depth_encoder if img_key in dataset.meta.depth_keys else camera_encoder
video_path = new_meta.root / new_meta.video_path.format(
video_key=img_key, chunk_index=0, file_index=0
)
new_meta.info.features[img_key]["info"] = get_video_info(video_path, video_encoder=target_encoder)
write_info(new_meta.info, new_meta.root)
+31 -2
View File
@@ -133,6 +133,15 @@ Convert image dataset to video format and save locally:
--new_root /path/to/output/pusht_video \
--operation.type convert_image_to_video
Convert image dataset (with depth maps) to video format, customizing the depth encoder:
lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
--new_root /path/to/output/pusht_video \
--operation.type convert_image_to_video \
--operation.depth_encoder.depth_min 0.01 \
--operation.depth_encoder.depth_max 10.0 \
--operation.depth_encoder.use_log true
Convert image dataset to video format and save with new repo_id:
lerobot-edit-dataset \
--repo_id lerobot/pusht_image \
@@ -211,6 +220,13 @@ Re-encode videos in-place (overwrites original dataset):
--operation.camera_encoder.vcodec h264 \
--operation.overwrite true
Re-encode both RGB and depth videos in a dataset (depth quantization params are preserved):
lerobot-edit-dataset \
--repo_id lerobot/pusht_depth \
--operation.type reencode_videos \
--operation.camera_encoder.vcodec libx264 \
--operation.depth_encoder.vcodec ffv1
Using JSON config file:
lerobot-edit-dataset \
--config_path path/to/edit_config.json
@@ -225,7 +241,13 @@ from pathlib import Path
import draccus
from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser
from lerobot.configs import (
DepthEncoderConfig,
VideoEncoderConfig,
camera_encoder_defaults,
depth_encoder_defaults,
parser,
)
from lerobot.datasets import (
LeRobotDataset,
convert_image_to_video_dataset,
@@ -288,6 +310,7 @@ class ModifyTasksConfig(OperationConfig):
class ConvertImageToVideoConfig(OperationConfig):
output_dir: str | None = None
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
episode_indices: list[int] | None = None
num_workers: int = 4
max_episodes_per_batch: int | None = None
@@ -309,6 +332,7 @@ class RecomputeStatsConfig(OperationConfig):
@dataclass
class ReencodeVideosConfig(OperationConfig):
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
num_workers: int = 0
encoder_threads: int | None = None
overwrite: bool = False
@@ -602,6 +626,7 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
output_dir=output_dir,
repo_id=output_repo_id,
camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(),
depth_encoder=getattr(cfg.operation, "depth_encoder", None) or depth_encoder_defaults(),
episode_indices=getattr(cfg.operation, "episode_indices", None),
num_workers=getattr(cfg.operation, "num_workers", 4),
max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None),
@@ -719,10 +744,14 @@ def handle_reencode_videos(cfg: EditDatasetConfig) -> None:
shutil.copytree(input_root, output_root)
dataset = LeRobotDataset(output_repo_id, root=output_root)
logging.info(f"Re-encoding videos in {output_repo_id} with {cfg.operation.camera_encoder}")
logging.info(
f"Re-encoding videos in {output_repo_id} with RGB encoder {cfg.operation.camera_encoder} "
f"and depth encoder {cfg.operation.depth_encoder}"
)
reencode_dataset(
dataset,
camera_encoder=cfg.operation.camera_encoder,
depth_encoder=cfg.operation.depth_encoder,
encoder_threads=cfg.operation.encoder_threads,
num_workers=cfg.operation.num_workers,
)
+25 -1
View File
@@ -1380,12 +1380,24 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa
mock_get_safe_version.return_value = "v3.0"
mock_snapshot_download.return_value = str(output_dir)
# Use non-default quantization params so the persisted metadata must
# come from the depth encoder (not RGB encoder defaults).
depth_encoder = DepthEncoderConfig(
vcodec="hevc",
pix_fmt="gray12le",
g=2,
crf=30,
depth_min=0.05,
depth_max=8.0,
shift=2.0,
use_log=False,
)
video_dataset = convert_image_to_video_dataset(
dataset=source_dataset,
output_dir=output_dir,
repo_id="dummy/depth_video",
camera_encoder=VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30),
depth_encoder=DepthEncoderConfig(vcodec="hevc", pix_fmt="gray12le", g=2, crf=30),
depth_encoder=depth_encoder,
num_workers=1,
)
@@ -1398,6 +1410,18 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa
depth_path = video_dataset.root / video_dataset.meta.get_video_file_path(0, "observation.images.depth")
assert depth_path.exists(), f"Depth video file should exist: {depth_path}"
# The persisted depth-video metadata must carry the depth quantization params
# from the depth encoder (so frames dequantize correctly on read), and the RGB
# camera must not be marked as a depth map.
persisted_info = load_info(video_dataset.root)
depth_info = persisted_info.features["observation.images.depth"]["info"]
assert depth_info["is_depth_map"] is True
assert DepthEncoderConfig.from_video_info(depth_info) == depth_encoder
cam_info = persisted_info.features["observation.images.cam"]["info"]
assert cam_info.get("is_depth_map") is False
assert "video.codec" in cam_info
# ─── reencode_dataset ─────────────────────────────────────────────────
@@ -27,6 +27,7 @@ from lerobot.scripts.lerobot_edit_dataset import (
MergeConfig,
ModifyTasksConfig,
OperationConfig,
ReencodeVideosConfig,
RemoveFeatureConfig,
SplitConfig,
_validate_config,
@@ -103,3 +104,47 @@ class TestOperationTypeParsing:
)
resolved_name = OperationConfig.get_choice_name(type(cfg.operation))
assert resolved_name == type_name
class TestDepthEncoderParsing:
    """Check that video operations expose a depth encoder and parse its CLI overrides."""

    def test_reencode_has_default_depth_encoder(self):
        """``reencode_videos`` parsed without any depth flag still carries a depth encoder."""
        argv = ["--repo_id", "test/repo", "--operation.type", "reencode_videos"]
        operation = parse_cfg(argv).operation
        assert isinstance(operation, ReencodeVideosConfig)
        # No ``--operation.depth_encoder.*`` flag is passed above, so whatever is
        # found here comes from the operation's own default.
        depth_encoder = operation.depth_encoder
        assert depth_encoder is not None
        assert hasattr(depth_encoder, "depth_min")

    def test_reencode_parses_depth_encoder_overrides(self):
        """Depth-encoder flags given to ``reencode_videos`` land on the parsed config."""
        argv = ["--repo_id", "test/repo", "--operation.type", "reencode_videos"]
        argv += ["--operation.depth_encoder.vcodec", "ffv1"]
        argv += ["--operation.depth_encoder.depth_max", "12.0"]
        argv += ["--operation.depth_encoder.use_log", "false"]

        depth_encoder = parse_cfg(argv).operation.depth_encoder

        assert depth_encoder.vcodec == "ffv1"
        assert depth_encoder.depth_max == 12.0
        assert depth_encoder.use_log is False

    def test_convert_image_to_video_parses_depth_encoder_overrides(self):
        """Depth-encoder flags given to ``convert_image_to_video`` land on the parsed config."""
        argv = ["--repo_id", "test/repo", "--operation.type", "convert_image_to_video"]
        argv += ["--operation.depth_encoder.depth_min", "0.05"]

        operation = parse_cfg(argv).operation

        assert isinstance(operation, ConvertImageToVideoConfig)
        assert operation.depth_encoder.depth_min == 0.05