mirror of
https://github.com/huggingface/lerobot.git
synced 2026-06-17 08:17:02 +00:00
feat(dataset tools): adding missing docstrings and features for depth fill support in dataset editing tools
This commit is contained in:
@@ -608,7 +608,7 @@ def _keep_episodes_from_video_with_av(
|
||||
output_path: Path,
|
||||
episodes_to_keep: list[tuple[int, int]],
|
||||
fps: float,
|
||||
camera_encoder: VideoEncoderConfig,
|
||||
video_encoder: VideoEncoderConfig,
|
||||
) -> None:
|
||||
"""Keep only specified episodes from a video file using PyAV.
|
||||
|
||||
@@ -622,7 +622,7 @@ def _keep_episodes_from_video_with_av(
|
||||
Ranges are half-open intervals: [start_frame, end_frame), where start_frame
|
||||
is inclusive and end_frame is exclusive.
|
||||
fps: Frame rate of the video.
|
||||
camera_encoder: Video encoder settings used to re-encode the kept frames.
|
||||
video_encoder: Video encoder settings used to re-encode the kept frames.
|
||||
"""
|
||||
from fractions import Fraction
|
||||
|
||||
@@ -647,13 +647,13 @@ def _keep_episodes_from_video_with_av(
|
||||
|
||||
# Convert fps to Fraction for PyAV compatibility.
|
||||
fps_fraction = Fraction(fps).limit_denominator(1000)
|
||||
codec_options = camera_encoder.get_codec_options(as_strings=True)
|
||||
v_out = out.add_stream(camera_encoder.vcodec, rate=fps_fraction, options=codec_options)
|
||||
codec_options = video_encoder.get_codec_options(as_strings=True)
|
||||
v_out = out.add_stream(video_encoder.vcodec, rate=fps_fraction, options=codec_options)
|
||||
|
||||
# PyAV type stubs don't distinguish video streams from audio/subtitle streams.
|
||||
v_out.width = v_in.codec_context.width
|
||||
v_out.height = v_in.codec_context.height
|
||||
v_out.pix_fmt = camera_encoder.pix_fmt
|
||||
v_out.pix_fmt = video_encoder.pix_fmt
|
||||
|
||||
# Set time_base to match the frame rate for proper timestamp handling.
|
||||
v_out.time_base = Fraction(1, int(fps))
|
||||
@@ -1670,7 +1670,7 @@ def convert_image_to_video_dataset(
|
||||
output_dir: Path | None = None,
|
||||
repo_id: str | None = None,
|
||||
camera_encoder: VideoEncoderConfig | None = None,
|
||||
depth_encoder: VideoEncoderConfig | None = None,
|
||||
depth_encoder: DepthEncoderConfig | None = None,
|
||||
episode_indices: list[int] | None = None,
|
||||
num_workers: int = 4,
|
||||
max_episodes_per_batch: int | None = None,
|
||||
@@ -1685,8 +1685,11 @@ def convert_image_to_video_dataset(
|
||||
dataset: The source LeRobot dataset with images
|
||||
output_dir: Root directory where the edited dataset will be stored. If not specified, defaults to $HF_LEROBOT_HOME/repo_id. Equivalent to new_root in EditDatasetConfig.
|
||||
repo_id: Edited dataset identifier. Equivalent to new_repo_id in EditDatasetConfig.
|
||||
camera_encoder: Video encoder settings
|
||||
camera_encoder: Video encoder settings applied to RGB cameras
|
||||
(``None`` uses :func:`~lerobot.configs.camera_encoder_defaults`).
|
||||
depth_encoder: Video encoder settings applied to depth-map cameras, including
|
||||
the quantization parameters persisted to the dataset metadata
|
||||
(``None`` uses :func:`~lerobot.configs.depth_encoder_defaults`).
|
||||
episode_indices: List of episode indices to convert (None = all episodes)
|
||||
num_workers: Number of threads for parallel processing (default: 4)
|
||||
max_episodes_per_batch: Maximum episodes per video batch to avoid memory issues (None = no limit)
|
||||
@@ -1879,13 +1882,11 @@ def convert_image_to_video_dataset(
|
||||
# Update video info for all image keys (now videos)
|
||||
# We need to manually set video info since update_video_info() checks video_keys first
|
||||
for img_key in img_keys:
|
||||
if not new_meta.features[img_key].get("info", None):
|
||||
video_path = new_meta.root / new_meta.video_path.format(
|
||||
video_key=img_key, chunk_index=0, file_index=0
|
||||
)
|
||||
new_meta.info.features[img_key]["info"] = get_video_info(
|
||||
video_path, video_encoder=camera_encoder
|
||||
)
|
||||
target_encoder = depth_encoder if img_key in dataset.meta.depth_keys else camera_encoder
|
||||
video_path = new_meta.root / new_meta.video_path.format(
|
||||
video_key=img_key, chunk_index=0, file_index=0
|
||||
)
|
||||
new_meta.info.features[img_key]["info"] = get_video_info(video_path, video_encoder=target_encoder)
|
||||
|
||||
write_info(new_meta.info, new_meta.root)
|
||||
|
||||
|
||||
@@ -133,6 +133,15 @@ Convert image dataset to video format and save locally:
|
||||
--new_root /path/to/output/pusht_video \
|
||||
--operation.type convert_image_to_video
|
||||
|
||||
Convert image dataset (with depth maps) to video format, customizing the depth encoder:
|
||||
lerobot-edit-dataset \
|
||||
--repo_id lerobot/pusht_image \
|
||||
--new_root /path/to/output/pusht_video \
|
||||
--operation.type convert_image_to_video \
|
||||
--operation.depth_encoder.depth_min 0.01 \
|
||||
--operation.depth_encoder.depth_max 10.0 \
|
||||
--operation.depth_encoder.use_log true
|
||||
|
||||
Convert image dataset to video format and save with new repo_id:
|
||||
lerobot-edit-dataset \
|
||||
--repo_id lerobot/pusht_image \
|
||||
@@ -211,6 +220,13 @@ Re-encode videos in-place (overwrites original dataset):
|
||||
--operation.camera_encoder.vcodec h264 \
|
||||
--operation.overwrite true
|
||||
|
||||
Re-encode both RGB and depth videos in a dataset (depth quantization params are preserved):
|
||||
lerobot-edit-dataset \
|
||||
--repo_id lerobot/pusht_depth \
|
||||
--operation.type reencode_videos \
|
||||
--operation.camera_encoder.vcodec libx264 \
|
||||
--operation.depth_encoder.vcodec ffv1
|
||||
|
||||
Using JSON config file:
|
||||
lerobot-edit-dataset \
|
||||
--config_path path/to/edit_config.json
|
||||
@@ -225,7 +241,13 @@ from pathlib import Path
|
||||
|
||||
import draccus
|
||||
|
||||
from lerobot.configs import VideoEncoderConfig, camera_encoder_defaults, parser
|
||||
from lerobot.configs import (
|
||||
DepthEncoderConfig,
|
||||
VideoEncoderConfig,
|
||||
camera_encoder_defaults,
|
||||
depth_encoder_defaults,
|
||||
parser,
|
||||
)
|
||||
from lerobot.datasets import (
|
||||
LeRobotDataset,
|
||||
convert_image_to_video_dataset,
|
||||
@@ -288,6 +310,7 @@ class ModifyTasksConfig(OperationConfig):
|
||||
class ConvertImageToVideoConfig(OperationConfig):
|
||||
output_dir: str | None = None
|
||||
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
|
||||
depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
|
||||
episode_indices: list[int] | None = None
|
||||
num_workers: int = 4
|
||||
max_episodes_per_batch: int | None = None
|
||||
@@ -309,6 +332,7 @@ class RecomputeStatsConfig(OperationConfig):
|
||||
@dataclass
|
||||
class ReencodeVideosConfig(OperationConfig):
|
||||
camera_encoder: VideoEncoderConfig = field(default_factory=camera_encoder_defaults)
|
||||
depth_encoder: DepthEncoderConfig = field(default_factory=depth_encoder_defaults)
|
||||
num_workers: int = 0
|
||||
encoder_threads: int | None = None
|
||||
overwrite: bool = False
|
||||
@@ -602,6 +626,7 @@ def handle_convert_image_to_video(cfg: EditDatasetConfig) -> None:
|
||||
output_dir=output_dir,
|
||||
repo_id=output_repo_id,
|
||||
camera_encoder=getattr(cfg.operation, "camera_encoder", None) or camera_encoder_defaults(),
|
||||
depth_encoder=getattr(cfg.operation, "depth_encoder", None) or depth_encoder_defaults(),
|
||||
episode_indices=getattr(cfg.operation, "episode_indices", None),
|
||||
num_workers=getattr(cfg.operation, "num_workers", 4),
|
||||
max_episodes_per_batch=getattr(cfg.operation, "max_episodes_per_batch", None),
|
||||
@@ -719,10 +744,14 @@ def handle_reencode_videos(cfg: EditDatasetConfig) -> None:
|
||||
shutil.copytree(input_root, output_root)
|
||||
dataset = LeRobotDataset(output_repo_id, root=output_root)
|
||||
|
||||
logging.info(f"Re-encoding videos in {output_repo_id} with {cfg.operation.camera_encoder}")
|
||||
logging.info(
|
||||
f"Re-encoding videos in {output_repo_id} with RGB encoder {cfg.operation.camera_encoder} "
|
||||
f"and depth encoder {cfg.operation.depth_encoder}"
|
||||
)
|
||||
reencode_dataset(
|
||||
dataset,
|
||||
camera_encoder=cfg.operation.camera_encoder,
|
||||
depth_encoder=cfg.operation.depth_encoder,
|
||||
encoder_threads=cfg.operation.encoder_threads,
|
||||
num_workers=cfg.operation.num_workers,
|
||||
)
|
||||
|
||||
@@ -1380,12 +1380,24 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa
|
||||
mock_get_safe_version.return_value = "v3.0"
|
||||
mock_snapshot_download.return_value = str(output_dir)
|
||||
|
||||
# Use non-default quantization params so the persisted metadata must
|
||||
# come from the depth encoder (not RGB encoder defaults).
|
||||
depth_encoder = DepthEncoderConfig(
|
||||
vcodec="hevc",
|
||||
pix_fmt="gray12le",
|
||||
g=2,
|
||||
crf=30,
|
||||
depth_min=0.05,
|
||||
depth_max=8.0,
|
||||
shift=2.0,
|
||||
use_log=False,
|
||||
)
|
||||
video_dataset = convert_image_to_video_dataset(
|
||||
dataset=source_dataset,
|
||||
output_dir=output_dir,
|
||||
repo_id="dummy/depth_video",
|
||||
camera_encoder=VideoEncoderConfig(vcodec="libsvtav1", pix_fmt="yuv420p", g=2, crf=30),
|
||||
depth_encoder=DepthEncoderConfig(vcodec="hevc", pix_fmt="gray12le", g=2, crf=30),
|
||||
depth_encoder=depth_encoder,
|
||||
num_workers=1,
|
||||
)
|
||||
|
||||
@@ -1398,6 +1410,18 @@ def test_convert_image_to_video_dataset_depth(tmp_path, empty_lerobot_dataset_fa
|
||||
depth_path = video_dataset.root / video_dataset.meta.get_video_file_path(0, "observation.images.depth")
|
||||
assert depth_path.exists(), f"Depth video file should exist: {depth_path}"
|
||||
|
||||
# The persisted depth-video metadata must carry the depth quantization params
|
||||
# from the depth encoder (so frames dequantize correctly on read), and the RGB
|
||||
# camera must not be marked as a depth map.
|
||||
persisted_info = load_info(video_dataset.root)
|
||||
depth_info = persisted_info.features["observation.images.depth"]["info"]
|
||||
assert depth_info["is_depth_map"] is True
|
||||
assert DepthEncoderConfig.from_video_info(depth_info) == depth_encoder
|
||||
|
||||
cam_info = persisted_info.features["observation.images.cam"]["info"]
|
||||
assert cam_info.get("is_depth_map") is False
|
||||
assert "video.codec" in cam_info
|
||||
|
||||
|
||||
# ─── reencode_dataset ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ from lerobot.scripts.lerobot_edit_dataset import (
|
||||
MergeConfig,
|
||||
ModifyTasksConfig,
|
||||
OperationConfig,
|
||||
ReencodeVideosConfig,
|
||||
RemoveFeatureConfig,
|
||||
SplitConfig,
|
||||
_validate_config,
|
||||
@@ -103,3 +104,47 @@ class TestOperationTypeParsing:
|
||||
)
|
||||
resolved_name = OperationConfig.get_choice_name(type(cfg.operation))
|
||||
assert resolved_name == type_name
|
||||
|
||||
|
||||
class TestDepthEncoderParsing:
    """Check that video operations expose a depth encoder and parse its CLI overrides."""

    def test_reencode_has_default_depth_encoder(self):
        """With no depth flags given, ``reencode_videos`` still carries a depth encoder."""
        argv = ["--repo_id", "test/repo", "--operation.type", "reencode_videos"]
        operation = parse_cfg(argv).operation
        assert isinstance(operation, ReencodeVideosConfig)
        # No depth flag was passed, so whatever is found here is the configured default.
        assert operation.depth_encoder is not None
        assert hasattr(operation.depth_encoder, "depth_min")

    def test_reencode_parses_depth_encoder_overrides(self):
        """Depth encoder flags on ``reencode_videos`` end up on the parsed config."""
        base_args = ["--repo_id", "test/repo", "--operation.type", "reencode_videos"]
        override_args = [
            "--operation.depth_encoder.vcodec",
            "ffv1",
            "--operation.depth_encoder.depth_max",
            "12.0",
            "--operation.depth_encoder.use_log",
            "false",
        ]
        depth_encoder = parse_cfg(base_args + override_args).operation.depth_encoder
        assert depth_encoder.vcodec == "ffv1"
        assert depth_encoder.depth_max == 12.0
        assert depth_encoder.use_log is False

    def test_convert_image_to_video_parses_depth_encoder_overrides(self):
        """Depth encoder flags on ``convert_image_to_video`` end up on the parsed config."""
        base_args = ["--repo_id", "test/repo", "--operation.type", "convert_image_to_video"]
        override_args = ["--operation.depth_encoder.depth_min", "0.05"]
        operation = parse_cfg(base_args + override_args).operation
        assert isinstance(operation, ConvertImageToVideoConfig)
        assert operation.depth_encoder.depth_min == 0.05
|
||||
|
||||
Reference in New Issue
Block a user