From dbb32ead5fa9e3089609e309d724630ece67ab06 Mon Sep 17 00:00:00 2001 From: CarolinePascal Date: Fri, 12 Sep 2025 17:03:36 +0200 Subject: [PATCH] fix(video benchmark) * fixing typos on PyAV decoders names * adding torchcodec among video backends * updating images datasets to v3.0 --- benchmarks/video/run_video_benchmark.py | 30 ++++++++----------------- src/lerobot/datasets/lerobot_dataset.py | 5 +++-- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py index bababf636..15d893e4d 100644 --- a/benchmarks/video/run_video_benchmark.py +++ b/benchmarks/video/run_video_benchmark.py @@ -37,14 +37,14 @@ from tqdm import tqdm from lerobot.datasets.lerobot_dataset import LeRobotDataset from lerobot.datasets.video_utils import ( - decode_video_frames_torchvision, + decode_video_frames, encode_video_frames, ) from lerobot.utils.benchmark import TimeBenchmark BASE_ENCODING = OrderedDict( [ - ("vcodec", "libx264"), + ("vcodec", "h264"), ("pix_fmt", "yuv444p"), ("g", 2), ("crf", None), @@ -147,18 +147,6 @@ def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> lis return [idx / fps for idx in frame_indexes] -def decode_video_frames( - video_path: str, - timestamps: list[float], - tolerance_s: float, - backend: str, -) -> torch.Tensor: - if backend in ["pyav", "video_reader"]: - return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend) - else: - raise NotImplementedError(backend) - - def benchmark_decoding( imgs_dir: Path, video_path: Path, @@ -406,9 +394,9 @@ if __name__ == "__main__": nargs="*", default=[ "lerobot/pusht_image", - "aliberts/aloha_mobile_shrimp_image", - "aliberts/paris_street", - "aliberts/kitchen", + "CarolinePascal/aloha_mobile_shrimp_image", + "CarolinePascal/paris_street", + "CarolinePascal/kitchen", ], help="Datasets repo-ids to test against. First episodes only are used. 
Must be images.", ) @@ -416,7 +404,7 @@ if __name__ == "__main__": "--vcodec", type=str, nargs="*", - default=["libx264", "hevc", "libsvtav1"], + default=["h264", "hevc", "libsvtav1"], help="Video codecs to be tested", ) parser.add_argument( @@ -446,7 +434,7 @@ if __name__ == "__main__": # nargs="*", # default=[0, 1], # help="Use the fastdecode tuning option. 0 disables it. " - # "For libx264 and libx265/hevc, only 1 is possible. " + # "For h264 and h265/hevc, only 1 is possible. " # "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization", # ) parser.add_argument( @@ -465,8 +453,8 @@ if __name__ == "__main__": "--backends", type=str, nargs="*", - default=["pyav", "video_reader"], - help="Torchvision decoding backend to be tested.", + default=["torchcodec", "pyav", "video_reader"], + help="Video decoding backend to be tested.", ) parser.add_argument( "--num-samples", diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index a869cb920..ae5591a4b 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -440,8 +440,9 @@ class LeRobotDataset(torch.utils.data.Dataset): download_videos (bool, optional): Flag to download the videos. Note that when set to True but the video files are already present on local disk, they won't be downloaded again. Defaults to True. - video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to torchcodec when available int the platform; otherwise, defaults to 'pyav'. - You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision. + video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to 'torchcodec' + when available on the platform; otherwise, defaults to torchvision's default backend : 'pyav'. 
+ You can also use 'video_reader' which is another decoder of torchvision. batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos. Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1. """