fix(video benchmark)

* Fix typos in PyAV decoder names * Add torchcodec to the video backends * Update image datasets to v3.0
2026-07-15 05:51:52 +00:00 · 2025-09-12 17:03:36 +02:00
2 changed files with 12 additions and 23 deletions
@@ -37,14 +37,14 @@ from tqdm import tqdm

 from lerobot.datasets.lerobot_dataset import LeRobotDataset
 from lerobot.datasets.video_utils import (
-    decode_video_frames_torchvision,
+    decode_video_frames,
    encode_video_frames,
 )
 from lerobot.utils.benchmark import TimeBenchmark

 BASE_ENCODING = OrderedDict(
    [
-        ("vcodec", "libx264"),
+        ("vcodec", "h264"),
        ("pix_fmt", "yuv444p"),
        ("g", 2),
        ("crf", None),
@@ -147,18 +147,6 @@ def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> lis
    return [idx / fps for idx in frame_indexes]


-def decode_video_frames(
-    video_path: str,
-    timestamps: list[float],
-    tolerance_s: float,
-    backend: str,
-) -> torch.Tensor:
-    if backend in ["pyav", "video_reader"]:
-        return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
-    else:
-        raise NotImplementedError(backend)
-
-
 def benchmark_decoding(
    imgs_dir: Path,
    video_path: Path,
@@ -406,9 +394,9 @@ if __name__ == "__main__":
        nargs="*",
        default=[
            "lerobot/pusht_image",
-            "aliberts/aloha_mobile_shrimp_image",
-            "aliberts/paris_street",
-            "aliberts/kitchen",
+            "CarolinePascal/aloha_mobile_shrimp_image",
+            "CarolinePascal/paris_street",
+            "CarolinePascal/kitchen",
        ],
        help="Datasets repo-ids to test against. First episodes only are used. Must be images.",
    )
@@ -416,7 +404,7 @@ if __name__ == "__main__":
        "--vcodec",
        type=str,
        nargs="*",
-        default=["libx264", "hevc", "libsvtav1"],
+        default=["h264", "hevc", "libsvtav1"],
        help="Video codecs to be tested",
    )
    parser.add_argument(
@@ -446,7 +434,7 @@ if __name__ == "__main__":
    #     nargs="*",
    #     default=[0, 1],
    #     help="Use the fastdecode tuning option. 0 disables it. "
-    #         "For libx264 and libx265/hevc, only 1 is possible. "
+    #         "For h264 and h265/hevc, only 1 is possible. "
    #         "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
    # )
    parser.add_argument(
@@ -465,8 +453,8 @@ if __name__ == "__main__":
        "--backends",
        type=str,
        nargs="*",
-        default=["pyav", "video_reader"],
-        help="Torchvision decoding backend to be tested.",
+        default=["torchcodec", "pyav", "video_reader"],
+        help="Video decoding backend to be tested.",
    )
    parser.add_argument(
        "--num-samples",
@@ -440,8 +440,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
            download_videos (bool, optional): Flag to download the videos. Note that when set to True but the
                video files are already present on local disk, they won't be downloaded again. Defaults to
                True.
-            video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to torchcodec when available int the platform; otherwise, defaults to 'pyav'.
-                You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
+            video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to
+                'torchcodec' when available on the platform; otherwise, defaults to 'pyav', torchvision's
+                default backend. You can also use 'video_reader', which is another torchvision decoder.
            batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
                Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
        """