From dbb32ead5fa9e3089609e309d724630ece67ab06 Mon Sep 17 00:00:00 2001
From: CarolinePascal <caroline8.pascal@gmail.com>
Date: Fri, 12 Sep 2025 17:03:36 +0200
Subject: [PATCH] fix(video benchmark) * fixing typos on PyAV decoders names *
 adding torchcodec among video backends * updating images datasets to v3.0

---
 benchmarks/video/run_video_benchmark.py | 30 ++++++++-----------------
 src/lerobot/datasets/lerobot_dataset.py |  5 +++--
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py
index bababf636..15d893e4d 100644
--- a/benchmarks/video/run_video_benchmark.py
+++ b/benchmarks/video/run_video_benchmark.py
@@ -37,14 +37,14 @@ from tqdm import tqdm
 
 from lerobot.datasets.lerobot_dataset import LeRobotDataset
 from lerobot.datasets.video_utils import (
-    decode_video_frames_torchvision,
+    decode_video_frames,
     encode_video_frames,
 )
 from lerobot.utils.benchmark import TimeBenchmark
 
 BASE_ENCODING = OrderedDict(
     [
-        ("vcodec", "libx264"),
+        ("vcodec", "h264"),
         ("pix_fmt", "yuv444p"),
         ("g", 2),
         ("crf", None),
@@ -147,18 +147,6 @@ def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> lis
     return [idx / fps for idx in frame_indexes]
 
 
-def decode_video_frames(
-    video_path: str,
-    timestamps: list[float],
-    tolerance_s: float,
-    backend: str,
-) -> torch.Tensor:
-    if backend in ["pyav", "video_reader"]:
-        return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
-    else:
-        raise NotImplementedError(backend)
-
-
 def benchmark_decoding(
     imgs_dir: Path,
     video_path: Path,
@@ -406,9 +394,9 @@ if __name__ == "__main__":
         nargs="*",
         default=[
             "lerobot/pusht_image",
-            "aliberts/aloha_mobile_shrimp_image",
-            "aliberts/paris_street",
-            "aliberts/kitchen",
+            "CarolinePascal/aloha_mobile_shrimp_image",
+            "CarolinePascal/paris_street",
+            "CarolinePascal/kitchen",
         ],
         help="Datasets repo-ids to test against. First episodes only are used. Must be images.",
     )
@@ -416,7 +404,7 @@ if __name__ == "__main__":
         "--vcodec",
         type=str,
         nargs="*",
-        default=["libx264", "hevc", "libsvtav1"],
+        default=["h264", "hevc", "libsvtav1"],
         help="Video codecs to be tested",
     )
     parser.add_argument(
@@ -446,7 +434,7 @@ if __name__ == "__main__":
     #     nargs="*",
     #     default=[0, 1],
     #     help="Use the fastdecode tuning option. 0 disables it. "
-    #         "For libx264 and libx265/hevc, only 1 is possible. "
+    #         "For h264 and h265/hevc, only 1 is possible. "
     #         "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
     # )
     parser.add_argument(
@@ -465,8 +453,8 @@ if __name__ == "__main__":
         "--backends",
         type=str,
         nargs="*",
-        default=["pyav", "video_reader"],
-        help="Torchvision decoding backend to be tested.",
+        default=["torchcodec", "pyav", "video_reader"],
+        help="Video decoding backend to be tested.",
     )
     parser.add_argument(
         "--num-samples",
diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index a869cb920..ae5591a4b 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -440,8 +440,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
             download_videos (bool, optional): Flag to download the videos. Note that when set to True but the
                 video files are already present on local disk, they won't be downloaded again. Defaults to
                 True.
-            video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to torchcodec when available int the platform; otherwise, defaults to 'pyav'.
-                You can also use the 'pyav' decoder used by Torchvision, which used to be the default option, or 'video_reader' which is another decoder of Torchvision.
+            video_backend (str | None, optional): Video backend to use for decoding videos. Defaults to 'torchcodec'
+                when available on the platform; otherwise, defaults to torchvision's default backend, 'pyav'.
+                You can also use 'video_reader', which is another torchvision decoder.
             batch_encoding_size (int, optional): Number of episodes to accumulate before batch encoding videos.
                 Set to 1 for immediate encoding (default), or higher for batched encoding. Defaults to 1.
         """