From ecf342d4814a180c3a5eb9faccb85a52f3746f20 Mon Sep 17 00:00:00 2001
From: Maxime Ellerbach <maxime.ellerbach@huggingface.co>
Date: Tue, 16 Jun 2026 11:27:51 +0000
Subject: [PATCH] small fix for the preprocessor and padded images

---
 src/lerobot/policies/fastwam/processor_fastwam.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/lerobot/policies/fastwam/processor_fastwam.py b/src/lerobot/policies/fastwam/processor_fastwam.py
index 080fdb9a4..9c31543f9 100644
--- a/src/lerobot/policies/fastwam/processor_fastwam.py
+++ b/src/lerobot/policies/fastwam/processor_fastwam.py
@@ -55,19 +55,23 @@ class FastWAMImageCropResizeProcessorStep(ImageCropResizeProcessorStep):
     """
 
     def observation(self, observation: dict) -> dict:
+        # Delta-timestamp video loading adds `<image_key>_is_pad` boolean masks ([B, T]) that share
+        # the `observation.images.` prefix but are padding flags, not frames. The base crop/resize
+        # matches on the `"image"` substring, so every `_is_pad` entry is set aside here and
+        # restored untouched afterwards rather than letting the base step try to resize a mask.
+        pad_keys = {key: value for key, value in observation.items() if "_is_pad" in key}
         leads: dict[str, tuple] = {}
-        flat_input = dict(observation)
-        for key, img in observation.items():
+        flat_input = {key: value for key, value in observation.items() if key not in pad_keys}
+        for key, img in list(flat_input.items()):
             if "image" in key and torch.is_tensor(img) and img.ndim > 4:
                 leads[key] = tuple(img.shape[:-3])
                 flat_input[key] = img.reshape(-1, *img.shape[-3:])
         processed = super().observation(flat_input)
-        if not leads:
-            return processed
         out = dict(processed)
         for key, lead in leads.items():
             im = processed[key]
             out[key] = im.reshape(*lead, *im.shape[-3:])
+        out.update(pad_keys)
         return out