[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
2026-05-20 19:19:56 +00:00 · 2025-03-04 13:38:47 +00:00
parent 76df8a31b3
commit 38f5fa4523
79 changed files with 2782 additions and 788 deletions
@@ -132,7 +132,11 @@ class DiffusionPolicy(PreTrainedPolicy):

        if len(self._queues["action"]) == 0:
            # stack n latest observations from the queue
-            batch = {k: torch.stack(list(self._queues[k]), dim=1) for k in batch if k in self._queues}
+            batch = {
+                k: torch.stack(list(self._queues[k]), dim=1)
+                for k in batch
+                if k in self._queues
+            }
            actions = self.diffusion.generate_actions(batch)

            # TODO(rcadene): make above methods return output dictionary?
@@ -189,7 +193,9 @@ class DiffusionModel(nn.Module):
        if self.config.env_state_feature:
            global_cond_dim += self.config.env_state_feature.shape[0]

-        self.unet = DiffusionConditionalUnet1d(config, global_cond_dim=global_cond_dim * config.n_obs_steps)
+        self.unet = DiffusionConditionalUnet1d(
+            config, global_cond_dim=global_cond_dim * config.n_obs_steps
+        )

        self.noise_scheduler = _make_noise_scheduler(
            config.noise_scheduler_type,
@@ -209,7 +215,10 @@ class DiffusionModel(nn.Module):

    # ========= inference  ============
    def conditional_sample(
-        self, batch_size: int, global_cond: Tensor | None = None, generator: torch.Generator | None = None
+        self,
+        batch_size: int,
+        global_cond: Tensor | None = None,
+        generator: torch.Generator | None = None,
    ) -> Tensor:
        device = get_device_from_parameters(self)
        dtype = get_dtype_from_parameters(self)
@@ -232,7 +241,9 @@ class DiffusionModel(nn.Module):
                global_cond=global_cond,
            )
            # Compute previous image: x_t -> x_t-1
-            sample = self.noise_scheduler.step(model_output, t, sample, generator=generator).prev_sample
+            sample = self.noise_scheduler.step(
+                model_output, t, sample, generator=generator
+            ).prev_sample

        return sample

@@ -244,27 +255,39 @@ class DiffusionModel(nn.Module):
        if self.config.image_features:
            if self.config.use_separate_rgb_encoder_per_camera:
                # Combine batch and sequence dims while rearranging to make the camera index dimension first.
-                images_per_camera = einops.rearrange(batch["observation.images"], "b s n ... -> n (b s) ...")
+                images_per_camera = einops.rearrange(
+                    batch["observation.images"], "b s n ... -> n (b s) ..."
+                )
                img_features_list = torch.cat(
                    [
                        encoder(images)
-                        for encoder, images in zip(self.rgb_encoder, images_per_camera, strict=True)
+                        for encoder, images in zip(
+                            self.rgb_encoder, images_per_camera, strict=True
+                        )
                    ]
                )
                # Separate batch and sequence dims back out. The camera index dim gets absorbed into the
                # feature dim (effectively concatenating the camera features).
                img_features = einops.rearrange(
-                    img_features_list, "(n b s) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
+                    img_features_list,
+                    "(n b s) ... -> b s (n ...)",
+                    b=batch_size,
+                    s=n_obs_steps,
                )
            else:
                # Combine batch, sequence, and "which camera" dims before passing to shared encoder.
                img_features = self.rgb_encoder(
-                    einops.rearrange(batch["observation.images"], "b s n ... -> (b s n) ...")
+                    einops.rearrange(
+                        batch["observation.images"], "b s n ... -> (b s n) ..."
+                    )
                )
                # Separate batch dim and sequence dim back out. The camera index dim gets absorbed into the
                # feature dim (effectively concatenating the camera features).
                img_features = einops.rearrange(
-                    img_features, "(b s n) ... -> b s (n ...)", b=batch_size, s=n_obs_steps
+                    img_features,
+                    "(b s n) ... -> b s (n ...)",
+                    b=batch_size,
+                    s=n_obs_steps,
                )
            global_cond_feats.append(img_features)

@@ -350,7 +373,9 @@ class DiffusionModel(nn.Module):
        elif self.config.prediction_type == "sample":
            target = batch["action"]
        else:
-            raise ValueError(f"Unsupported prediction type {self.config.prediction_type}")
+            raise ValueError(
+                f"Unsupported prediction type {self.config.prediction_type}"
+            )

        loss = F.mse_loss(pred, target, reduction="none")

@@ -410,7 +435,9 @@ class SpatialSoftmax(nn.Module):

        # we could use torch.linspace directly but that seems to behave slightly differently than numpy
        # and causes a small degradation in pc_success of pre-trained models.
-        pos_x, pos_y = np.meshgrid(np.linspace(-1.0, 1.0, self._in_w), np.linspace(-1.0, 1.0, self._in_h))
+        pos_x, pos_y = np.meshgrid(
+            np.linspace(-1.0, 1.0, self._in_w), np.linspace(-1.0, 1.0, self._in_h)
+        )
        pos_x = torch.from_numpy(pos_x.reshape(self._in_h * self._in_w, 1)).float()
        pos_y = torch.from_numpy(pos_y.reshape(self._in_h * self._in_w, 1)).float()
        # register as buffer so it's moved to the correct device.
@@ -452,7 +479,9 @@ class DiffusionRgbEncoder(nn.Module):
            # Always use center crop for eval
            self.center_crop = torchvision.transforms.CenterCrop(config.crop_shape)
            if config.crop_is_random:
-                self.maybe_random_crop = torchvision.transforms.RandomCrop(config.crop_shape)
+                self.maybe_random_crop = torchvision.transforms.RandomCrop(
+                    config.crop_shape
+                )
            else:
                self.maybe_random_crop = self.center_crop
        else:
@@ -473,7 +502,9 @@ class DiffusionRgbEncoder(nn.Module):
            self.backbone = _replace_submodules(
                root_module=self.backbone,
                predicate=lambda x: isinstance(x, nn.BatchNorm2d),
-                func=lambda x: nn.GroupNorm(num_groups=x.num_features // 16, num_channels=x.num_features),
+                func=lambda x: nn.GroupNorm(
+                    num_groups=x.num_features // 16, num_channels=x.num_features
+                ),
            )

        # Set up pooling and final layers.
@@ -515,7 +546,9 @@ class DiffusionRgbEncoder(nn.Module):


 def _replace_submodules(
-    root_module: nn.Module, predicate: Callable[[nn.Module], bool], func: Callable[[nn.Module], nn.Module]
+    root_module: nn.Module,
+    predicate: Callable[[nn.Module], bool],
+    func: Callable[[nn.Module], nn.Module],
 ) -> nn.Module:
    """
    Args:
@@ -528,7 +561,11 @@ def _replace_submodules(
    if predicate(root_module):
        return func(root_module)

-    replace_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
+    replace_list = [
+        k.split(".")
+        for k, m in root_module.named_modules(remove_duplicate=True)
+        if predicate(m)
+    ]
    for *parents, k in replace_list:
        parent_module = root_module
        if len(parents) > 0:
@@ -543,7 +580,9 @@ def _replace_submodules(
        else:
            setattr(parent_module, k, tgt_module)
    # verify that all BN are replaced
-    assert not any(predicate(m) for _, m in root_module.named_modules(remove_duplicate=True))
+    assert not any(
+        predicate(m) for _, m in root_module.named_modules(remove_duplicate=True)
+    )
    return root_module


@@ -571,7 +610,9 @@ class DiffusionConv1dBlock(nn.Module):
        super().__init__()

        self.block = nn.Sequential(
-            nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2),
+            nn.Conv1d(
+                inp_channels, out_channels, kernel_size, padding=kernel_size // 2
+            ),
            nn.GroupNorm(n_groups, out_channels),
            nn.Mish(),
        )
@@ -594,9 +635,13 @@ class DiffusionConditionalUnet1d(nn.Module):
        # Encoder for the diffusion timestep.
        self.diffusion_step_encoder = nn.Sequential(
            DiffusionSinusoidalPosEmb(config.diffusion_step_embed_dim),
-            nn.Linear(config.diffusion_step_embed_dim, config.diffusion_step_embed_dim * 4),
+            nn.Linear(
+                config.diffusion_step_embed_dim, config.diffusion_step_embed_dim * 4
+            ),
            nn.Mish(),
-            nn.Linear(config.diffusion_step_embed_dim * 4, config.diffusion_step_embed_dim),
+            nn.Linear(
+                config.diffusion_step_embed_dim * 4, config.diffusion_step_embed_dim
+            ),
        )

        # The FiLM conditioning dimension.
@@ -621,10 +666,16 @@ class DiffusionConditionalUnet1d(nn.Module):
            self.down_modules.append(
                nn.ModuleList(
                    [
-                        DiffusionConditionalResidualBlock1d(dim_in, dim_out, **common_res_block_kwargs),
-                        DiffusionConditionalResidualBlock1d(dim_out, dim_out, **common_res_block_kwargs),
+                        DiffusionConditionalResidualBlock1d(
+                            dim_in, dim_out, **common_res_block_kwargs
+                        ),
+                        DiffusionConditionalResidualBlock1d(
+                            dim_out, dim_out, **common_res_block_kwargs
+                        ),
                        # Downsample as long as it is not the last block.
-                        nn.Conv1d(dim_out, dim_out, 3, 2, 1) if not is_last else nn.Identity(),
+                        nn.Conv1d(dim_out, dim_out, 3, 2, 1)
+                        if not is_last
+                        else nn.Identity(),
                    ]
                )
            )
@@ -633,10 +684,14 @@ class DiffusionConditionalUnet1d(nn.Module):
        self.mid_modules = nn.ModuleList(
            [
                DiffusionConditionalResidualBlock1d(
-                    config.down_dims[-1], config.down_dims[-1], **common_res_block_kwargs
+                    config.down_dims[-1],
+                    config.down_dims[-1],
+                    **common_res_block_kwargs,
                ),
                DiffusionConditionalResidualBlock1d(
-                    config.down_dims[-1], config.down_dims[-1], **common_res_block_kwargs
+                    config.down_dims[-1],
+                    config.down_dims[-1],
+                    **common_res_block_kwargs,
                ),
            ]
        )
@@ -649,10 +704,16 @@ class DiffusionConditionalUnet1d(nn.Module):
                nn.ModuleList(
                    [
                        # dim_in * 2, because it takes the encoder's skip connection as well
-                        DiffusionConditionalResidualBlock1d(dim_in * 2, dim_out, **common_res_block_kwargs),
-                        DiffusionConditionalResidualBlock1d(dim_out, dim_out, **common_res_block_kwargs),
+                        DiffusionConditionalResidualBlock1d(
+                            dim_in * 2, dim_out, **common_res_block_kwargs
+                        ),
+                        DiffusionConditionalResidualBlock1d(
+                            dim_out, dim_out, **common_res_block_kwargs
+                        ),
                        # Upsample as long as it is not the last block.
-                        nn.ConvTranspose1d(dim_out, dim_out, 4, 2, 1) if not is_last else nn.Identity(),
+                        nn.ConvTranspose1d(dim_out, dim_out, 4, 2, 1)
+                        if not is_last
+                        else nn.Identity(),
                    ]
                )
            )
@@ -726,17 +787,23 @@ class DiffusionConditionalResidualBlock1d(nn.Module):
        self.use_film_scale_modulation = use_film_scale_modulation
        self.out_channels = out_channels

-        self.conv1 = DiffusionConv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups)
+        self.conv1 = DiffusionConv1dBlock(
+            in_channels, out_channels, kernel_size, n_groups=n_groups
+        )

        # FiLM modulation (https://arxiv.org/abs/1709.07871) outputs per-channel bias and (maybe) scale.
        cond_channels = out_channels * 2 if use_film_scale_modulation else out_channels
        self.cond_encoder = nn.Sequential(nn.Mish(), nn.Linear(cond_dim, cond_channels))

-        self.conv2 = DiffusionConv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups)
+        self.conv2 = DiffusionConv1dBlock(
+            out_channels, out_channels, kernel_size, n_groups=n_groups
+        )

        # A final convolution for dimension matching the residual (if needed).
        self.residual_conv = (
-            nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()
+            nn.Conv1d(in_channels, out_channels, 1)
+            if in_channels != out_channels
+            else nn.Identity()
        )

    def forward(self, x: Tensor, cond: Tensor) -> Tensor: