feat(policy): use pretrained vision encoder weights by default for diffusion and vqbet (#3202)

* feat: add pretrained vision encoder weights for diffusion and vqbet * fix test by re-generating artifacts --------- Co-authored-by: Steven Palma <imstevenpmwork@ieee.org>
2026-07-19 07:51:43 +00:00 · 2026-05-07 11:10:38 +01:00
parent a0e52d52fe
commit eaf0218bc8
6 changed files with 13 additions and 13 deletions
@@ -100,8 +100,8 @@ class DiffusionConfig(PreTrainedConfig):

    # Inputs / output structure.
    n_obs_steps: int = 2
-    horizon: int = 16
-    n_action_steps: int = 8
+    horizon: int = 64
+    n_action_steps: int = 32

    normalization_mapping: dict[str, NormalizationMode] = field(
        default_factory=lambda: {
@@ -122,10 +122,10 @@ class DiffusionConfig(PreTrainedConfig):
    crop_ratio: float = 1.0
    crop_shape: tuple[int, int] | None = None
    crop_is_random: bool = True
-    pretrained_backbone_weights: str | None = None
-    use_group_norm: bool = True
+    pretrained_backbone_weights: str | None = "ResNet18_Weights.IMAGENET1K_V1"
+    use_group_norm: bool = False
    spatial_softmax_num_keypoints: int = 32
-    use_separate_rgb_encoder_per_camera: bool = False
+    use_separate_rgb_encoder_per_camera: bool = True
    # Unet.
    down_dims: tuple[int, ...] = (512, 1024, 2048)
    kernel_size: int = 5
@@ -97,8 +97,8 @@ class VQBeTConfig(PreTrainedConfig):
    vision_backbone: str = "resnet18"
    crop_shape: tuple[int, int] | None = (84, 84)
    crop_is_random: bool = True
-    pretrained_backbone_weights: str | None = None
-    use_group_norm: bool = True
+    pretrained_backbone_weights: str | None = "ResNet18_Weights.IMAGENET1K_V1"
+    use_group_norm: bool = False
    spatial_softmax_num_keypoints: int = 32
    # VQ-VAE
    n_vqvae_training_steps: int = 20000