commit 96550e4ad1
parent c0146eed7f
Author: Dana
Date:   2025-06-30 12:01:32 +02:00
@@ -74,11 +74,11 @@ class SmolVLAConfig(PreTrainedConfig):
     # Finetuning settings
     freeze_vision_encoder: bool = True
-    train_expert_only: bool = True
+    train_expert_only: bool = False
     train_state_proj: bool = True
     # Training presets
-    optimizer_lr: float = 1e-4
+    optimizer_lr: float = 2.5e-5  # 1e-4
     optimizer_betas: tuple[float, float] = (0.9, 0.95)
     optimizer_eps: float = 1e-8
     optimizer_weight_decay: float = 1e-10
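
A minimal sketch of how these finetuning flags and optimizer presets could be wired together, assuming `torch` and hypothetical `model.vision_encoder` / `model.expert` submodule names (the actual SmolVLA attribute paths may differ):

```python
import torch


def build_optimizer(model, cfg):
    # Freeze the vision encoder when requested.
    if cfg.freeze_vision_encoder:
        for p in model.vision_encoder.parameters():  # assumed attribute name
            p.requires_grad = False
    # Optionally train only the action expert (turned off by this commit).
    if cfg.train_expert_only:
        for p in model.parameters():
            p.requires_grad = False
        for p in model.expert.parameters():  # assumed attribute name
            p.requires_grad = True
    return torch.optim.AdamW(
        (p for p in model.parameters() if p.requires_grad),
        lr=cfg.optimizer_lr,  # 2.5e-5 after this commit
        betas=cfg.optimizer_betas,  # (0.9, 0.95)
        eps=cfg.optimizer_eps,  # 1e-8
        weight_decay=cfg.optimizer_weight_decay,  # 1e-10
    )
```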
@@ -105,6 +105,7 @@ class SmolVLAConfig(PreTrainedConfig):
     num_expert_layers: int = -1  # <= 0 (the default) gives the action expert the same number of layers as the VLM; otherwise the expert has fewer layers.
     num_vlm_layers: int = 16
+    past_obs_keys: str = "image"
     add_local_special_image_tokens: bool = False
     reverse_images_order: bool = False
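
Per the comment on num_expert_layers, a non-positive value means the expert matches the VLM depth. A hedged sketch of that resolution logic (the function name is illustrative, not from the repo):

```python
def resolve_expert_layers(cfg) -> int:
    # num_expert_layers <= 0: the action expert mirrors the VLM depth
    # (num_vlm_layers = 16 here); positive values give a shallower expert.
    if cfg.num_expert_layers <= 0:
        return cfg.num_vlm_layers
    return min(cfg.num_expert_layers, cfg.num_vlm_layers)
```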
@@ -115,7 +116,7 @@ class SmolVLAConfig(PreTrainedConfig):
     causal_action_attention_mask: bool = False
-    self_attn_every_n_layers: int = -1  # Number of layers used in the VLM (first num_vlm_layers layers)
+    self_attn_every_n_layers: int = 2  # Interleave self-attention layers every self_attn_every_n_layers layers
     expert_width_multiplier: float = 0.75  # The action expert hidden size, relative to the VLM's
     min_period: float = 4e-3  # sensitivity range for the timestep used in sine-cosine positional encoding
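
With self_attn_every_n_layers = 2, layer types alternate by index. One plausible selection rule, sketched under the assumption that non-matching layers use cross-attention against the VLM prefix (the real SmolVLA logic may differ):

```python
def uses_self_attention(layer_idx: int, every_n: int) -> bool:
    # Every every_n-th layer runs self-attention; the remaining layers run
    # cross-attention between the action expert and the VLM prefix.
    return every_n > 0 and layer_idx % every_n == 0
```

On the same reading, expert_width_multiplier = 0.75 would make the expert hidden size int(0.75 * vlm_hidden_size).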
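
min_period bounds the highest-frequency component of the sine-cosine timestep encoding. A self-contained sketch of one standard formulation; the max_period of 4.0 is an assumption, not a value from this diff:

```python
import math

import torch


def sinusoidal_time_embedding(
    t: torch.Tensor, dim: int, min_period: float = 4e-3, max_period: float = 4.0
) -> torch.Tensor:
    # Log-spaced periods between min_period and max_period; t is a 1-D
    # batch of diffusion/flow timesteps, typically in [0, 1].
    fraction = torch.linspace(0.0, 1.0, dim // 2, device=t.device)
    period = min_period * (max_period / min_period) ** fraction
    angle = 2 * math.pi * t[:, None] / period[None, :]
    return torch.cat([torch.sin(angle), torch.cos(angle)], dim=-1)
```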