diff --git a/lerobot/common/policies/smolvla2/configuration_smolvla2.py b/lerobot/common/policies/smolvla2/configuration_smolvla2.py
index 96c3886d4..96f5f4a73 100644
--- a/lerobot/common/policies/smolvla2/configuration_smolvla2.py
+++ b/lerobot/common/policies/smolvla2/configuration_smolvla2.py
@@ -74,11 +74,11 @@ class SmolVLAConfig(PreTrainedConfig):
     # Finetuning settings
     freeze_vision_encoder: bool = True
-    train_expert_only: bool = True
+    train_expert_only: bool = False
     train_state_proj: bool = True

     # Training presets
-    optimizer_lr: float = 1e-4
+    optimizer_lr: float = 2.5e-5  # was 1e-4
     optimizer_betas: tuple[float, float] = (0.9, 0.95)
     optimizer_eps: float = 1e-8
     optimizer_weight_decay: float = 1e-10
@@ -105,6 +105,7 @@ class SmolVLAConfig(PreTrainedConfig):
     num_expert_layers: int = -1  # Less than or equal to 0 is the default, where the action expert has the same number of layers as the VLM. Otherwise the expert has fewer layers.
     num_vlm_layers: int = 16
+    past_obs_keys: str = "image"
     add_local_special_image_tokens: bool = False
     reverse_images_order: bool = False
@@ -115,7 +116,7 @@ class SmolVLAConfig(PreTrainedConfig):
     causal_action_attention_mask: bool = False
     self_attn_every_n_layers: int = -1  # Interleave SA layers each self_attn_every_n_layers
-    self_attn_every_n_layers: int = 2  # Interleave SA layers each self_attn_every_n_layers
+    # self_attn_every_n_layers: int = 2  # Interleave SA layers each self_attn_every_n_layers
     expert_width_multiplier: float = 0.75  # The action expert hidden size (wrt the VLM)
     min_period: float = 4e-3  # sensitivity range for the timestep used in sine-cosine positional encoding