Merge branch 'train-smolvla' into add-multitraining

:wq a
2026-07-24 18:26:11 +00:00 · 2025-09-04 14:32:06 +02:00
parent b7522da85d 61e55830da
commit 1ba896598e
3 changed files with 106 additions and 4 deletions
@@ -0,0 +1,90 @@
 #!/bin/bash
 # SmolVLA training launcher.
 #
 # Runs src/lerobot/scripts/train.py from ~/lerobot using the values in the
 # CONFIG section below. Caches, temp files and the output directory are all
 # placed under $RAID, and the Hugging Face / W&B clients are put in offline
 # mode, so the dataset and the VLM weights must already be on disk.
 #
 # WARNING: an existing output directory with the same derived name is deleted
 # before training starts.
 set -euo pipefail

 # repo/env
 cd ~/lerobot || { echo "error: cannot cd to ~/lerobot" >&2; exit 1; }
 # conda activate lerobot
 export LC_ALL=C
 # Remove leftover core-* files from the repo directory.
 rm -f -- core-*

 # storage / caches (use RAID to avoid filling $HOME)
 RAID=/raid/jade
 export HF_HOME="$RAID/.cache/huggingface"
 export TRANSFORMERS_CACHE="$HF_HOME/transformers"
 export HF_DATASETS_CACHE="$HF_HOME/datasets"
 export HF_LEROBOT_HOME="$HF_HOME/lerobot"
 export WANDB_CACHE_DIR="$RAID/.cache/wandb"
 export TMPDIR="$RAID/.cache/tmp"
 mkdir -p -- "$TMPDIR"

 # No network access: W&B and the Hugging Face clients run offline.
 export WANDB_MODE=offline
 export HF_DATASETS_OFFLINE=1
 export HF_HUB_OFFLINE=1
 export TOKENIZERS_PARALLELISM=false
 export MUJOCO_GL=egl

 # will only use if accelerate is used
 # shellcheck disable=SC2034  # kept for accelerate-based launches; unused here
 PORT=29522

 # =================== CONFIG ===================
 ENV=libero
 TASK=libero_spatial
 REPO_ID=physical-intelligence/libero
 POLICY=smolvla
 # Local copy of HuggingFaceTB/SmolVLM2-2.2B-Instruct. The original command
 # passed --policy.vlm_model_name twice (hub id first, this local path last);
 # only the local path is kept so the flag has a single, unambiguous value.
 # NOTE(review): assumes the CLI parser let the last occurrence win - confirm.
 VLM="$RAID/.cache/huggingface/models/SmolVLM2-2.2B-Instruct"

 # Optim / scheduling
 LR=1e-4
 DECAY_LR=2.5e-6
 DECAY_STEPS=30000
 USE_AMP=false
 TRAIN_EXPERT_ONLY=true
 N_ACTION_STEPS=1
 SEED=1000

 # Training loop
 OFFLINE_STEPS=100000
 BATCH_SIZE=32
 # NOTE(review): presumably 0 disables periodic evaluation - confirm in train.py.
 EVAL_FREQ=0
 # NOTE(review): SAVE_FREQ is larger than OFFLINE_STEPS, so the step counter
 # never reaches it; confirm train.py still writes a final checkpoint.
 SAVE_FREQ=300000
 EVAL_BATCH_SIZE=1
 NUM_EPISODES=1

 # GPU selection 0, 1, 2, 3
 export CUDA_VISIBLE_DEVICES=1

 # naming/output dir ("/" in REPO_ID is replaced by "_")
 TRAIN_DIR="$RAID/logs/lerobot/lerobot_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}"
 echo "Training dir: $TRAIN_DIR"

 # train
 # Start from a clean output directory; ":?" aborts instead of expanding to an
 # empty path if TRAIN_DIR is ever unset or empty.
 rm -rf -- "${TRAIN_DIR:?}"

 # Arguments are collected in an array so every value stays a single word and
 # no backslash continuation can be broken by a stray trailing space.
 train_args=(
   --policy.type="$POLICY"
   --policy.vlm_model_name="$VLM"
   --dataset.repo_id="$REPO_ID"
   --dataset.root="$HF_DATASETS_CACHE"
   --env.type="$ENV"
   --env.task="$TASK"
   --output_dir="$TRAIN_DIR"
   --batch_size="$BATCH_SIZE"
   --steps="$OFFLINE_STEPS"
   --eval_freq="$EVAL_FREQ"
   --save_freq="$SAVE_FREQ"
   --eval.batch_size="$EVAL_BATCH_SIZE"
   --eval.n_episodes="$NUM_EPISODES"
   --policy.use_amp="$USE_AMP"
   --policy.optimizer_lr="$LR"
   --policy.repo_id=None
   --policy.scheduler_decay_lr="$DECAY_LR"
   --policy.scheduler_decay_steps="$DECAY_STEPS"
   --policy.n_action_steps="$N_ACTION_STEPS"
   --policy.train_expert_only="$TRAIN_EXPERT_ONLY"
   --seed="$SEED"
   --wandb.enable=false
 )
 python src/lerobot/scripts/train.py "${train_args[@]}"
@@ -63,7 +63,7 @@ import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn
 from transformers import AutoProcessor
-from lerobot.constants import ACTION, OBS_STATE
+from lerobot.constants import ACTION
 from lerobot.policies.normalize import (
    Normalize,
    Unnormalize,
@@ -75,7 +75,8 @@ from lerobot.policies.utils import (
    populate_queues,
 )
 from lerobot.utils.utils import get_safe_dtype
-
+OBS_STATE = 'state'
 ACTION = 'actions'
 # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
 _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
@@ -824,12 +825,21 @@ class VLAFlowMatching(nn.Module):
        pad_masks = torch.cat(pad_masks, dim=1)
        att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
        att_masks = att_masks[None, :].expand(bsize, len(att_masks))
        # added by jade
        seq_len = pad_masks.shape[1]
        if seq_len < self.config.chunk_size:
            embs = pad_tensor(embs, self.config.chunk_size, pad_value=0)
            pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0)
            att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0)
        return embs, pad_masks, att_masks
    def forward(
        self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
    ) -> Tensor:
        """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
        #added by jade
        if actions.ndim == 2:
            actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1)
        if noise is None:
            noise = self.sample_noise(actions.shape, actions.device)
@@ -857,7 +867,8 @@ class VLAFlowMatching(nn.Module):
            use_cache=False,
            fill_kv_cache=False,
        )
-        suffix_out = suffix_out[:, -self.config.chunk_size :]
+        # suffix_out = suffix_out[:, -self.config.chunk_size :]
        suffix_out = suffix_out[:, -self.config.chunk_size:, :]
        # Original openpi code, upcast attention output
        suffix_out = suffix_out.to(dtype=torch.float32)
        v_t = self.action_out_proj(suffix_out)
@@ -77,7 +77,8 @@ class SmolVLMWithExpertModel(nn.Module):
            self.vlm = AutoModelForImageTextToText.from_pretrained(
                model_id,
                device_map="auto",
-                torch_dtype="bfloat16",
+                # torch_dtype="bfloat16",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
            )
            config = self.vlm.config