#!/bin/bash
#SBATCH --job-name=smolvla2-hirobot
#SBATCH --partition=hopper-prod
#SBATCH --qos=high
#SBATCH --time=48:00:00
#SBATCH --ntasks=1
#SBATCH --gpus-per-task=8

# SmolVLA2 training on an annotated dataset, with image augmentation
# and per-component prompt dropout enabled — the two regularisers
# that move the model away from the "text_loss=6e-6 memorised one
# epoch worth of frames" failure mode toward "learns concepts, not
# pixels".
#
# What the regularisers do:
#
#   * --dataset.image_transforms.enable=true: applies torchvision
#     v2 ColorJitter (brightness/contrast/saturation/hue),
#     SharpnessJitter and RandomAffine per frame at training time.
#     Set max_num_transforms to control how many are sampled per
#     frame; defaults to 3 of the 6.
#   * --policy.{plan,memory,subtask}_dropout_prob: at training time,
#     randomly drop the context messages that carry the named
#     binding so the model is forced to handle missing/stale context.
#     Mirrors Pi0.7's prompt-component dropout (§V.E).
#
# Expected effect: text_loss plateaus higher (~0.5-2.0 instead of
# ~1e-5) and the model handles slight prompt/scene drift at
# inference instead of collapsing to memorised fragments.
# Abort on the first failing command, on any use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Work from the LeRobot root directory (override with LEROBOT_ROOT); the
# recipe path passed to the trainer below is relative.
cd "${LEROBOT_ROOT:-$HOME/lerobot}"

# Put the miniconda and user-local binaries/libraries first on the search
# paths. ${LD_LIBRARY_PATH:-} keeps `set -u` happy when it is not yet set.
export PATH="$HOME/miniconda3/bin:$HOME/.local/bin:$PATH"
export LD_LIBRARY_PATH="$HOME/miniconda3/lib:${LD_LIBRARY_PATH:-}"

# Timeouts for NCCL, Hugging Face Hub downloads and W&B init. Each keeps a
# value already present in the environment and otherwise uses the default.
export NCCL_TIMEOUT="${NCCL_TIMEOUT:-1800}"
export HF_HUB_DOWNLOAD_TIMEOUT="${HF_HUB_DOWNLOAD_TIMEOUT:-120}"
export WANDB_INIT_TIMEOUT="${WANDB_INIT_TIMEOUT:-300}"

# Run configuration. Every value can be overridden from the environment.
DATASET="${DATASET:-pepijn223/super_poulain_full_tool3}"
POLICY_REPO_ID="${POLICY_REPO_ID:-pepijn223/smolvla2_hirobot_super_poulain_tool4}"
JOB_NAME="${JOB_NAME:-smolvla2-hirobot-super-poulain-tool4}"
NUM_PROCESSES="${NUM_PROCESSES:-8}"
BATCH_SIZE="${BATCH_SIZE:-32}"
STEPS="${STEPS:-10000}"

# Name the output directory after the Slurm job id when there is one, and
# after a timestamp otherwise.
RUN_ID="${SLURM_JOB_ID:-$(date +%Y%m%d_%H%M%S)}"
OUTPUT_DIR="${OUTPUT_DIR:-/fsx/pepijn/outputs/train/smolvla2_hirobot_${RUN_ID}}"

# Print a summary of the run before launching.
echo "Training smolvla2 on $DATASET"
echo " GPUs: $NUM_PROCESSES"
echo " batch: $BATCH_SIZE / GPU (global=$((NUM_PROCESSES * BATCH_SIZE)))"
echo " steps: $STEPS"
echo " output: $OUTPUT_DIR"
echo " augmentation: image_transforms ON, prompt dropout {plan:0.15 memory:0.15 subtask:0.20}"

# Trainer flags are collected in an array so that each one stays a separate,
# correctly quoted word and no line-continuation backslashes are needed.
train_args=(
  # Policy type and recipe.
  --policy.type=smolvla2
  --policy.recipe_path=recipes/smolvla2_hirobot.yaml

  # Dataset source and decoding backend.
  --dataset.repo_id="$DATASET"
  --dataset.revision=main
  --dataset.video_backend=pyav

  # Regulariser 1: image augmentation.
  --dataset.image_transforms.enable=true
  --dataset.image_transforms.max_num_transforms=3
  --dataset.image_transforms.random_order=true

  # Regulariser 2: per-component prompt dropout. Keep these values in sync
  # with the summary line echoed above.
  --policy.plan_dropout_prob=0.15
  --policy.memory_dropout_prob=0.15
  --policy.subtask_dropout_prob=0.20

  # Output location and naming.
  --output_dir="$OUTPUT_DIR"
  --job_name="$JOB_NAME"
  --policy.repo_id="$POLICY_REPO_ID"

  # Policy runtime settings.
  --policy.compile_model=false
  --policy.device=cuda
  --policy.tokenizer_max_length=512

  # Schedule: the scheduler decay length is tied to the total step count.
  --steps="$STEPS"
  --policy.scheduler_decay_steps="$STEPS"
  --batch_size="$BATCH_SIZE"

  # Logging and checkpointing.
  --wandb.enable=true
  --wandb.disable_artifact=true
  --wandb.project=hirobot
  --log_freq=100
  --save_freq=1000

  # NOTE(review): 0 dataloader workers is kept from the original; confirm it
  # is intentional rather than a debugging leftover.
  --num_workers=0
)

# Start one trainer process per GPU through Accelerate.
accelerate launch --multi_gpu --num_processes="$NUM_PROCESSES" \
  -m lerobot.scripts.lerobot_train "${train_args[@]}"