mirror of
https://github.com/huggingface/lerobot.git
synced 2026-05-23 12:40:08 +00:00
chore(training): align smolvla2_hirobot.slurm with what's actually run
Match the operator's current training command for the _tool6 retrain:
* DATASET / POLICY_REPO_ID / JOB_NAME defaults point at the tool6
  iteration (dataset super_poulain_full_tool3 → policy smolvla2_hirobot_super_poulain_tool6)
* STEPS defaults to 2000 (short enough to iterate; bump to 10k for a full run)
* save_freq=$STEPS so the only checkpoint is the final one
* OUTPUT_DIR includes the step count so successive runs don't clobber each other
* Drop the wider augmentation envelope I added earlier — back to
default ColorJitter ranges (brightness ±20% etc) since the
high_level_subtask recipe fix (current-subtask supervision) is
expected to fix the LM-head collapse on its own; the augmentation
is just the standard regulariser, not a load-bearing widener.
* prompt-dropout fractions stay at the original 0.15 / 0.15 / 0.20.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6,27 +6,23 @@
|
|||||||
#SBATCH --ntasks=1
|
#SBATCH --ntasks=1
|
||||||
#SBATCH --gpus-per-task=8
|
#SBATCH --gpus-per-task=8
|
||||||
|
|
||||||
# SmolVLA2 training on an annotated dataset, with image augmentation
|
# SmolVLA2 training on an annotated dataset.
|
||||||
# and per-component prompt dropout enabled — the two regularisers
|
|
||||||
# that move the model away from the "text_loss=6e-6 memorised one
|
|
||||||
# epoch worth of frames" failure mode toward "learns concepts, not
|
|
||||||
# pixels".
|
|
||||||
#
|
#
|
||||||
# What the regularisers do:
|
# The high_level_subtask recipe (recipes/smolvla2_hirobot.yaml) was
|
||||||
|
# fixed in PR3 to supervise the LM head with the *current* active
|
||||||
|
# subtask span at every frame, not the next-span target which is
|
||||||
|
# empty on stable phases. With the old recipe the head learned to
|
||||||
|
# emit ``\n`` on every chunk boundary; the new one supervises a
|
||||||
|
# real, scene-grounded string at every frame.
|
||||||
#
|
#
|
||||||
# * --dataset.image_transforms.enable=true: applies torchvision
|
# Two regularisers are still on:
|
||||||
# v2 ColorJitter (brightness/contrast/saturation/hue),
|
|
||||||
# SharpnessJitter and RandomAffine per frame at training time.
|
|
||||||
# Set max_num_transforms to control how many are sampled per
|
|
||||||
# frame; defaults to 3 of the 6.
|
|
||||||
# * --policy.plan_dropout_prob / memory / subtask: at training,
|
|
||||||
# randomly drop the context messages that carry the named
|
|
||||||
# binding so the model is forced to handle missing/stale context.
|
|
||||||
# Mirrors Pi0.7's prompt-component dropout (§V.E).
|
|
||||||
#
|
#
|
||||||
# Expected effect: text_loss plateaus higher (~0.5-2.0 instead of
|
# * --dataset.image_transforms.enable=true: torchvision-v2
|
||||||
# ~1e-5) and the model handles slight prompt/scene drift at
|
# ColorJitter + SharpnessJitter + RandomAffine per frame; default
|
||||||
# inference instead of collapsing to memorised fragments.
|
# envelope (brightness ±20% etc).
|
||||||
|
# * --policy.{plan,memory,subtask}_dropout_prob: randomly drop the
|
||||||
|
# context messages carrying the named recipe binding so the model
|
||||||
|
# handles missing/stale context. Mirrors Pi0.7 §V.E.
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
@@ -39,20 +35,20 @@ export HF_HUB_DOWNLOAD_TIMEOUT="${HF_HUB_DOWNLOAD_TIMEOUT:-120}"
|
|||||||
export WANDB_INIT_TIMEOUT="${WANDB_INIT_TIMEOUT:-300}"
|
export WANDB_INIT_TIMEOUT="${WANDB_INIT_TIMEOUT:-300}"
|
||||||
|
|
||||||
DATASET="${DATASET:-pepijn223/super_poulain_full_tool3}"
|
DATASET="${DATASET:-pepijn223/super_poulain_full_tool3}"
|
||||||
POLICY_REPO_ID="${POLICY_REPO_ID:-pepijn223/smolvla2_hirobot_super_poulain_tool4}"
|
POLICY_REPO_ID="${POLICY_REPO_ID:-pepijn223/smolvla2_hirobot_super_poulain_tool6}"
|
||||||
JOB_NAME="${JOB_NAME:-smolvla2-hirobot-super-poulain-tool4}"
|
JOB_NAME="${JOB_NAME:-smolvla2-hirobot-super-poulain-tool6}"
|
||||||
NUM_PROCESSES="${NUM_PROCESSES:-8}"
|
NUM_PROCESSES="${NUM_PROCESSES:-8}"
|
||||||
BATCH_SIZE="${BATCH_SIZE:-32}"
|
BATCH_SIZE="${BATCH_SIZE:-32}"
|
||||||
STEPS="${STEPS:-10000}"
|
STEPS="${STEPS:-2000}"
|
||||||
RUN_ID="${SLURM_JOB_ID:-$(date +%Y%m%d_%H%M%S)}"
|
RUN_ID="${SLURM_JOB_ID:-$(date +%Y%m%d_%H%M%S)}"
|
||||||
OUTPUT_DIR="${OUTPUT_DIR:-/fsx/pepijn/outputs/train/smolvla2_hirobot_${RUN_ID}}"
|
OUTPUT_DIR="${OUTPUT_DIR:-/fsx/pepijn/outputs/train/smolvla2_hirobot_super_poulain_tool3_${STEPS}_${RUN_ID}}"
|
||||||
|
|
||||||
echo "Training smolvla2 on $DATASET"
|
echo "Training smolvla2 on $DATASET"
|
||||||
echo " GPUs: $NUM_PROCESSES"
|
echo " GPUs: $NUM_PROCESSES"
|
||||||
echo " batch: $BATCH_SIZE / GPU (global=$((NUM_PROCESSES * BATCH_SIZE)))"
|
echo " batch: $BATCH_SIZE / GPU (global=$((NUM_PROCESSES * BATCH_SIZE)))"
|
||||||
echo " steps: $STEPS"
|
echo " steps: $STEPS"
|
||||||
echo " output: $OUTPUT_DIR"
|
echo " output: $OUTPUT_DIR"
|
||||||
echo " augmentation: image_transforms ON (wide), prompt dropout {plan:0.20 memory:0.20 subtask:0.30}"
|
echo " augmentation: image_transforms ON, prompt dropout {plan:0.15 memory:0.15 subtask:0.20}"
|
||||||
|
|
||||||
accelerate launch --multi_gpu --num_processes="$NUM_PROCESSES" \
|
accelerate launch --multi_gpu --num_processes="$NUM_PROCESSES" \
|
||||||
-m lerobot.scripts.lerobot_train \
|
-m lerobot.scripts.lerobot_train \
|
||||||
@@ -61,17 +57,6 @@ accelerate launch --multi_gpu --num_processes="$NUM_PROCESSES" \
|
|||||||
--dataset.repo_id="$DATASET" \
|
--dataset.repo_id="$DATASET" \
|
||||||
--dataset.revision=main \
|
--dataset.revision=main \
|
||||||
--dataset.video_backend=pyav \
|
--dataset.video_backend=pyav \
|
||||||
--dataset.image_transforms.enable=true \
|
|
||||||
--dataset.image_transforms.max_num_transforms=4 \
|
|
||||||
--dataset.image_transforms.random_order=true \
|
|
||||||
--dataset.image_transforms.tfs.brightness.kwargs='{"brightness": [0.5, 1.6]}' \
|
|
||||||
--dataset.image_transforms.tfs.contrast.kwargs='{"contrast": [0.6, 1.5]}' \
|
|
||||||
--dataset.image_transforms.tfs.saturation.kwargs='{"saturation": [0.3, 1.7]}' \
|
|
||||||
--dataset.image_transforms.tfs.hue.kwargs='{"hue": [-0.1, 0.1]}' \
|
|
||||||
--dataset.image_transforms.tfs.affine.kwargs='{"degrees": [-15.0, 15.0], "translate": [0.15, 0.15]}' \
|
|
||||||
--policy.plan_dropout_prob=0.20 \
|
|
||||||
--policy.memory_dropout_prob=0.20 \
|
|
||||||
--policy.subtask_dropout_prob=0.30 \
|
|
||||||
--output_dir="$OUTPUT_DIR" \
|
--output_dir="$OUTPUT_DIR" \
|
||||||
--job_name="$JOB_NAME" \
|
--job_name="$JOB_NAME" \
|
||||||
--policy.repo_id="$POLICY_REPO_ID" \
|
--policy.repo_id="$POLICY_REPO_ID" \
|
||||||
@@ -85,5 +70,11 @@ accelerate launch --multi_gpu --num_processes="$NUM_PROCESSES" \
|
|||||||
--wandb.disable_artifact=true \
|
--wandb.disable_artifact=true \
|
||||||
--wandb.project=hirobot \
|
--wandb.project=hirobot \
|
||||||
--log_freq=100 \
|
--log_freq=100 \
|
||||||
--save_freq=1000 \
|
--save_freq="$STEPS" \
|
||||||
--num_workers=0
|
--num_workers=0 \
|
||||||
|
--dataset.image_transforms.enable=true \
|
||||||
|
--dataset.image_transforms.max_num_transforms=3 \
|
||||||
|
--dataset.image_transforms.random_order=true \
|
||||||
|
--policy.plan_dropout_prob=0.15 \
|
||||||
|
--policy.memory_dropout_prob=0.15 \
|
||||||
|
--policy.subtask_dropout_prob=0.20
|
||||||
|
|||||||
Reference in New Issue
Block a user