From 61e55830dadc2bbaa56feed5306c5d0d1282bad0 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Thu, 4 Sep 2025 12:12:10 +0200 Subject: [PATCH 01/12] add train --- examples/7_train_libero_smolvla.sh | 90 +++++++++++++++++++ .../policies/smolvla/modeling_smolvla.py | 17 +++- .../policies/smolvla/smolvlm_with_expert.py | 3 +- 3 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 examples/7_train_libero_smolvla.sh diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh new file mode 100644 index 000000000..3943e3c96 --- /dev/null +++ b/examples/7_train_libero_smolvla.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# smolvla training + +set -euo pipefail + +# repo/env +cd ~/lerobot || exit 1 +# conda activate lerobot +export LC_ALL=C + + +rm -f core-* + +# storage / caches (use RAID to avoid filling $HOME) +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl + +# will only use if accelerate is used +PORT=29522 + +# =================== CONFIG =================== +ENV=libero +TASK=libero_spatial +REPO_ID=physical-intelligence/libero + +POLICY=smolvla +VLM=HuggingFaceTB/SmolVLM2-2.2B-Instruct + +# Optim / scheduling +LR=1e-4 +DECAY_LR=2.5e-6 +DECAY_STEPS=30000 +USE_AMP=false +TRAIN_EXPERT_ONLY=true +N_ACTION_STEPS=1 +SEED=1000 + +# Training loop +OFFLINE_STEPS=100000 +BATCH_SIZE=32 +EVAL_FREQ=0 +SAVE_FREQ=300000 +EVAL_BATCH_SIZE=1 +NUM_EPISODES=1 + +# GPU selection 0, 1, 2, 3 +export CUDA_VISIBLE_DEVICES=1 + +# naming/output dir +TRAIN_DIR=$RAID/logs/lerobot/lerobot_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +echo "Training dir: $TRAIN_DIR" + +# train +rm -rf "$TRAIN_DIR" + +python src/lerobot/scripts/train.py \ + --policy.type=$POLICY \ + --policy.vlm_model_name=$VLM \ + --dataset.repo_id=$REPO_ID \ + --dataset.root=$HF_DATASETS_CACHE \ + --env.type=$ENV \ + --env.task=$TASK \ + --output_dir=$TRAIN_DIR \ + --batch_size=$BATCH_SIZE \ + --steps=$OFFLINE_STEPS \ + --eval_freq=$EVAL_FREQ \ + --save_freq=$SAVE_FREQ \ + --eval.batch_size=$EVAL_BATCH_SIZE \ + --eval.n_episodes=$NUM_EPISODES \ + --policy.use_amp=$USE_AMP \ + --policy.optimizer_lr=$LR \ + --policy.repo_id=None \ + --policy.scheduler_decay_lr=$DECAY_LR \ + --policy.scheduler_decay_steps=$DECAY_STEPS \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ + --policy.vlm_model_name=/raid/jade/.cache/huggingface/models/SmolVLM2-2.2B-Instruct \ + --seed=$SEED \ + --wandb.enable=false diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 18f2fc58a..95ed993d2 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -63,7 +63,7 @@ import torch.nn.functional as F # noqa: N812 from torch import Tensor, nn from transformers import AutoProcessor -from lerobot.constants import ACTION, OBS_STATE +from lerobot.constants import ACTION from lerobot.policies.normalize import ( Normalize, Unnormalize, @@ -75,7 +75,8 @@ from lerobot.policies.utils import ( populate_queues, ) from lerobot.utils.utils import get_safe_dtype - +OBS_STATE = 'state' +ACTION = 'actions' # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") @@ -824,12 +825,21 @@ class VLAFlowMatching(nn.Module): pad_masks = torch.cat(pad_masks, dim=1) att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) att_masks = att_masks[None, :].expand(bsize, len(att_masks)) + # added by jade + seq_len = pad_masks.shape[1] + if seq_len < self.config.chunk_size: + embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) + pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) + att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) return embs, pad_masks, att_masks def forward( self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None ) -> Tensor: """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" + #added by jade + if actions.ndim == 2: + actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) if noise is None: noise = self.sample_noise(actions.shape, actions.device) @@ -857,7 +867,8 @@ class VLAFlowMatching(nn.Module): use_cache=False, fill_kv_cache=False, ) - suffix_out = suffix_out[:, -self.config.chunk_size :] + # suffix_out = suffix_out[:, -self.config.chunk_size :] + suffix_out = suffix_out[:, -self.config.chunk_size:, :] # Original openpi code, upcast attention output suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py index f3d1a693a..f6a49dccf 100644 --- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py @@ -77,7 +77,8 @@ class SmolVLMWithExpertModel(nn.Module): self.vlm = AutoModelForImageTextToText.from_pretrained( model_id, device_map="auto", - torch_dtype="bfloat16", + # torch_dtype="bfloat16", + torch_dtype=torch.float16, low_cpu_mem_usage=True, ) config = self.vlm.config From 5c628f17004ac80c7f09892e908819f7eadf612c Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Wed, 10 Sep 2025 11:32:54 +0200 Subject: [PATCH 02/12] new things --- examples/6_evaluate_libero.sh | 49 +- examples/6_evaluate_libero_2.sh | 76 + examples/7_train_acc.sh | 93 + examples/7_train_libero_smolvla.sh | 17 +- examples/8_train_smolvla_must.sh | 141 ++ examples/checker.py | 27 + examples/checker2.py | 35 + examples/convert_data.py | 253 +++ examples/convert_libero.py | 126 ++ examples/evaluate_libero.py | 255 +++ examples/requirements.in | 8 + examples/script2.py | 70 + examples/script3.py | 64 + examples/script4.py | 3 + log_text.txt | 1765 +++++++++++++++ src/lerobot/configs/policies.py | 1 + src/lerobot/datasets/lerobot_dataset.py | 2 +- src/lerobot/datasets/utils.py | 3 +- src/lerobot/policies/factory.py | 8 +- src/lerobot/policies/normalize.py | 161 ++ .../policies/smolpi0/configuration_smolpi0.py | 210 ++ .../policies/smolpi0/flex_attention.py | 145 ++ .../policies/smolpi0/modeling_smolpi0.py | 1021 +++++++++ .../policies/smolpi0/smolvlm_with_expert.py | 824 +++++++ .../policies/smolvla/modeling_smolvla.py | 971 +++++++- .../policies/smolvla/modeling_smolvla_v2.py | 0 src/lerobot/policies/smolvla/saver.txt | 1 + .../policies/smolvla/smolvlm_with_expert.py | 6 +- src/lerobot/scripts/eval.py | 44 +- src/lerobot/scripts/train.py | 29 +- src/lerobot/scripts/train_2.py | 343 +++ src/lerobot/scripts/train_accelerate.py | 365 +++ tmux_log.txt | 2008 +++++++++++++++++ 33 files changed, 9085 insertions(+), 39 deletions(-) create mode 100644 examples/6_evaluate_libero_2.sh create mode 100644 examples/7_train_acc.sh create mode 100644 examples/8_train_smolvla_must.sh create mode 100644 examples/checker.py create mode 100644 examples/checker2.py create mode 100644 examples/convert_data.py create mode 100644 examples/convert_libero.py create mode 100644 examples/evaluate_libero.py create mode 100644 examples/requirements.in create mode 100644 examples/script2.py create mode 100644 examples/script3.py create mode 100644 examples/script4.py create mode 100644 log_text.txt create mode 100644 src/lerobot/policies/smolpi0/configuration_smolpi0.py create mode 100644 src/lerobot/policies/smolpi0/flex_attention.py create mode 100644 src/lerobot/policies/smolpi0/modeling_smolpi0.py create mode 100644 src/lerobot/policies/smolpi0/smolvlm_with_expert.py create mode 100644 src/lerobot/policies/smolvla/modeling_smolvla_v2.py create mode 100644 src/lerobot/policies/smolvla/saver.txt create mode 100644 src/lerobot/scripts/train_2.py create mode 100644 src/lerobot/scripts/train_accelerate.py create mode 100644 tmux_log.txt diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh index 36f8c6473..ad6ca0f13 100644 --- a/examples/6_evaluate_libero.sh +++ b/examples/6_evaluate_libero.sh @@ -1,14 +1,45 @@ #!/bin/bash -unset LEROBOT_HOME -unset HF_LEROBOT_HOME +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export CUDA_VISIBLE_DEVICES=3 + # CONFIGURATION -POLICY_PATH="bicmol/smolvla-libero" +POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" +POLICY_PATH="/raid/jade/models/smolvlamust" TASK=libero_spatial ENV_TYPE="libero" -BATCH_SIZE=1 -N_EPISODES=1 +BATCH_SIZE=10 +N_EPISODES=10 +# storage / caches +RAID=/raid/jade +N_ACTION_STEPS=1 +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false export MUJOCO_GL=egl +export MUJOCO_GL=egl +unset HF_HUB_OFFLINE # RUN EVALUATION python src/lerobot/scripts/eval.py \ --policy.path="$POLICY_PATH" \ @@ -17,3 +48,11 @@ python src/lerobot/scripts/eval.py \ --eval.n_episodes="$N_EPISODES" \ --env.multitask_eval=False \ --env.task=$TASK \ +# python examples/evaluate_libero.py \ +# --policy_path "$POLICY_PATH" \ +# --task_suite_name "$TASK" \ +# --num_steps_wait 10 \ +# --num_trials_per_task 10 \ +# --video_out_path "data/libero/videos" \ +# --device "cuda" \ +# --seed 7 \ No newline at end of file diff --git a/examples/6_evaluate_libero_2.sh b/examples/6_evaluate_libero_2.sh new file mode 100644 index 000000000..9d05c0330 --- /dev/null +++ b/examples/6_evaluate_libero_2.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export CUDA_VISIBLE_DEVICES=3 + +# CONFIGURATION +POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" +POLICY_PATH="AustineJohnBreaker/smolvla_stratch_libero_spatial" +TASK=libero_spatial +ENV_TYPE="libero" +BATCH_SIZE=10 +N_EPISODES=10 +USE_AMP=false +N_ACTION_STEPS=1 +SELF_ATTN_EVERY_N_LAYERS=2 +VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct +PAD_LANG_TO=longest +LOAD_VLM_WEIGHTS=true +NUM_VLM_LAYERS=16 +CHUNK_SIZE=50 +N_OBS_STEPS=1 +NUM_EXPERT_LAYERS=0 +EXPERT_WIDTH_MULTIPLIER=0.5 + + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export MUJOCO_GL=egl +ADD_IMAGE_TOKENS=true +unset HF_HUB_OFFLINE +# RUN EVALUATION +python src/lerobot/scripts/eval.py \ + --policy.path="$POLICY_PATH" \ + --env.type="$ENV_TYPE" \ + --eval.batch_size="$BATCH_SIZE" \ + --eval.n_episodes="$N_EPISODES" \ + --env.multitask_eval=False \ + --env.task=$TASK \ + --policy.use_amp=$USE_AMP \ + --policy.n_action_steps=$N_ACTION_STEPS \ + # --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ + --policy.attention_mode=$ATTN_MODE \ + --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --policy.vlm_model_name=$VLM_NAME \ + --policy.pad_language_to=$PAD_LANG_TO \ + --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ + --policy.num_vlm_layers=$NUM_VLM_LAYERS \ + --policy.chunk_size=$CHUNK_SIZE \ + --policy.n_obs_steps=$N_OBS_STEPS \ + --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ + --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ diff --git a/examples/7_train_acc.sh b/examples/7_train_acc.sh new file mode 100644 index 000000000..27f445143 --- /dev/null +++ b/examples/7_train_acc.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# smolvla training with accelerate + +set -euo pipefail + +# repo/env +cd ~/lerobot || exit 1 +# conda activate lerobot +export LC_ALL=C + +rm -f core-* + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl + +# CONFIG +ENV=libero +TASK=libero_spatial +REPO_ID=physical-intelligence/libero + +POLICY=smolvla +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct + +# Optim / scheduling +LR=1e-4 +DECAY_LR=2.5e-6 +DECAY_STEPS=30000 +USE_AMP=true # set to true for mixed precision +TRAIN_EXPERT_ONLY=true +N_ACTION_STEPS=1 +SEED=1000 + +# Training loop +OFFLINE_STEPS=100000 +BATCH_SIZE=32 +EVAL_FREQ=0 +SAVE_FREQ=20000 +EVAL_BATCH_SIZE=1 +NUM_EPISODES=1 + +# number of gpus to use +NUM_PROCESSES=2 +export CUDA_VISIBLE_DEVICES=1,3 +PORT=29522 + +# naming/output dir +TRAIN_DIR=$RAID/logs/lerobot/lerobot_2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +echo "Training dir: $TRAIN_DIR" + +rm -rf "$TRAIN_DIR" + +# RUN +python -m accelerate.commands.launch \ + --num_processes $NUM_PROCESSES \ + --num_machines 1 \ + --main_process_port $PORT \ + --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ + src/lerobot/scripts/train_accelerate.py \ + --policy.type=$POLICY \ + --policy.use_amp=True \ + --policy.vlm_model_name=$VLM \ + --dataset.repo_id=$REPO_ID \ + --dataset.root=$HF_DATASETS_CACHE \ + --env.type=$ENV \ + --env.task=$TASK \ + --output_dir=$TRAIN_DIR \ + --batch_size=$BATCH_SIZE \ + --steps=$OFFLINE_STEPS \ + --eval_freq=$EVAL_FREQ \ + --save_freq=$SAVE_FREQ \ + --eval.batch_size=$EVAL_BATCH_SIZE \ + --eval.n_episodes=$NUM_EPISODES \ + --policy.optimizer_lr=$LR \ + --policy.repo_id=None \ + --policy.scheduler_decay_lr=$DECAY_LR \ + --policy.scheduler_decay_steps=$DECAY_STEPS \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ + --policy.vlm_model_name=$VLM \ + --seed=$SEED \ + --wandb.enable=false diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh index 3943e3c96..f0b9de4e5 100644 --- a/examples/7_train_libero_smolvla.sh +++ b/examples/7_train_libero_smolvla.sh @@ -21,8 +21,8 @@ export WANDB_CACHE_DIR=$RAID/.cache/wandb export TMPDIR=$RAID/.cache/tmp mkdir -p $TMPDIR export WANDB_MODE=offline -export HF_DATASETS_OFFLINE=1 -export HF_HUB_OFFLINE=1 +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 export TOKENIZERS_PARALLELISM=false export MUJOCO_GL=egl @@ -31,11 +31,11 @@ PORT=29522 # =================== CONFIG =================== ENV=libero -TASK=libero_spatial +TASK=libero_object REPO_ID=physical-intelligence/libero - +ROOT=$RAID POLICY=smolvla -VLM=HuggingFaceTB/SmolVLM2-2.2B-Instruct +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct # Optim / scheduling LR=1e-4 @@ -55,10 +55,10 @@ EVAL_BATCH_SIZE=1 NUM_EPISODES=1 # GPU selection 0, 1, 2, 3 -export CUDA_VISIBLE_DEVICES=1 +export CUDA_VISIBLE_DEVICES=0 # naming/output dir -TRAIN_DIR=$RAID/logs/lerobot/lerobot_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +TRAIN_DIR=$RAID/logs/lerobot/lerobot_solo_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} echo "Training dir: $TRAIN_DIR" # train @@ -68,7 +68,6 @@ python src/lerobot/scripts/train.py \ --policy.type=$POLICY \ --policy.vlm_model_name=$VLM \ --dataset.repo_id=$REPO_ID \ - --dataset.root=$HF_DATASETS_CACHE \ --env.type=$ENV \ --env.task=$TASK \ --output_dir=$TRAIN_DIR \ @@ -85,6 +84,6 @@ python src/lerobot/scripts/train.py \ --policy.scheduler_decay_steps=$DECAY_STEPS \ --policy.n_action_steps=$N_ACTION_STEPS \ --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ - --policy.vlm_model_name=/raid/jade/.cache/huggingface/models/SmolVLM2-2.2B-Instruct \ + --policy.vlm_model_name=$VLM \ --seed=$SEED \ --wandb.enable=false diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh new file mode 100644 index 000000000..98029f3ad --- /dev/null +++ b/examples/8_train_smolvla_must.sh @@ -0,0 +1,141 @@ +#!/bin/bash +# smolvla training with accelerate + +set -euo pipefail + +# repo/env +cd ~/lerobot || exit 1 +# conda activate lerobot +export LC_ALL=C + +rm -f core-* + +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +# export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl + +# CONFIG +ENV=libero +TASK=libero_spatial +REPO_ID=HuggingfaceVLA/libero + +POLICY=smolvla +VLM=HuggingFaceTB/SmolVLM2-500M-Instruct + +# Optim / scheduling +LR=1e-4 +DECAY_LR=2.5e-6 +DECAY_STEPS=30000 +USE_AMP=true # set to true for mixed precision +TRAIN_EXPERT_ONLY=true +N_ACTION_STEPS=1 +SEED=1000 +LOAD_VLM_WEIGHTS=true +# Training loop +OFFLINE_STEPS=100000 +BATCH_SIZE=32 +EVAL_FREQ=0 +SAVE_FREQ=20000 +EVAL_BATCH_SIZE=1 +NUM_EPISODES=1 +ADD_IMAGE_TOKENS=tru +N_OBS_STEPS=1 +ATTN_MODE=cross_attn +EXPERT_WIDTH_MULTIPLIER=0.5 +# number of gpus to use +NUM_PROCESSES=2 +NUM_VLM_LAYERS=0 +SELF_ATTN_EVERY_N_LAYERS=2 +CHUNK_SIZE=50 +export CUDA_VISIBLE_DEVICES=0 +PORT=29522 +PREFIX_LENGTH=0 +LOAD_VLM_WEIGHTS=true +# naming/output dir +TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +echo "Training dir: $TRAIN_DIR" + +rm -rf "$TRAIN_DIR" + +# RUN +# python -m accelerate.commands.launch \ +# --num_processes $NUM_PROCESSES \ +# --num_machines 1 \ +# --main_process_port $PORT \ +# --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ +# src/lerobot/scripts/train_accelerate.py \ +# --policy.type=$POLICY \ +# --policy.use_amp=True \ +# --policy.vlm_model_name=$VLM \ +# --dataset.repo_id=$REPO_ID \ +# --dataset.root=$HF_DATASETS_CACHE \ +# --env.type=$ENV \ +# --env.task=$TASK \ +# --output_dir=$TRAIN_DIR \ +# --batch_size=$BATCH_SIZE \ +# --steps=$OFFLINE_STEPS \ +# --eval_freq=$EVAL_FREQ \ +# --save_freq=$SAVE_FREQ \ +# --eval.batch_size=$EVAL_BATCH_SIZE \ +# --eval.n_episodes=$NUM_EPISODES \ +# --policy.optimizer_lr=$LR \ +# --policy.repo_id=None \ +# --policy.scheduler_decay_lr=$DECAY_LR \ +# --policy.scheduler_decay_steps=$DECAY_STEPS \ +# --policy.n_action_steps=$N_ACTION_STEPS \ +# --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ +# --policy.vlm_model_name=$VLM \ +# --policy.n_obs_steps=$N_OBS_STEPS \ +# --policy.attention_mode=$ATTN_MODE \ +# --policy.prefix_length=$PREFIX_LENGTH \ +# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ +# --policy.chunk_size=$CHUNK_SIZE \ +# --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ +# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ +# --seed=$SEED \ +# --wandb.enable=false + + +python src/lerobot/scripts/train.py \ + --policy.type=$POLICY \ + --policy.use_amp=False \ + --policy.vlm_model_name=$VLM \ + --dataset.repo_id=$REPO_ID \ + --dataset.root='/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data' \ + --env.type=$ENV \ + --env.task=$TASK \ + --output_dir=$TRAIN_DIR \ + --batch_size=$BATCH_SIZE \ + --steps=$OFFLINE_STEPS \ + --eval_freq=$EVAL_FREQ \ + --save_freq=$SAVE_FREQ \ + --eval.batch_size=$EVAL_BATCH_SIZE \ + --eval.n_episodes=$NUM_EPISODES \ + --policy.optimizer_lr=$LR \ + --policy.repo_id=None \ + --policy.scheduler_decay_lr=$DECAY_LR \ + --policy.scheduler_decay_steps=$DECAY_STEPS \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ + --policy.vlm_model_name=$VLM \ + --policy.n_obs_steps=$N_OBS_STEPS \ + --policy.attention_mode=$ATTN_MODE \ + --policy.prefix_length=$PREFIX_LENGTH \ + --policy.num_vlm_layers=$NUM_VLM_LAYERS \ + --policy.chunk_size=$CHUNK_SIZE \ + --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ + --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ + --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --seed=$SEED \ + --wandb.enable=false diff --git a/examples/checker.py b/examples/checker.py new file mode 100644 index 000000000..12377e9b9 --- /dev/null +++ b/examples/checker.py @@ -0,0 +1,27 @@ +from huggingface_hub import HfApi +api = HfApi() +# api.upload_large_folder( +# repo_id="HuggingFaceVLA/libero", +# repo_type="dataset", +# folder_path="/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero", +# ) +api.upload_large_folder( + repo_id="HuggingFaceVLA/metaworld_mt50", + repo_type="dataset", + folder_path="/raid/jade/.cache/huggingface/lerobot/metaworld_mt50", +) +# repo_id="HuggingFaceVLA/libero" +# # Upload extra files +# api.upload_file( +# repo_id=repo_id, +# repo_type="dataset", +# path_or_fileobj="/raid/jade/libero_converted/README.md", +# path_in_repo="README.md" +# ) + +# api.upload_folder( +# repo_id=repo_id, +# repo_type="dataset", +# folder_path="/raid/jade/libero_converted/meta", +# path_in_repo="meta" +# ) diff --git a/examples/checker2.py b/examples/checker2.py new file mode 100644 index 000000000..a5825d87f --- /dev/null +++ b/examples/checker2.py @@ -0,0 +1,35 @@ +import pyarrow.parquet as pq + +# # First parquet (cached HF version) +meta1 = pq.read_metadata("/raid/jade/.cache/huggingface/datasets/data/chunk-000/episode_000000.parquet") +meta1 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") +print("First parquet key_value_metadata:") +print(meta1.metadata) # low-level file metadata +# print() +print("Second") +# Second parquet (your converted version) +meta2 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") +print("\nSecond parquet key_value_metadata:") +# print(meta2.metadata) + +# from datasets import load_dataset +# root_dir = "/raid/jade/libero_converted" + +# # Load all parquet files under the root_dir recursively +# ds = load_dataset("parquet", data_files=f"{root_dir}/**/*.parquet") + +# print(ds) # prints split info +# print(ds["train"].features) # check schema/features + +# # Peek at one row +# example = ds["train"][0] +# print(example.keys()) +# print(type(example["observation.images.image"])) +# print(type(example["observation.images.image2"])) + +import pyarrow.parquet as pq + +for ep in ["episode_000019.parquet", "episode_000021.parquet", "episode_000026.parquet"]: + path = f"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/{ep}" + schema = pq.read_schema(path) + print(ep, schema.names) diff --git a/examples/convert_data.py b/examples/convert_data.py new file mode 100644 index 000000000..96ce58cb1 --- /dev/null +++ b/examples/convert_data.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +""" +Convert local LeRobot datasets from v2.0 to v2.1 format. +This script adapts the official converter to work with local datasets. +""" + +import sys +import argparse +import logging +from pathlib import Path + +# Add lerobot to path +sys.path.insert(0, '/home/jade_choghari/lerobot/src') + +from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset +from lerobot.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info +from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def convert_local_dataset( + dataset_path: str, + num_workers: int = 4, + skip_if_converted: bool = True +): + """ + Convert a local dataset from v2.0 to v2.1 format. + + Args: + dataset_path: Path to the local dataset directory + num_workers: Number of workers for parallel processing + skip_if_converted: Skip if already has episodes_stats.jsonl + """ + dataset_path = Path(dataset_path) + + print(f"πŸ”„ Converting local dataset: {dataset_path}") + + # Check if already converted + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + if episodes_stats_path.exists() and skip_if_converted: + # Check if file is empty + file_size = episodes_stats_path.stat().st_size + if file_size == 0: + print(f" ⚠️ episodes_stats.jsonl is empty, will regenerate") + else: + # Check if file has content + with open(episodes_stats_path, 'r') as f: + content = f.read().strip() + if not content: + print(f" ⚠️ episodes_stats.jsonl has no content, will regenerate") + else: + print(f" ⏭️ Already has episodes_stats.jsonl, skipping") + return True + + try: + # Check if this is a v2.0 dataset that needs conversion + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + stats_path = dataset_path / "meta" / "stats.json" + + if not episodes_stats_path.exists() and stats_path.exists(): + print(f" πŸ”„ Detected v2.0 dataset, creating temporary episodes_stats.jsonl...") + # Create empty episodes_stats.jsonl to allow loading + episodes_stats_path.touch() + created_temp_file = True + else: + created_temp_file = False + + # Load dataset from local path with pyav video backend + print(f" πŸ“‚ Loading dataset from local path...") + # Use a dummy repo_id since we're loading locally + dummy_repo_id = f"{dataset_path.parent.name}/{dataset_path.name}" + dataset = LeRobotDataset( + dummy_repo_id, + root=str(dataset_path), + # video_backend="pyav", + # local_files_only=True + ) + + # Remove temporary file if we created it + if created_temp_file and episodes_stats_path.exists() and episodes_stats_path.stat().st_size == 0: + episodes_stats_path.unlink() + print(f" πŸ—‘οΈ Removed temporary episodes_stats.jsonl") + + # Remove existing episodes_stats if present (ensure clean conversion) + episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" + if episodes_stats_path.exists(): + episodes_stats_path.unlink() + print(f" πŸ—‘οΈ Removed existing episodes_stats.jsonl") + + # Check if video directory exists before conversion + videos_dir = dataset_path / "videos" + if not videos_dir.exists(): + print(f" ⚠️ No videos directory found - will skip video statistics") + + # Convert stats + print(f" πŸ“Š Computing episode statistics...") + convert_stats(dataset, num_workers=num_workers) + + # Load reference stats for validation if they exist + stats_path = dataset.root / STATS_PATH + if stats_path.exists(): + print(f" βœ… Validating against reference stats...") + try: + ref_stats = load_stats(dataset.root) + check_aggregate_stats(dataset, ref_stats) + print(f" βœ… Stats validation passed!") + except AssertionError as e: + print(f" ⚠️ Stats validation failed with minor differences: {e}") + print(f" ⚠️ This is likely due to floating-point precision, continuing anyway...") + # Check if the error is just a small numerical difference + if "Max absolute difference:" in str(e) and "Max relative difference:" in str(e): + print(f" βœ… Treating as acceptable numerical precision difference") + else: + raise e + + # Remove old stats.json file + print(f" πŸ—‘οΈ Removing old stats.json") + stats_path.unlink() + else: + print(f" ⚠️ No reference stats found, skipping validation") + + # Update codebase version + dataset.meta.info["codebase_version"] = CODEBASE_VERSION + write_info(dataset.meta.info, dataset.root) + + print(f" βœ… Successfully converted to v2.1") + return True + + except Exception as e: + print(f" ❌ Failed to convert: {e}") + logger.exception("Conversion failed") + return False + +def convert_multiple_datasets( + base_dirs: list[str], + max_datasets: int = None, + num_workers: int = 4 +): + """Convert multiple datasets from base directories.""" + + datasets_to_convert = [] + + # Scan for datasets needing conversion + for base_dir in base_dirs: + base_path = Path(base_dir) + if not base_path.exists(): + print(f"⚠️ Directory not found: {base_dir}") + continue + + print(f"πŸ” Scanning: {base_dir}") + + # Walk through author/dataset structure + for author_dir in sorted(base_path.iterdir()): + if not author_dir.is_dir(): + continue + + for dataset_dir in sorted(author_dir.iterdir()): + if not dataset_dir.is_dir(): + continue + + # Check if needs conversion + episodes_stats_path = dataset_dir / "meta" / "episodes_stats.jsonl" + info_path = dataset_dir / "meta" / "info.json" + + needs_conversion = False + if info_path.exists(): + if not episodes_stats_path.exists(): + needs_conversion = True + print(f" πŸ“ Found (missing): {author_dir.name}/{dataset_dir.name}") + else: + # Check if episodes_stats file is empty + try: + file_size = episodes_stats_path.stat().st_size + if file_size == 0: + needs_conversion = True + print(f" πŸ“ Found (empty): {author_dir.name}/{dataset_dir.name}") + else: + # Check if file has content + with open(episodes_stats_path, 'r') as f: + content = f.read().strip() + if not content: + needs_conversion = True + print(f" πŸ“ Found (no content): {author_dir.name}/{dataset_dir.name}") + except Exception as e: + # If we can't read the file, consider it needs conversion + needs_conversion = True + print(f" πŸ“ Found (read error): {author_dir.name}/{dataset_dir.name}") + + if needs_conversion: + datasets_to_convert.append(dataset_dir) + + if not datasets_to_convert: + print("πŸŽ‰ No datasets need conversion!") + return + + if max_datasets: + datasets_to_convert = datasets_to_convert[:max_datasets] + + print(f"\nπŸš€ Converting {len(datasets_to_convert)} datasets...") + + successful = 0 + failed = 0 + + for i, dataset_path in enumerate(datasets_to_convert, 1): + print(f"\n[{i}/{len(datasets_to_convert)}] {dataset_path.parent.name}/{dataset_path.name}") + + success = convert_local_dataset(dataset_path, num_workers=num_workers) + if success: + successful += 1 + else: + failed += 1 + + print(f"\nπŸ“Š Conversion Summary:") + print(f" βœ… Successful: {successful}") + print(f" ❌ Failed: {failed}") + print(f" πŸ“ˆ Success rate: {successful}/{len(datasets_to_convert)} ({100*successful/len(datasets_to_convert):.1f}%)") + + +def main(): + parser = argparse.ArgumentParser(description="Convert local LeRobot datasets to v2.1 format") + parser.add_argument("--dataset", type=str, help="Single dataset path to convert") + parser.add_argument("--base-dirs", nargs="+", + default=["/fsx/dana_aubakirova/vla/community_dataset_v1"], + help="Base directories to scan for datasets") + parser.add_argument("--max-datasets", type=int, help="Maximum number of datasets to convert") + parser.add_argument("--num-workers", type=int, default=4, help="Number of workers for stats computation") + parser.add_argument("--all", action="store_true", help="Convert all datasets in base directories") + + args = parser.parse_args() + + if args.dataset: + # Convert single dataset + success = convert_local_dataset(args.dataset, num_workers=args.num_workers) + if success: + print(f"\nπŸŽ‰ Successfully converted: {args.dataset}") + else: + print(f"\nπŸ’₯ Failed to convert: {args.dataset}") + sys.exit(1) + + elif args.all: + # Convert all datasets + convert_multiple_datasets( + args.base_dirs, + max_datasets=args.max_datasets, + num_workers=args.num_workers + ) + + else: + parser.print_help() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/convert_libero.py b/examples/convert_libero.py new file mode 100644 index 000000000..7bfc50eae --- /dev/null +++ b/examples/convert_libero.py @@ -0,0 +1,126 @@ +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Root directory of converted data +root_dir = "/raid/jade/libero_converted" + +# No renaming +rename_map = { + +} + +# Hugging Face features metadata (constant across all files) +HF_METADATA = { + b"huggingface": b'{"info": {"features": {"observation.images.image": {"_type": "Image"}, "observation.images.image2": {"_type": "Image"}, "state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, "actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, "timestamp": {"dtype": "float32", "_type": "Value"}, "frame_index": {"dtype": "int64", "_type": "Value"}, "episode_index": {"dtype": "int64", "_type": "Value"}, "index": {"dtype": "int64", "_type": "Value"}, "task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata): + try: + table = pq.read_table(parquet_path) + + # Merge metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Apply metadata to table + table = table.replace_schema_metadata(new_meta) + + # Write safely via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + shutil.move(tmp_path, parquet_path) + + print(f"βœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"❌ Failed on {parquet_path}: {e}") + return False + +# Walk through all chunk dirs and patch parquet files +for dirpath, _, filenames in os.walk(root_dir): + for fname in filenames: + if fname.endswith(".parquet"): + fpath = os.path.join(dirpath, fname) + patch_parquet(fpath, HF_METADATA)#!/usr/bin/env python3 + +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Explicit list of files to patch +FILES_TO_PATCH = [ + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", +] + +# Optional renaming map (fill in as needed) +rename_map = { + # "old_column_name": "new_column_name", + "image": "observation.images.image", + "image2": "observation.images.image2", + "actions": "action", +} + +# Hugging Face features metadata (constant across all files) +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + # Load parquet table + table = pq.read_table(parquet_path) + + # If renaming is needed + if rename_map: + schema = table.schema + new_names = [ + rename_map.get(name, name) for name in schema.names + ] + table = table.rename_columns(new_names) + + # Merge schema metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Replace metadata in table + table = table.replace_schema_metadata(new_meta) + + # Write safely via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + + # Replace original file + shutil.move(tmp_path, parquet_path) + + print(f"βœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"❌ Failed on {parquet_path}: {e}") + return False + + +if __name__ == "__main__": + for fpath in FILES_TO_PATCH: + if os.path.exists(fpath): + patch_parquet(fpath, HF_METADATA, rename_map) + else: + print(f"⚠️ File not found: {fpath}") diff --git a/examples/evaluate_libero.py b/examples/evaluate_libero.py new file mode 100644 index 000000000..bf99994b6 --- /dev/null +++ b/examples/evaluate_libero.py @@ -0,0 +1,255 @@ +""" +This script demonstrates how to evaluate a pretrained smolVLA policy on the LIBERO benchmark. +""" + +import collections +import dataclasses +import logging +import math +import pathlib + +import cv2 +import draccus +import imageio +import numpy as np +import torch +from libero.libero import benchmark, get_libero_path +from libero.libero.envs import OffScreenRenderEnv +from tqdm import tqdm + +from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy +from lerobot.policies.pi0.modeling_pi0 import PI0Policy + +LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0] +LIBERO_ENV_RESOLUTION = 256 # resolution used to render training data + +@dataclasses.dataclass +class Args: + """ + Evaluation arguments for smolVLA on LIBERO. + """ + + # --- Hugging Face arguments --- + policy_path: str = "lerobot/smolvla_base" + """Path to the pretrained policy on the Hugging Face Hub or local directory.""" + + # --- LIBERO environment-specific parameters --- + task_suite_name: str = "libero_spatial" + """Task suite. Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90""" + num_steps_wait: int = 10 + """Number of steps to wait for objects to stabilize in sim.""" + num_trials_per_task: int = 50 + """Number of rollouts per task.""" + + # --- Evaluation arguments --- + video_out_path: str = "data/libero/videos" + """Path to save videos.""" + device: str = "cuda" + """Device to use for evaluation.""" + + seed: int = 7 + """Random Seed (for reproducibility)""" + + +@draccus.wrap() +def eval_libero(args: Args) -> None: + # Set random seed + torch.manual_seed(args.seed) + np.random.seed(args.seed) + + # --- Load Policy --- + policy = SmolVLAPolicy.from_pretrained(args.policy_path) + policy.to(args.device) + policy.eval() + + # --- Initialize LIBERO task suite --- + benchmark_dict = benchmark.get_benchmark_dict() + try: + task_suite = benchmark_dict[args.task_suite_name]() + except KeyError: + raise ValueError( + f"Unknown task suite: {args.task_suite_name}. " + f"Available options are: {list(benchmark_dict.keys())}" + ) + num_tasks_in_suite = task_suite.n_tasks + logging.info(f"Task suite: {args.task_suite_name}") + + pathlib.Path(args.video_out_path).mkdir(parents=True, exist_ok=True) + + if args.task_suite_name == "libero_spatial": + max_steps = 220 # longest training demo has 193 steps + elif args.task_suite_name == "libero_object": + max_steps = 280 # longest training demo has 254 steps + elif args.task_suite_name == "libero_goal": + max_steps = 300 # longest training demo has 270 steps + elif args.task_suite_name == "libero_10": + max_steps = 520 # longest training demo has 505 steps + elif args.task_suite_name == "libero_90": + max_steps = 400 # longest training demo has 373 steps + else: + # Fallback for custom task suites + max_steps = 520 + + # --- Evaluation Loop --- + total_episodes, total_successes = 0, 0 + for task_id in tqdm(range(num_tasks_in_suite), desc="Tasks"): + # Get task + task = task_suite.get_task(task_id) + + # Get default LIBERO initial states + initial_states = task_suite.get_task_init_states(task_id) + + # Initialize LIBERO environment and task description + env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args.seed) + + # Start episodes + task_episodes, task_successes = 0, 0 + for episode_idx in tqdm( + range(min(args.num_trials_per_task, len(initial_states))), + desc=f"Task {task_id}: {task.language}", + leave=False, + ): + logging.info(f"\nTask: {task_description}") + + # Reset environment and policy + env.reset() + policy.reset() + + # Set initial states + obs = env.set_init_state(initial_states[episode_idx]) + + # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects + # and we need to wait for them to fall + for _ in range(args.num_steps_wait): + obs, _, _, _ = env.step(LIBERO_DUMMY_ACTION) + + # Setup + t = 0 + frames = [] + done = False + + # Add initial frame + agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) + # frames.append(agentview_image) + # import ipdb; ipdb.set_trace() + logging.info(f"Starting episode {task_episodes+1}...") + while t < max_steps: + try: + # Get preprocessed image + # IMPORTANT: rotate 180 degrees to match train preprocessing + wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1]) + agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) + frames.append(agentview_image) + + # Prepare observations dict + state = np.concatenate( + ( + obs["robot0_eef_pos"], + _quat2axisangle(obs["robot0_eef_quat"]), + obs["robot0_gripper_qpos"], + ) + ) + observation = { + "observation.images.image": torch.from_numpy(agentview_image / 255.0) + .permute(2, 0, 1) + .to(torch.float32) + .to(args.device).unsqueeze(0), + "observation.images.image2": torch.from_numpy(wrist_img / 255.0) + .permute(2, 0, 1) + .to(torch.float32) + .to(args.device).unsqueeze(0), + "observation.state": torch.from_numpy(state).to(torch.float32).to(args.device).unsqueeze(0), + "task": task_description, + } + + # Query model to get action + with torch.inference_mode(): + action_tensor = policy.select_action(observation) + action = action_tensor.cpu().numpy()[0] + action[-1] = 1 - action[-1] + + # Execute action in environment + obs, _, done, _ = env.step(action) + if done: + task_successes += 1 + total_successes += 1 + break + t += 1 + + except Exception as e: + logging.error(f"Caught exception: {e}") + break + + task_episodes += 1 + total_episodes += 1 + + # Save a replay video of the episode + suffix = "success" if done else "failure" + task_segment = task_description.replace(" ", "_").replace("/", "_") + video_path = ( + pathlib.Path(args.video_out_path) / f"rollout_task_{task_id}_episode_{episode_idx}_{task_segment}_{suffix}.mp4" + ) + fps = 30 + writer = imageio.get_writer(video_path, fps=fps) + + for image in frames: + writer.append_data(image) + writer.close() + logging.info(f"Saved video to {video_path}") + + # Log current results + logging.info(f"Success: {done}") + if total_episodes > 0: + logging.info(f"# episodes completed so far: {total_episodes}") + logging.info(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)") + + # Log final results for the task + if task_episodes > 0: + logging.info(f"Task {task_id} success rate: {float(task_successes) / float(task_episodes):.2f}") + if total_episodes > 0: + logging.info(f"Cumulative success rate: {float(total_successes) / float(total_episodes):.2f}") + + logging.info("--- Evaluation finished ---") + if total_episodes > 0: + logging.info(f"Total success rate: {float(total_successes) / float(total_episodes):.2f}") + logging.info(f"Total episodes: {total_episodes}") + logging.info(f"Total successes: {total_successes}") + cv2.destroyAllWindows() + + +def _get_libero_env(task, resolution, seed): + """Initializes and returns the LIBERO environment, along with the task description.""" + task_description = task.language + task_bddl_file = pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file + env_args = { + "bddl_file_name": str(task_bddl_file), + "camera_heights": resolution, + "camera_widths": resolution, + } + env = OffScreenRenderEnv(**env_args) + env.seed(seed) # IMPORTANT: seed seems to affect object positions even when using fixed initial state + return env, task_description + + +def _quat2axisangle(quat): + """ + Copied from robosuite: + https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 + """ + # clip quaternion + if quat[3] > 1.0: + quat[3] = 1.0 + elif quat[3] < -1.0: + quat[3] = -1.0 + + den = np.sqrt(1.0 - quat[3] * quat[3]) + if math.isclose(den, 0.0): + # This is (close to) a zero degree rotation, immediately return + return np.zeros(3) + + return (quat[:3] * 2.0 * math.acos(quat[3])) / den + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + eval_libero() \ No newline at end of file diff --git a/examples/requirements.in b/examples/requirements.in new file mode 100644 index 000000000..25664608a --- /dev/null +++ b/examples/requirements.in @@ -0,0 +1,8 @@ +imageio[ffmpeg] +numpy==1.22.4 +tqdm +tyro +PyYaml +opencv-python==4.6.0.66 +robosuite==1.4.1 +matplotlib==3.5.3 \ No newline at end of file diff --git a/examples/script2.py b/examples/script2.py new file mode 100644 index 000000000..cbd4da913 --- /dev/null +++ b/examples/script2.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +FILES_TO_PATCH = [ + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", + "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", +] + +# Column renaming map +rename_map = { + "wrist_image": "observation.images.image2", + "actions": "action", +} + +# Hugging Face metadata +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + table = pq.read_table(parquet_path) + + # Apply column renames if needed + if rename_map: + schema = table.schema + new_names = [rename_map.get(name, name) for name in schema.names] + table = table.rename_columns(new_names) + + # Merge schema metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + + # Replace metadata + table = table.replace_schema_metadata(new_meta) + + # Write via temp file + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + + shutil.move(tmp_path, parquet_path) + print(f"βœ… Patched: {parquet_path}") + return True + except Exception as e: + print(f"❌ Failed on {parquet_path}: {e}") + return False + + +if __name__ == "__main__": + for fpath in FILES_TO_PATCH: + if os.path.exists(fpath): + patch_parquet(fpath, HF_METADATA, rename_map) + else: + print(f"⚠️ File not found: {fpath}") diff --git a/examples/script3.py b/examples/script3.py new file mode 100644 index 000000000..7b4d7957a --- /dev/null +++ b/examples/script3.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import os +import pyarrow.parquet as pq +import tempfile +import shutil + +# Root directory containing all parquet files +ROOT_DIR = "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data" + +# Column renaming map (normalize schema to what training expects) +rename_map = { + "state": "observation.state", +} + +# Hugging Face metadata (aligned with expected feature names) +HF_METADATA = { + b"huggingface": b'{"info": {"features": {' + b'"observation.images.image": {"_type": "Image"}, ' + b'"observation.images.image2": {"_type": "Image"}, ' + b'"observation.state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' + b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' + b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' + b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' + b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' + b'"index": {"dtype": "int64", "_type": "Value"}, ' + b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' +} + +def patch_parquet(parquet_path, hf_metadata, rename_map): + try: + # Read the parquet table + table = pq.read_table(parquet_path) + + # Apply renames if necessary + if rename_map: + new_names = [rename_map.get(name, name) for name in table.schema.names] + if new_names != table.schema.names: + table = table.rename_columns(new_names) + + # Update metadata + new_meta = dict(table.schema.metadata or {}) + new_meta.update(hf_metadata) + table = table.replace_schema_metadata(new_meta) + + # Write to temp file then atomically move back + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") + os.close(tmp_fd) + pq.write_table(table, tmp_path) + shutil.move(tmp_path, parquet_path) + + # Debug print + print(f"βœ… Patched: {parquet_path}") + print(" Columns:", table.schema.names) + return True + except Exception as e: + print(f"❌ Failed on {parquet_path}: {e}") + return False + +if __name__ == "__main__": + for dirpath, _, filenames in os.walk(ROOT_DIR): + for fname in filenames: + if fname.endswith(".parquet"): + fpath = os.path.join(dirpath, fname) + patch_parquet(fpath, HF_METADATA, rename_map) diff --git a/examples/script4.py b/examples/script4.py new file mode 100644 index 000000000..2eed60a94 --- /dev/null +++ b/examples/script4.py @@ -0,0 +1,3 @@ +from huggingface_hub import HfApi +hub_api = HfApi() +hub_api.create_tag("HuggingFaceVLA/libero", tag="v2.1", repo_type="dataset") diff --git a/log_text.txt b/log_text.txt new file mode 100644 index 000000000..6676df0eb --- /dev/null +++ b/log_text.txt @@ -0,0 +1,1765 @@ + self.vlm_with_expert = SmolVLMWithExpertModel( + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/smolvlm_with_expert.py", line 88, in __init__ + self.processor = AutoProcessor.from_pretrained(model_id) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/models/auto/processing +_auto.py", line 300, in from_pretrained + config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/processing_utils.py", +line 944, in get_processor_dict + resolved_raw_chat_template_file = cached_file( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 32 +1, in cached_file + file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 47 +8, in cached_files + hf_hub_download( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p +y", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1010, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1073, in _hf_hub_download_to_cache_dir + (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_err +or( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1546, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p +y", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 1463, in get_hf_file_metadata + r = _request_wrapper( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 286, in _request_wrapper + response = _request_wrapper( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", +line 309, in _request_wrapper + response = http_backoff(method=method, url=url, **params, retry_on_exceptions=(), retry_on_status_codes=(429,)) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li +ne 310, in http_backoff + response = session.request(method=method, url=url, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 589, in + request + resp = self.send(prep, **send_kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 703, in + send + r = adapter.send(request, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li +ne 96, in send + return super().send(request, *args, **kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/adapters.py", line 644, in + send + resp = conn.urlopen( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 78 +7, in urlopen + response = self._make_request( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 53 +4, in _make_request + response = conn.getresponse() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connection.py", line 565, i +n getresponse + httplib_response = super().getresponse() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 1375, in getresponse + response.begin() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 318, in begin + version, status, reason = self._read_status() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 279, in _read_status + line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/socket.py", line 717, in readinto + return self._sock.recv_into(b) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1307, in recv_into + return self.read(nbytes, buffer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1163, in read + return self._sslobj.read(len, buffer) +KeyboardInterrupt +clea +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:50:52 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:50:52 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:50:52 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:50:52 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:50:52 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:50:52 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:50:52 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 67057.8 +5it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5343.9 +4it/s] +INFO 2025-09-09 15:50:57 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 47393.2 +7it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3797.4 +7it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 44384.1 +7it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6533.1 +8it/s] +Reducing the number of VLM layers to 16 ... +INFO 2025-09-09 15:51:30 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 15:51:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 15:51:30 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 15:51:30 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 15:51:30 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 15:51:30 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 15:51:30 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 15:51:30 ts/train.py:187 num_total_params=399268924 (399M) +INFO 2025-09-09 15:51:30 ts/train.py:225 Start offline training on a fixed dataset +> /home/jade_choghari/lerobot/src/lerobot/scripts/train.py(230)train() +-> train_tracker.dataloading_s = time.perf_counter() - start_time +(Pdb) batch.keys() +dict_keys(['image', 'wrist_image', 'state', 'actions', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_i +ndex', 'task']) +(Pdb) policy.config.input_features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256))} +(Pdb) quit() +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train + train_tracker.dataloading_s = time.perf_counter() - start_time + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train + train_tracker.dataloading_s = time.perf_counter() - start_time + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise BdbQuit +bdb.BdbQuit +clear +^[[A(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:53:49 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:53:49 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:53:49 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:53:49 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:53:49 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:53:49 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:53:49 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 34701.4 +4it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5495.3 +7it/s] +INFO 2025-09-09 15:53:55 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 41943.0 +4it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5500.7 +3it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 2361.6 +6it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5041.2 +3it/s] +Reducing the number of VLM layers to 16 ... +> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() +-> assert isinstance(policy, nn.Module) +(Pdb) features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256)), 'actions': PolicyFeature(type=, + shape=(7,))} +(Pdb) ds_meta.features +{'image': {'dtype': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'wrist_image': {'dtyp +e': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'state': {'dtype': 'float32', 'shape' +: (8,), 'names': ['state']}, 'actions': {'dtype': 'float32', 'shape': (7,), 'names': ['actions']}, 'timestamp': {'dt +ype': 'float32', 'shape': (1,), 'names': None}, 'frame_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'ep +isode_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'index': {'dtype': 'int64', 'shape': (1,), 'names': +None}, 'task_index': {'dtype': 'int64', 'shape': (1,), 'names': None}} +(Pdb) quit() + +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train + policy = make_policy( + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + assert isinstance(policy, nn.Module) + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + assert isinstance(policy, nn.Module) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise BdbQuit +bdb.BdbQuit +clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:56:35 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:56:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:56:35 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:56:35 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:56:35 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:56:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:56:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 78132.9 +5it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 4716.0 +3it/s] +INFO 2025-09-09 15:56:40 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5259.3 +2it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3477.8 +6it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 45343.8 +3it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5551.6 +9it/s] +Reducing the number of VLM layers to 16 ... +> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() +-> assert isinstance(policy, nn.Module) +(Pdb) features +{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type +=, shape=(3, 256, 256)), 'state': PolicyFeature(type=, sha +pe=(8,)), 'actions': PolicyFeature(type=, shape=(7,))} +(Pdb) quit() +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main + + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train + policy = make_policy( + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + # policy = torch.compile(policy, mode="reduce-overhead") + File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy + # policy = torch.compile(policy, mode="reduce-overhead") + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch + return self.dispatch_line(frame) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line + if self.quitting: raise BdbQuit +bdb.BdbQuit +clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:58:35 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:58:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:58:35 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:58:35 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:58:35 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:58:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:58:35 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 27666.4 +6it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5305.7 +0it/s] +INFO 2025-09-09 15:58:41 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 44384.1 +7it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3192.0 +1it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 44620.2 +6it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 42799.0 +2it/s] +Reducing the number of VLM layers to 16 ... +INFO 2025-09-09 15:59:13 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 15:59:13 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 15:59:13 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 15:59:13 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 15:59:13 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 15:59:13 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 15:59:13 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 15:59:13 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 15:59:13 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy + loss, output_dict = policy.forward(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 458, in forward + actions = self.prepare_action(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 580, in prepare_action + actions = pad_vector(batch[ACTION], self.config.max_action_dim) +KeyError: 'action' +Exception in thread Thread-3 (_pin_memory_loop): +Traceback (most recent call last): + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 1016, in _bootstrap_inner + self.run() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 953, in run + self._target(*self._args, **self._kwargs) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. +py", line 61, in _pin_memory_loop + do_one_step() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. +py", line 37, in do_one_step + r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/queues.py", line 122, in get + return _ForkingPickler.loads(res) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/multiprocessing/reductions.py +", line 541, in rebuild_storage_fd + fd = df.detach() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in +detach + with _resource_sharer.get_connection(self._id) as conn: + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in +get_connection + c = Client(address, authkey=process.current_process().authkey) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 508, in Clie +nt + answer_challenge(c, authkey) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 752, in answ +er_challenge + message = connection.recv_bytes(256) # reject large message + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 216, in recv +_bytes + buf = self._recv_bytes(maxlength) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 414, in _rec +v_bytes + buf = self._recv(4) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 379, in _rec +v + chunk = read(handle, remaining) +ConnectionResetError: [Errno 104] Connection reset by peer +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 15:59:53 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 15:59:53 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 15:59:53 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 15:59:53 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 15:59:53 ts/train.py:153 Creating dataset +WARNING 2025-09-09 15:59:53 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 15:59:53 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 72147.3 +3it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5076.7 +1it/s] +INFO 2025-09-09 15:59:58 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6096.3 +7it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 4348.6 +8it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 46091.2 +5it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3225.1 +5it/s] +Reducing the number of VLM layers to 16 ... +INFO 2025-09-09 16:00:31 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:00:31 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:00:31 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:00:31 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:00:31 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:00:31 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:00:31 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:00:31 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:00:31 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy + loss, output_dict = policy.forward(batch) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 461, in forward + losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 850, in forward + att_2d_masks = make_att_2d_masks(pad_masks, att_masks) + File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 226, in make_att_2d_mask +s + att_2d_masks = att_2d_masks & pad_2d_masks +RuntimeError: The size of tensor a (199) must match the size of tensor b (181) at non-singleton dimension 2 +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:10:03 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:10:03 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:10:03 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:10:03 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:10:03 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:10:03 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:10:03 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 54574.89it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 7567.63it/s] +INFO 2025-09-09 16:10:09 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 40721.40it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 7516.67it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3158.36it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6775.94it/s] +Reducing the number of VLM layers to 16 ... +INFO 2025-09-09 16:10:41 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:10:41 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:10:41 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:10:41 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:10:41 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:10:41 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:10:41 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:10:41 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:10:41 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy + grad_scaler.unscale_(optimizer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 +, in unscale_ + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 +, in _unscale_grads_ + torch._amp_foreach_non_finite_check_and_unscale_( +RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:12:28 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:12:28 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:12:28 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:12:28 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:12:28 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:12:28 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:12:28 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 87666.13it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 4223.20it/s] +INFO 2025-09-09 16:12:34 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 43690.67it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 4871.43it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6512.89it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 43018.50it/s] +Reducing the number of VLM layers to 16 ... +INFO 2025-09-09 16:13:06 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:13:06 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:13:06 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:13:06 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:13:06 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:13:06 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:13:06 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:13:06 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:13:06 ts/train.py:225 Start offline training on a fixed dataset +Traceback (most recent call last): + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in + main() + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main + train() + File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner + response = fn(cfg, *args, **kwargs) + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train + train_tracker, output_dict = update_policy( + File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy + grad_scaler.unscale_(optimizer) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 +, in unscale_ + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 +, in _unscale_grads_ + torch._amp_foreach_non_finite_check_and_unscale_( +RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +INFO 2025-09-09 16:13:51 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-09 16:13:51 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-09 16:13:51 ts/train.py:137 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'physical-intelligence/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.5, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': 0, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': False, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-09 16:13:51 ts/train.py:143 Logs will be saved locally. +INFO 2025-09-09 16:13:51 ts/train.py:153 Creating dataset +WARNING 2025-09-09 16:13:51 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +WARNING 2025-09-09 16:13:51 ts/utils.py:302 +The dataset you requested (physical-intelligence/libero) is in 2.0 format. +While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global +stats instead of per-episode stats. Update your dataset stats to the new format using this command: +``` +python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero +``` + +If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) +or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). + +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 82981.28it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 4687.94it/s] +INFO 2025-09-09 16:13:57 ts/train.py:163 Creating policy +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 21345.06it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 4226.00it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 2966.27it/s] +Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6497.76it/s] +Reducing the number of VLM layers to 16 ... +INFO 2025-09-09 16:14:30 ts/train.py:168 Creating optimizer and scheduler +INFO 2025-09-09 16:14:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ +smolvla_lr1e-4bs32steps100000 +INFO 2025-09-09 16:14:30 ts/train.py:182 cfg.env.task='libero_spatial' +INFO 2025-09-09 16:14:30 ts/train.py:183 cfg.steps=100000 (100K) +INFO 2025-09-09 16:14:30 ts/train.py:184 dataset.num_frames=273465 (273K) +INFO 2025-09-09 16:14:30 ts/train.py:185 dataset.num_episodes=1693 +INFO 2025-09-09 16:14:30 ts/train.py:186 num_learnable_params=49103712 (49M) +INFO 2025-09-09 16:14:30 ts/train.py:187 num_total_params=399268940 (399M) +INFO 2025-09-09 16:14:30 ts/train.py:225 Start offline training on a fixed dataset +INFO 2025-09-09 16:16:20 ts/train.py:255 step:200 smpl:6K ep:40 epch:0.02 loss:1.244 grdn:2.492 lr:1.0e-05 updt_s:0. +536 data_s:0.007 +INFO 2025-09-09 16:17:56 ts/train.py:255 step:400 smpl:13K ep:79 epch:0.05 loss:0.685 grdn:4.262 lr:3.0e-05 updt_s:0 +.481 data_s:0.000 +INFO 2025-09-09 16:19:33 ts/train.py:255 step:600 smpl:19K ep:119 epch:0.07 loss:0.364 grdn:4.849 lr:5.0e-05 updt_s: +0.482 data_s:0.000 +INFO 2025-09-09 16:21:10 ts/train.py:255 step:800 smpl:26K ep:158 epch:0.09 loss:0.239 grdn:4.024 lr:7.0e-05 updt_s: +0.481 data_s:0.000 +INFO 2025-09-09 16:22:46 ts/train.py:255 step:1K smpl:32K ep:198 epch:0.12 loss:0.197 grdn:3.267 lr:9.0e-05 updt_s:0 +.478 data_s:0.000 +INFO 2025-09-09 16:24:22 ts/train.py:255 step:1K smpl:38K ep:238 epch:0.14 loss:0.173 grdn:2.319 lr:1.0e-04 updt_s:0 +.481 data_s:0.000 +INFO 2025-09-09 16:25:59 ts/train.py:255 step:1K smpl:45K ep:277 epch:0.16 loss:0.153 grdn:1.741 lr:1.0e-04 updt_s:0 +.483 data_s:0.000 +INFO 2025-09-09 16:27:36 ts/train.py:255 step:2K smpl:51K ep:317 epch:0.19 loss:0.135 grdn:1.354 lr:9.9e-05 updt_s:0 +.483 data_s:0.000 +INFO 2025-09-09 16:29:14 ts/train.py:255 step:2K smpl:58K ep:357 epch:0.21 loss:0.126 grdn:1.177 lr:9.9e-05 updt_s:0 +.484 data_s:0.000 + diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py index f5fa727cf..75863d3fc 100644 --- a/src/lerobot/configs/policies.py +++ b/src/lerobot/configs/policies.py @@ -62,6 +62,7 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP, # automatic gradient scaling is used. use_amp: bool = False + gradient_accumulation_steps: int = 1 push_to_hub: bool = True repo_id: str | None = None diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index a869cb920..6509993bb 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -472,7 +472,6 @@ class LeRobotDataset(torch.utils.data.Dataset): episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes] self.stats = aggregate_stats(episodes_stats) - # Load actual data try: if force_cache_sync: raise FileNotFoundError @@ -598,6 +597,7 @@ class LeRobotDataset(torch.utils.data.Dataset): """hf_dataset contains all the observations, states, actions, rewards, etc.""" if self.episodes is None: path = str(self.root / "data") + # added by jade hf_dataset = load_dataset("parquet", data_dir=path, split="train") else: files = [str(self.root / self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes] diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index 078c5351d..daa1de163 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -455,7 +455,8 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea shape = (shape[2], shape[0], shape[1]) elif key == "observation.environment_state": type = FeatureType.ENV - elif key.startswith("observation"): + # changed by jade + elif key.startswith("observation") or key.startswith("state"): type = FeatureType.STATE elif key.startswith("action"): type = FeatureType.ACTION diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 4b8eeffd1..79461d3a9 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -34,6 +34,7 @@ from lerobot.policies.sac.reward_model.configuration_classifier import RewardCla from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config def get_policy_class(name: str) -> PreTrainedPolicy: @@ -74,6 +75,10 @@ def get_policy_class(name: str) -> PreTrainedPolicy: from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy return SmolVLAPolicy + elif name == "smolpi0": + from lerobot.policies.smolpi0.modeling_smolpi0 import SMOLPI0Policy + + return SMOLPI0Policy else: raise NotImplementedError(f"Policy with name {name} is not implemented.") @@ -97,6 +102,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig: return SmolVLAConfig(**kwargs) elif policy_type == "reward_classifier": return RewardClassifierConfig(**kwargs) + elif policy_type == "smolpi0": + return SMOLPI0Config(**kwargs) else: raise ValueError(f"Policy type '{policy_type}' is not available.") @@ -170,7 +177,6 @@ def make_policy( policy = policy_cls(**kwargs) policy.to(cfg.device) assert isinstance(policy, nn.Module) - # policy = torch.compile(policy, mode="reduce-overhead") return policy diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py index 119055873..043265b1b 100644 --- a/src/lerobot/policies/normalize.py +++ b/src/lerobot/policies/normalize.py @@ -255,6 +255,83 @@ class Unnormalize(nn.Module): return batch +class NormalizePerRobotType(nn.Module): + """Normalizes data (e.g. "observation.image") for more stable and faster convergence during training.""" + + def __init__( + self, + features: dict[str, PolicyFeature], + norm_map: dict[str, NormalizationMode], + stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values + are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing + mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape + is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. + modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values + are their normalization modes among: + - "mean_std": subtract the mean and divide by standard deviation. + - "min_max": map to [-1, 1] range. + stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") + and values are dictionaries of statistic types and their values (e.g. + `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for + training the model for the first time, these statistics will overwrite the default buffers. If + not provided, as expected for finetuning or evaluation, the default buffers should to be + overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the + dataset is not needed to get the stats, since they are already in the policy state_dict. + """ + super().__init__() + self.features = features + self.norm_map = norm_map + for robot_type in stats.keys(): + stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) + for key, buffer in stats_buffers.items(): + setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) + + # TODO(rcadene): should we remove torch.no_grad? + @torch.no_grad + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + batch = dict(batch) # shallow copy avoids mutating the input batch + assert "robot_type" in batch, "robot_type is not in the batch" + robot_types = batch["robot_type"] + + for key, ft in self.features.items(): + if key not in batch: + continue + + norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) + if norm_mode is NormalizationMode.IDENTITY: + continue + # FIXME(mshukor): make it more efficient + buffers = [ + getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types + ] + if norm_mode is NormalizationMode.MEAN_STD: + mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))],dim=0) + std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))],dim=0) + if batch[key].ndim == 3: + mean = mean.unsqueeze(1) + std = std.unsqueeze(1) + assert not torch.isinf(mean).any(), _no_stats_error_str("mean") + assert not torch.isinf(std).any(), _no_stats_error_str("std") + batch[key] = (batch[key] - mean) / (std + 1e-8) + elif norm_mode is NormalizationMode.MIN_MAX: + min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) + max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(min).any(), _no_stats_error_str("min") + assert not torch.isinf(max).any(), _no_stats_error_str("max") + if batch[key].ndim == 3: + min = min.unsqueeze(1) + max = max.unsqueeze(1) + # normalize to [0,1] + batch[key] = (batch[key] - min) / (max - min + 1e-8) + # normalize to [-1, 1] + batch[key] = batch[key] * 2 - 1 + else: + raise ValueError(norm_mode) + return batch # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization # and remove the `Normalize` and `Unnormalize` classes. def _initialize_stats_buffers( @@ -418,3 +495,87 @@ class UnnormalizeBuffer(nn.Module): raise ValueError(norm_mode) return batch + + +class UnnormalizePerRobotType(nn.Module): + """ + Similar to `Normalize` but unnormalizes output data (e.g. `{"action": torch.randn(b,c)}`) in their + original range used by the environment. + """ + + def __init__( + self, + features: dict[str, PolicyFeature], + norm_map: dict[str, NormalizationMode], + stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values + are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing + mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape + is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. + modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values + are their normalization modes among: + - "mean_std": subtract the mean and divide by standard deviation. + - "min_max": map to [-1, 1] range. + stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") + and values are dictionaries of statistic types and their values (e.g. + `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for + training the model for the first time, these statistics will overwrite the default buffers. If + not provided, as expected for finetuning or evaluation, the default buffers should to be + overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the + dataset is not needed to get the stats, since they are already in the policy state_dict. + """ + super().__init__() + self.features = features + self.norm_map = norm_map + self.stats = stats + # `self.buffer_observation_state["mean"]` contains `torch.tensor(state_dim)` + for robot_type in stats.keys(): + stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) + for key, buffer in stats_buffers.items(): + setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) + + # TODO(rcadene): should we remove torch.no_grad? + @torch.no_grad + def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + batch = dict(batch) # shallow copy avoids mutating the input batch + assert "robot_type" in batch, "robot_type is not in the batch" + robot_types = batch["robot_type"] + + for key, ft in self.features.items(): + if key not in batch: + continue + + norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) + if norm_mode is NormalizationMode.IDENTITY: + continue + + # buffer = getattr(self, "buffer_" + key.replace(".", "_")) + buffers = [ + getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types + ] + + if norm_mode is NormalizationMode.MEAN_STD: + mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0) + std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(mean).any(), _no_stats_error_str("mean") + assert not torch.isinf(std).any(), _no_stats_error_str("std") + if batch[key].ndim == 3: + mean = mean.unsqueeze(1) + std = std.unsqueeze(1) + batch[key] = batch[key] * std + mean + elif norm_mode is NormalizationMode.MIN_MAX: + min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) + max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) + assert not torch.isinf(min).any(), _no_stats_error_str("min") + assert not torch.isinf(max).any(), _no_stats_error_str("max") + if batch[key].ndim == 3: + min = min.unsqueeze(1) + max = max.unsqueeze(1) + batch[key] = (batch[key] + 1) / 2 + batch[key] = batch[key] * (max - min) + min + else: + raise ValueError(norm_mode) + return batch diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py new file mode 100644 index 000000000..c3605cd82 --- /dev/null +++ b/src/lerobot/policies/smolpi0/configuration_smolpi0.py @@ -0,0 +1,210 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from lerobot.optim.optimizers import AdamWConfig +from lerobot.optim.schedulers import ( + CosineDecayWithWarmupSchedulerConfig, +) +from lerobot.configs.policies import PreTrainedConfig +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature + + +@dataclass +class PEFTConfig: + r: int = 4 + lora_alpha: int = 16 + lora_dropout: float = 0.1 + target_modules: str = "q_proj,v_proj" + + +@PreTrainedConfig.register_subclass("smolpi0") +@dataclass +class SMOLPI0Config(PreTrainedConfig): + # Input / output structure. + n_obs_steps: int = 1 + chunk_size: int = 50 + n_action_steps: int = 50 + n_obs_gap: int = 1 + + normalization_mapping: dict[str, NormalizationMode] = field( + default_factory=lambda: { + "VISUAL": NormalizationMode.IDENTITY, + "STATE": NormalizationMode.MEAN_STD, + "ACTION": NormalizationMode.MEAN_STD, + } + ) + + # Shorter state and action vectors will be padded + max_state_dim: int = 32 + max_action_dim: int = 32 + + # Image preprocessing + resize_imgs_with_padding: tuple[int, int] = (512, 512) #(224, 224) + + # Add empty images. Used by pi0_aloha_sim which adds the empty + # left and right wrist cameras in addition to the top camera. + empty_cameras: int = 0 + + # Converts the joint and gripper values from the standard Aloha space to + # the space used by the pi internal runtime which was used to train the base model. + adapt_to_pi_aloha: bool = False + + # Converts joint dimensions to deltas with respect to the current state before passing to the model. + # Gripper dimensions will remain in absolute values. + use_delta_joint_actions_aloha: bool = False + + # Tokenizer + tokenizer_max_length: int = 48 + + # Projector + proj_width: int = 480 + + # Decoding + num_steps: int = 10 + + # Attention utils + use_cache: bool = True + attention_implementation: str = "eager" # or fa2, flex + + # Finetuning settings + freeze_vision_encoder: bool = True + train_expert_only: bool = False + train_state_proj: bool = True + + # Training presets + optimizer_lr: float = 2.5e-5 + optimizer_betas: tuple[float, float] = (0.9, 0.95) + optimizer_eps: float = 1e-8 + optimizer_weight_decay: float = 1e-10 + optimizer_grad_clip_norm: float = 10 + optimizer_lr_vlm: float = 0 + + scheduler_warmup_steps: int = 1_000 + scheduler_decay_steps: int = 30_000 + scheduler_decay_lr: float = 2.5e-6 + + # TODO: Add EMA + vlm_model_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + checkpoint_path: str = None + load_vlm_weights: bool = False + + peft_method: str = "" + peft_config: PEFTConfig = PEFTConfig() + peft_target_model: str = "" + + add_image_special_tokens: bool = False + add_prompt_template: bool = False + prefix_prompt_template: str = f"<|im_start|>User: What action should the robot take to" + suffix_prompt_template: str = f"?\nAssistant:" + + attention_mode: str = "self_attn" + + prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length + + past_obs_keys: str = f"image" + + add_local_special_image_tokens: bool = False + + reverse_images_order: bool = False + + state_to_prefix: bool = False + + pad_language_to: str = "longest" # "max_length" + + num_expert_layers: int = -1 + num_vlm_layers: int = -1 + + causal_action_attention_mask: bool = False + + self_attn_every_n_layers: int = -1 + + expert_width_multiplier: float = 0.5 + + robot_type: str = "" + + self_attn_only_actions: bool = False + + causal_attention_on_history: bool = False + + predict_relative_actions: bool = False + relative_actions_mode: str = "first" + + shuffle_camera_positions: bool = False + vlm_img_size: int = -1 + + regression_loss: bool = False + + def __post_init__(self): + super().__post_init__() + if self.vlm_img_size > 0: + self.resize_imgs_with_padding = (self.vlm_img_size, self.vlm_img_size) + """Input validation (not exhaustive).""" + if self.n_action_steps > self.chunk_size: + raise ValueError( + f"The chunk size is the upper bound for the number of action steps per model invocation. Got " + f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`." + ) + # if self.n_obs_steps != 1: + # raise ValueError( + # f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`" + # ) + + if self.use_delta_joint_actions_aloha: + raise NotImplementedError( + "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot." + ) + + def validate_features(self) -> None: + # TODO: implement value error + # if not self.image_features and not self.env_state_feature: + # raise ValueError("You must provide at least one image or the environment state among the inputs.") + + for i in range(self.empty_cameras): + key = f"observation.images.empty_camera_{i}" + empty_camera = PolicyFeature( + type=FeatureType.VISUAL, + shape=(3, 480, 640), + ) + self.input_features[key] = empty_camera + + def get_optimizer_preset(self) -> AdamWConfig: + return AdamWConfig( + lr=self.optimizer_lr, + betas=self.optimizer_betas, + eps=self.optimizer_eps, + weight_decay=self.optimizer_weight_decay, + grad_clip_norm=self.optimizer_grad_clip_norm, + ) + + def get_scheduler_preset(self): + return CosineDecayWithWarmupSchedulerConfig( + peak_lr=self.optimizer_lr, + decay_lr=self.scheduler_decay_lr, + num_warmup_steps=self.scheduler_warmup_steps, + num_decay_steps=self.scheduler_decay_steps, + ) + + @property + def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations + return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, self.n_obs_gap)][::-1] + + @property + def action_delta_indices(self) -> list: + return list(range(self.chunk_size)) + + @property + def reward_delta_indices(self) -> None: + return None diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py new file mode 100644 index 000000000..13950f743 --- /dev/null +++ b/src/lerobot/policies/smolpi0/flex_attention.py @@ -0,0 +1,145 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F # noqa: N812 +from packaging.version import Version + +if Version(torch.__version__) > Version("2.5.0"): + # Ffex attention is only available from torch 2.5 onwards + from torch.nn.attention.flex_attention import ( + _mask_mod_signature, + _round_up_to_multiple, + create_block_mask, + create_mask, + flex_attention, + ) + + +@torch.compile(dynamic=False) +def flex_attention_forward( + attention_mask: torch.Tensor, + batch_size: int, + head_dim: int, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + scaling=None, + num_att_heads: int = 8, + num_key_value_heads: int = 1, +): + """ + This is defined out of classes to make compile happy. + """ + + original_dtype = query_states.dtype + num_key_value_groups = num_att_heads // num_key_value_heads + key_states = key_states[:, :, :, None, :] + key_states = key_states.expand( + batch_size, key_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim + ) + key_states = key_states.reshape( + batch_size, key_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim + ) + + value_states = value_states[:, :, :, None, :] + value_states = value_states.expand( + batch_size, value_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim + ) + value_states = value_states.reshape( + batch_size, value_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # query_states = query_states.to(torch.float32) + # key_states = key_states.to(torch.float32) + # value_states = value_states.to(torch.float32) + + causal_mask = attention_mask + if causal_mask is not None: + causal_mask = causal_mask[:, None, :, : key_states.shape[2]] + + if causal_mask.shape[1] == 1 and query_states.shape[1] > 1: + causal_mask = causal_mask.expand(-1, query_states.shape[1], -1, -1) + + def precomputed_mask_factory(precomputed_mask: torch.Tensor) -> _mask_mod_signature: + def mask_mod(b, h, q_idx, kv_idx): + # Danger zone: if b,h,q_idx,kv_idx exceed the shape, device-side assert occurs. + return precomputed_mask[b][h][q_idx][kv_idx] + + return mask_mod + + b_mask, h_mask, q_len, kv_len = causal_mask.shape # The shape of your mask + + block_size = 128 # limitation of flex attention + q_len_rounded = _round_up_to_multiple(q_len, block_size) + kv_len_rounded = _round_up_to_multiple(kv_len, block_size) + + # *CRITICAL* we do need to expand here, else we get a CUDA index error + + pad_q = q_len_rounded - q_len + pad_k = kv_len_rounded - kv_len + if pad_q > 0 or pad_k > 0: + padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0) + else: + padded_causal_mask = causal_mask + mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask) + + mask_4d = create_mask( + mod_fn=mask_mod_fn_orig, + B=b_mask, + H=h_mask, + Q_LEN=q_len_rounded, + KV_LEN=kv_len_rounded, + device=causal_mask.device, + ) + + mask_mod_fn_padded = precomputed_mask_factory(mask_4d) + # FIXME(mshukor): compile mask torch.compile(create_block_mask) + create_block_mask_compiled = torch.compile(create_block_mask) + block_mask = create_block_mask_compiled( + mask_mod=mask_mod_fn_padded, + B=b_mask, + H=None, # + Q_LEN=q_len_rounded, + KV_LEN=kv_len_rounded, + BLOCK_SIZE=block_size, + device=causal_mask.device, + _compile=False, + ) + padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states + padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states + padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states + # mask is applied inside the kernel, ideally more efficiently than score_mod. + attn_output, attention_weights = flex_attention( + padded_query_states, + padded_key_states, + padded_value_states, + block_mask=block_mask, + enable_gqa=True, # because we shaped query/key states for GQA + scale=head_dim**-0.5 if scaling is None else scaling, + return_lse=True, + ) + + attn_output = attn_output.to(dtype=original_dtype) + attn_output = attn_output.transpose(1, 2).contiguous() # [B, Q_LEN, H, head_dim] + attn_output = attn_output.reshape( + batch_size, + -1, + attn_output.shape[2] * attn_output.shape[3], # merges [H, head_dim] + ) + return attn_output[:, :-pad_k, :] if pad_k > 0 else attn_output diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py new file mode 100644 index 000000000..fa2d3d5a7 --- /dev/null +++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py @@ -0,0 +1,1021 @@ +#!/usr/bin/env python + +# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Ο€0: A Vision-Language-Action Flow Model for General Robot Control + +[Paper](https://www.physicalintelligence.company/download/pi0.pdf) +[Jax code](https://github.com/Physical-Intelligence/openpi) + +Designed by Physical Intelligence. Ported from Jax by Hugging Face. + +Install pi0 extra dependencies: +```bash +pip install -e ".[pi0]" +``` + +Example of finetuning the pi0 pretrained model (`pi0_base` in `openpi`): +```bash +python lerobot/scripts/train.py \ +--policy.path=lerobot/pi0 \ +--dataset.repo_id=danaaubakirova/koch_test +``` + +Example of finetuning the pi0 neural network with PaliGemma and expert Gemma +pretrained with VLM default parameters before pi0 finetuning: +```bash +python lerobot/scripts/train.py \ +--policy.type=pi0 \ +--dataset.repo_id=danaaubakirova/koch_test +``` + +Example of using the pi0 pretrained model outside LeRobot training framework: +```python +policy = Pi0Policy.from_pretrained("lerobot/pi0") +``` + +""" + +import math +from collections import deque + +import torch +import torch.nn.functional as F # noqa: N812 +from torch import Tensor, nn +from transformers import AutoProcessor + +from lerobot.constants import ACTION, OBS_STATE +from lerobot.policies.normalize import ( + Normalize, + NormalizePerRobotType, + Unnormalize, + UnnormalizePerRobotType, +) +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config +from lerobot.policies.smolpi0.smolvlm_with_expert import ( + SmolVLMWithExpertModel +) +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.utils.utils import get_safe_dtype +OBS_IMAGE = "observation.image" +OBS_IMAGES = "observation.images" +ACTION = "action" +OBS_IMAGE_2 = "observation.image2" +OBS_IMAGE_3 = "observation.image3" +OBS_IMAGE_4 = "observation.image4" +TASK = "task" +ROBOT = "robot_type" +IMAGES_ORDER = { + OBS_IMAGE: 0, + OBS_IMAGE_2: 1, + OBS_IMAGE_3: 2, + OBS_IMAGE_4: 3, +} +from lerobot.policies.utils import ( + populate_queues, +) +import random +def create_sinusoidal_pos_embedding( + time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" +) -> Tensor: + """Computes sine-cosine positional embedding vectors for scalar positions.""" + if dimension % 2 != 0: + raise ValueError(f"dimension ({dimension}) must be divisible by 2") + + if time.ndim != 1: + raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + + dtype = get_safe_dtype(torch.float64, device.type) + fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) + period = min_period * (max_period / min_period) ** fraction + + # Compute the outer product + scaling_factor = 1.0 / period * 2 * math.pi + sin_input = scaling_factor[None, :] * time[:, None] + pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) + return pos_emb + + +def sample_beta(alpha, beta, bsize, device): + gamma1 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / alpha) + gamma2 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / beta) + return gamma1 / (gamma1 + gamma2) + + +def make_att_2d_masks(pad_masks, att_masks): + """Copied from big_vision. + + Tokens can attend to valid inputs tokens which have a cumulative mask_ar + smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to + setup several types of attention, for example: + + [[1 1 1 1 1 1]]: pure causal attention. + + [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between + themselves and the last 3 tokens have a causal attention. The first + entry could also be a 1 without changing behaviour. + + [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a + block can attend all previous blocks and all tokens on the same block. + + Args: + input_mask: bool[B, N] true if its part of the input, false if padding. + mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on + it and 0 where it shares the same attention mask as the previous token. + """ + if att_masks.ndim != 2: + raise ValueError(att_masks.ndim) + if pad_masks.ndim != 2: + raise ValueError(pad_masks.ndim) + + cumsum = torch.cumsum(att_masks, dim=1) + att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] + pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] + att_2d_masks = att_2d_masks & pad_2d_masks + return att_2d_masks + + +def resize_with_pad(img, width, height, pad_value=-1): + # assume no-op when width height fits already + if img.ndim != 4: + raise ValueError(f"(b,c,h,w) expected, but {img.shape}") + + cur_height, cur_width = img.shape[2:] + + ratio = max(cur_width / width, cur_height / height) + resized_height = int(cur_height / ratio) + resized_width = int(cur_width / ratio) + resized_img = F.interpolate( + img, size=(resized_height, resized_width), mode="bilinear", align_corners=False + ) + + pad_height = max(0, int(height - resized_height)) + pad_width = max(0, int(width - resized_width)) + + # pad on left and top of image + padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) + return padded_img + + +def pad_vector(vector, new_dim): + """Can be (batch_size x sequence_length x features_dimension) + or (batch_size x features_dimension) + """ + if vector.shape[-1] == new_dim: + return vector + shape = list(vector.shape) + current_dim = shape[-1] + shape[-1] = new_dim + new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) + new_vector[..., :current_dim] = vector + return new_vector + + +def normalize(x, min_val, max_val): + return (x - min_val) / (max_val - min_val) + + +def unnormalize(x, min_val, max_val): + return x * (max_val - min_val) + min_val + + +def safe_arcsin(value): + # This ensures that the input stays within + # [βˆ’1,1] to avoid invalid values for arcsin + return torch.arcsin(torch.clamp(value, -1.0, 1.0)) + + +def aloha_gripper_to_angular(value): + # Aloha transforms the gripper positions into a linear space. The following code + # reverses this transformation to be consistent with pi0 which is pretrained in + # angular space. + # + # These values are coming from the Aloha code: + # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED + value = unnormalize(value, min_val=0.01844, max_val=0.05800) + + # This is the inverse of the angular to linear transformation inside the Interbotix code. + def linear_to_radian(linear_position, arm_length, horn_radius): + value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) + return safe_arcsin(value) + + # The constants are taken from the Interbotix code. + value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) + + # Normalize to [0, 1]. + # The values 0.4 and 1.5 were measured on an actual Trossen robot. + return normalize(value, min_val=0.4, max_val=1.5) + + +def aloha_gripper_from_angular(value): + # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha. + # Note that the units are still angular but the range is different. + + # The values 0.4 and 1.5 were measured on an actual Trossen robot. + value = unnormalize(value, min_val=0.4, max_val=1.5) + + # These values are coming from the Aloha code: + # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE + return normalize(value, min_val=-0.6213, max_val=1.4910) + + +def aloha_gripper_from_angular_inv(value): + # Directly inverts the gripper_from_angular function. + value = unnormalize(value, min_val=-0.6213, max_val=1.4910) + return normalize(value, min_val=0.4, max_val=1.5) + +class SMOLPI0Policy(PreTrainedPolicy): + """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" + + config_class = SMOLPI0Config + name = "smolpi0" + + def __init__( + self, + config: SMOLPI0Config, + dataset_stats: dict[str, dict[str, Tensor]] | None = None, + ): + """ + Args: + config: Policy configuration class instance or None, in which case the default instantiation of + the configuration class is used. + dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected + that they will be passed with a call to `load_state_dict` before the policy is used. + """ + + super().__init__(config) + config.validate_features() + self.config = config + self.normalize_per_robot_type = getattr( + config, "normalize_per_robot_type", False + ) # FIXME(mshukor): assert in case of single dataset + if self.normalize_per_robot_type: + if not dataset_stats: + dataset_stats[config.robot_type] = {} + self.normalize_inputs = NormalizePerRobotType( + config.input_features, config.normalization_mapping, dataset_stats + ) + self.normalize_targets = NormalizePerRobotType( + config.output_features, config.normalization_mapping, dataset_stats + ) + self.unnormalize_outputs = UnnormalizePerRobotType( + config.output_features, config.normalization_mapping, dataset_stats + ) + else: + self.normalize_inputs = Normalize( + config.input_features, config.normalization_mapping, dataset_stats + ) + self.normalize_targets = Normalize( + config.output_features, config.normalization_mapping, dataset_stats + ) + self.unnormalize_outputs = Unnormalize( + config.output_features, config.normalization_mapping, dataset_stats + ) + + self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer + self.model = VLAFlowMatching(config) + self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split(",") + self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") + self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 + self.reset() + + def reset(self): + """This should be called whenever the environment is reset.""" + # self._action_queue = deque([], maxlen=self.config.n_action_steps) + self._queues = { + ACTION: deque(maxlen=self.config.n_action_steps), + } + if self.config.n_obs_steps > 1: + for k in self.config.input_features: + if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]): + self._queues[k] = deque(maxlen=self.config.n_obs_steps) + + def get_optim_params(self) -> dict: + if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr: + params = [ + { + "params": [ + p + for n, p in self.named_parameters() + if not ".vlm." in n and p.requires_grad + ] + }, + { + "params": [ + p + for n, p in self.named_parameters() + if ".vlm." in n and p.requires_grad + ], + "lr": self.config.optimizer_lr_vlm, + }, + ] + return params + + else: + return self.parameters() + + + def merge_peft_model_weights(self) -> None: + if "lora" in self.config.peft_method: + self.model.vlm_with_expert.merge_lora_weights() + + @torch.no_grad + def select_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + """Select a single action given environment observations. + + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. + """ + self.eval() + + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + + batch = self.normalize_inputs(batch) + + images, img_masks = self.prepare_images(batch) + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + + actions = self.model.sample_actions( + images, img_masks, lang_tokens, lang_masks, state, noise=noise + ) + # Unpad actions + original_action_dim = self.config.action_feature.shape[0] + actions = actions[:, :, :original_action_dim] + + actions = self.unnormalize_outputs({"action": actions, "robot_type": batch["robot_type"]})["action"] + + if self.config.adapt_to_pi_aloha: + actions = self._pi_aloha_encode_actions(actions) + + return actions + + @torch.no_grad + def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + """Select a single action given environment observations. + + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. + """ + self.eval() + + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + + batch = self.normalize_inputs(batch) + + self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by + # querying the policy. + if len(self._queues[ACTION]) == 0: + for k in batch: + if k in self._queues: + batch[k] = torch.stack(list(self._queues[k]), dim=1) + images, img_masks = self.prepare_images(batch) + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + actions = self.model.sample_actions( + images, img_masks, lang_tokens, lang_masks, state, noise=noise + ) + if self.config.predict_relative_actions and actions.ndim == 3: + # If the model predicts relative actions, we need to unpad the actions + # and then convert them to absolute actions. + if self.config.relative_actions_mode == "first": + actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :1]), dim=1) + elif self.config.relative_actions_mode == "state": + actions = actions + state.unsqueeze(1) + else: + actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :-1]), dim=1) + # Unpad actions + original_action_dim = self.config.action_feature.shape[0] + actions = actions[:, :, :original_action_dim] + + actions = self.unnormalize_outputs({"action": actions})["action"] + + if self.config.adapt_to_pi_aloha: + actions = self._pi_aloha_encode_actions(actions) + + # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue + # effectively has shape (n_action_steps, batch_size, *), hence the transpose. + self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) + return self._queues[ACTION].popleft() + + def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: + """Do a full training forward pass to compute the loss""" + if self.config.adapt_to_pi_aloha: + batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) + batch = self.normalize_inputs(batch) + batch = self.normalize_targets(batch) + images, img_masks = self.prepare_images( + batch + ) # FIXME(mshukor): adapte it to take into account already padded images in the batch + state = self.prepare_state(batch) + lang_tokens, lang_masks = self.prepare_language(batch) + actions = self.prepare_action(batch, state=state) + actions_is_pad = batch.get("actions_id_pad") + loss_dict = {} + losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) + loss_dict["losses_after_forward"] = losses.mean().clone() + + if actions_is_pad is not None: + in_episode_bound = ~actions_is_pad + losses = losses * in_episode_bound.unsqueeze(-1) + loss_dict["losses_after_in_ep_bound"] = losses.mean().clone() + + # Remove padding + losses = losses[:, :, : self.config.max_action_dim] + loss_dict["losses_after_rm_padding"] = losses.mean().clone() + + # For backward pass + loss = losses.mean() + # For backward pass + loss_dict["loss"] = loss + # # For logging + # loss_dict["l2_loss"] = loss.item() # remove for torch compile + return loss_dict + + def prepare_images(self, batch): + """Apply Pi0 preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and + convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. + """ + images = [] + img_masks = [] + present_img_keys = [key for key in self.config.image_features if key in batch] + missing_img_keys = [key for key in self.config.image_features if key not in batch] + + present_img_keys = sorted(present_img_keys, key=lambda k: IMAGES_ORDER.get(k, float("inf")), reverse=self.config.reverse_images_order) + if self.config.shuffle_camera_positions and ACTION in batch: # only during training + present_img_keys = random.sample(present_img_keys, len(present_img_keys)) + if len(present_img_keys) == 0: + raise ValueError( + f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" + ) + for i in range(self.num_past_images): + # Preprocess image features present in the batch + for key in present_img_keys: + img = batch[key][:, i, :, :, :] if batch[key].ndim == 5 else batch[key] + if self.config.resize_imgs_with_padding is not None: + img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) + + # Normalize from range [0,1] to [-1,1] as expacted by siglip + img = img * 2.0 - 1.0 + + bsize = img.shape[0] + device = img.device + if f"{key}_padding_mask" in batch: + mask = batch[f"{key}_padding_mask"].bool() + else: + mask = torch.ones(bsize, dtype=torch.bool, device=device) + images.append(img) + img_masks.append(mask) + + # Create image features not present in the batch + # as fully 0 padded images. + for num_empty_cameras in range(len(missing_img_keys)): + if num_empty_cameras >= self.config.empty_cameras: + break + img = torch.ones_like(img) * -1 + mask = torch.zeros_like(mask) + images.append(img) + img_masks.append(mask) + return images, img_masks + + def prepare_language(self, batch) -> tuple[Tensor, Tensor]: + """Tokenize the text input""" + device = batch[OBS_STATE].device + tasks = batch["task"] + if len(tasks) == 1: + tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] + + if self.config.add_prompt_template: + tasks = [f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" for task in tasks] + else: + tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] + tokenized_prompt = self.language_tokenizer.__call__( + tasks, + padding=self.config.pad_language_to, + padding_side="right", + max_length=self.config.tokenizer_max_length, + return_tensors="pt", + truncation=True, # FIXME(mshukor) + ) + + lang_tokens = tokenized_prompt["input_ids"].to(device=device) + lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) + + return lang_tokens, lang_masks + + def _pi_aloha_decode_state(self, state): + # Flip the joints. + for motor_idx in [1, 2, 8, 9]: + state[:, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) + return state + + def _pi_aloha_encode_actions(self, actions): + # Flip the joints. + for motor_idx in [1, 2, 8, 9]: + actions[:, :, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) + return actions + + def _pi_aloha_encode_actions_inv(self, actions): + # Flip the joints again. + for motor_idx in [1, 2, 8, 9]: + actions[:, :, motor_idx] *= -1 + # Reverse the gripper transformation that is being applied by the Aloha runtime. + for motor_idx in [6, 13]: + actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) + return actions + + def prepare_state(self, batch): + """Pad state""" + state = batch[OBS_STATE][:, -1, :] if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) else batch[OBS_STATE] # FIXME(mshukor): no state history for now + state = pad_vector(state, self.config.max_state_dim) + return state + + def prepare_action(self, batch, state=None): + """Pad action""" + actions = pad_vector(batch[ACTION], self.config.max_action_dim) + if self.config.predict_relative_actions and actions.ndim == 3: + if self.config.relative_actions_mode == "first": + actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1) + elif self.config.relative_actions_mode == "state": + assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], "Relative action mode 'state' requires the action and state to have the same dimension." + if state.ndim == 2: + state = state.unsqueeze(1) + actions = actions - state + else: + actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1) + return actions + +def pad_tensor(tensor, max_len, pad_value=0): + """ + Efficiently pads a tensor along sequence dimension to match max_len. + + Args: + tensor (torch.Tensor): Shape (B, L, ...) or (B, L). + max_len (int): Fixed sequence length. + pad_value (int/float): Value for padding. + + Returns: + torch.Tensor: Shape (B, max_len, ...) or (B, max_len). + """ + B, L = tensor.shape[:2] + + # Create a padded tensor of max_len and copy the existing values + padded_tensor = torch.full((B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device) + padded_tensor[:, :L] = tensor # Efficient in-place copy + + return padded_tensor + +class VLAFlowMatching(nn.Module): + """ + Ο€0: A Vision-Language-Action Flow Model for General Robot Control + + [Paper](https://www.physicalintelligence.company/download/pi0.pdf) + [Jax code](https://github.com/Physical-Intelligence/openpi) + + Designed by Physical Intelligence. Ported from Jax by Hugging Face. + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ actions β”‚ + β”‚ β–² β”‚ + β”‚ β”Œβ”΄β”€β”€β”€β”€β”€β” β”‚ + β”‚ kv cache β”‚Gemma β”‚ β”‚ + β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”‚Expertβ”‚ β”‚ + β”‚ β”‚ β”‚ β”‚ β”‚ + β”‚ β”Œβ”΄β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚x 10 β”‚ β”‚ + β”‚ β”‚ β”‚ β””β–²β”€β”€β–²β”€β”€β”˜ β”‚ + β”‚ β”‚ VLM β”‚ β”‚ β”‚ β”‚ + β”‚ β”‚ β”‚ β”‚ robot state β”‚ + β”‚ β”‚ β”‚ noise β”‚ + β”‚ β””β–²β”€β”€β–²β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ β”‚ β”‚ + β”‚ β”‚ image(s) β”‚ + β”‚ language tokens β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + """ + + def __init__(self, config): + super().__init__() + self.config = config + + self.vlm_with_expert = SmolVLMWithExpertModel(model_id=self.config.vlm_model_name, + freeze_vision_encoder=self.config.freeze_vision_encoder, + train_expert_only=self.config.train_expert_only, + attention_implementation=self.config.attention_implementation, + load_vlm_weights=self.config.load_vlm_weights, + attention_mode=self.config.attention_mode, + num_expert_layers=self.config.num_expert_layers, + num_vlm_layers=self.config.num_vlm_layers, + self_attn_every_n_layers=self.config.self_attn_every_n_layers, + expert_width_multiplier=self.config.expert_width_multiplier, + self_attn_only_actions=self.config.self_attn_only_actions, + ) + # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert) + self.vlm_with_expert.configure_peft(config=self.config) + # Projections are float32 + self.state_to_prefix = self.config.state_to_prefix + if self.state_to_prefix: + self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size) + else: + self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size) + self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) + self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) + + self.action_time_mlp_in = nn.Linear(self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size) + self.action_time_mlp_out = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size) + + self.set_requires_grad() + # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ... + if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]): + if "SmolVLM-Instruct" in self.config.vlm_model_name: + self.fake_image_token = 49152 + self.global_image_token = [44, 13906, 29, 6266, 46] + self.global_image_start_token = torch.tensor([self.fake_image_token] + self.global_image_token, dtype=torch.long) + else: + self.fake_image_token = 49189 + self.global_image_token = 49152 + self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + else: + self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id + self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id + self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + + self.add_image_special_tokens = self.config.add_image_special_tokens + self.add_local_special_image_tokens = self.config.add_local_special_image_tokens + self.local_image_tokens = [torch.tensor([self.fake_image_token, tok], dtype=torch.long) for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]] # assume 3 x 3 grid + + self.local_image_start_token = self.global_image_start_token + self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) + self.prefix_length = self.config.prefix_length + self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") + self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 + self.causal_attention_on_history = self.config.causal_attention_on_history + + + + + # def configure_peft(self, model): + # # return model + # self.peft_method = self.config.peft_method + # if "lora" in self.peft_method: + # peft_config = self.config.peft_config + # target_modules = peft_config.target_modules + # if not isinstance(target_modules, list): + # target_modules = target_modules.split(",") + # lora_config = LoraConfig( + # task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) + # r=peft_config.r, # The rank of the low-rank adaptation + # lora_alpha=peft_config.lora_alpha, # Scaling factor + # lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers + # target_modules=target_modules, # The components where LoRA is applied + # exclude_modules=["gemma_expert", "model.gemma_expert.model.layers"], # FIXME(mshukor): this does not work for now + # ) + # # LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=1, lora_dropout=0, target_modules=["q_proj"], exclude_modules=["gemma_expert"]) + # self.lora_config = lora_config + # # Apply LoRA and ensure only LoRA parameters are trainable + + # model = get_peft_model(model, lora_config) + # assert self.config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? + # for name, param in model.named_parameters(): + # if ( + # "lora" in name + # ): # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer + # param.requires_grad = True + # return model + + def set_requires_grad(self): + for params in self.state_proj.parameters(): + params.requires_grad = self.config.train_state_proj + + def sample_noise(self, shape, device): + noise = torch.normal( + mean=0.0, + std=1.0, + size=shape, + dtype=torch.float32, + device=device, + ) + return noise + + def sample_time(self, bsize, device): + time_beta = sample_beta(1.5, 1.0, bsize, device) + time = time_beta * 0.999 + 0.001 + return time.to(dtype=torch.float32, device=device) + + def embed_prefix( + self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Embed images with SigLIP and language tokens with embedding layer to prepare + for SmolVLM transformer processing. + """ + # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty + embs = [] + pad_masks = [] + att_masks = [] + num_images = len(images) // self.num_past_images + # TODO: remove for loop + for img_idx, ( + img, + img_mask, + ) in enumerate(zip(images, img_masks, strict=False)): + # FIXME(mshukor): add special tokens for the history each history_steps or not + if self.add_image_special_tokens: + if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1: + local_token_idx = img_idx % num_images + image_start_token = self.vlm_with_expert.embed_language_tokens(self.local_image_tokens[local_token_idx].to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + else: + image_start_token = self.vlm_with_expert.embed_language_tokens(self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + image_start_mask = torch.ones_like(image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device) + if self.causal_attention_on_history and img_idx % num_images == 0: + att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1) + else: + att_masks += [0] * (image_start_mask.shape[-1]) + embs.append(image_start_token) + pad_masks.append(image_start_mask) + + img_emb = self.vlm_with_expert.embed_image(img) + img_emb = img_emb #.to(dtype=self.vlm_with_expert.type) + + # Normalize image embeddings + img_emb_dim = img_emb.shape[-1] + img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) + + bsize, num_img_embs = img_emb.shape[:2] + img_mask = img_mask[:, None].expand(bsize, num_img_embs) + + # FIXME(mshukor): add special image tokens. Assume no tiling fake global images fake + # template <|im_start|>User: What actions? image tokens \nAssistant: or processor.apply_chat_template? + # processor.fake_image_token + # processor.global_image_token + + embs.append(img_emb) + pad_masks.append(img_mask) + + att_masks += [0] * (num_img_embs) + if self.add_image_special_tokens: + if not self.add_local_special_image_tokens or (self.add_local_special_image_tokens and img_idx % num_images == num_images - 1): + image_end_token = self.vlm_with_expert.embed_language_tokens(self.image_end_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + image_end_mask = torch.ones_like(image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device) + embs.append(image_end_token) + pad_masks.append(image_end_mask) + att_masks += [0] * (image_end_mask.shape[1]) + lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) + # Normalize language embeddings + lang_emb_dim = lang_emb.shape[-1] + lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm? + + embs.append(lang_emb) + pad_masks.append(lang_masks) + + # full attention between image and language inputs + num_lang_embs = lang_emb.shape[1] + att_masks += [0] * num_lang_embs + + if state is not None and self.state_to_prefix: + state_emb = self.state_proj(state) + state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + embs.append(state_emb) + bsize = state_emb.shape[0] + dtype = state_emb.dtype + device = state_emb.device + + states_seq_len = state_emb.shape[1] + state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) + pad_masks.append(state_mask) + + # Set attention masks so that image and language inputs do not attend to state or actions + # att_masks += [1] + [0]*(states_seq_len - 1) + att_masks += [1]*(states_seq_len) + embs = torch.cat(embs, dim=1) + pad_masks = torch.cat(pad_masks, dim=1) + att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) + att_masks = att_masks[None, :] + + seq_len = pad_masks.shape[1] + if seq_len < self.prefix_length: + embs = pad_tensor(embs, self.prefix_length, pad_value=0) + pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) + att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) + + att_masks = att_masks.expand(bsize, -1) + + return embs, pad_masks, att_masks + + def embed_suffix(self, state, noisy_actions, timestep): + """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" + embs = [] + pad_masks = [] + att_masks = [] + + # Embed state + if not self.state_to_prefix: + state_emb = self.state_proj(state) + state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + embs.append(state_emb) + bsize = state_emb.shape[0] + dtype = state_emb.dtype + device = state_emb.device + + states_seq_len = state_emb.shape[1] + state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) + pad_masks.append(state_mask) + + # Set attention masks so that image and language inputs do not attend to state or actions + att_masks += [1] + [0]*(states_seq_len - 1) + + + # Fuse timestep + action information using an MLP + action_emb = self.action_in_proj(noisy_actions) + device = action_emb.device + bsize = action_emb.shape[0] + dtype = action_emb.dtype + # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] + time_emb = create_sinusoidal_pos_embedding( + timestep, self.vlm_with_expert.expert_hidden_size, min_period=4e-3, max_period=4.0, device=device + ) + time_emb = time_emb.type(dtype=dtype) + + time_emb = time_emb[:, None, :].expand_as(action_emb) + action_time_emb = torch.cat([action_emb, time_emb], dim=2) + + action_time_emb = self.action_time_mlp_in(action_time_emb) + action_time_emb = F.silu(action_time_emb) # swish == silu + action_time_emb = self.action_time_mlp_out(action_time_emb) + + # Add to input tokens + embs.append(action_time_emb) + + bsize, action_time_dim = action_time_emb.shape[:2] + action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) + pad_masks.append(action_time_mask) + + # Set attention masks so that image, language and state inputs do not attend to action tokens + if self.config.causal_action_attention_mask: + att_masks += [1] * self.config.chunk_size + else: + att_masks += [1] + ([0] * (self.config.chunk_size - 1)) + embs = torch.cat(embs, dim=1) + pad_masks = torch.cat(pad_masks, dim=1) + att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) + att_masks = att_masks[None, :].expand(bsize, len(att_masks)) + return embs, pad_masks, att_masks + + def forward( + self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None + ) -> Tensor: + """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" + if noise is None: + noise = self.sample_noise(actions.shape, actions.device) + + if time is None: + time = self.sample_time(actions.shape[0], actions.device) + + time_expanded = time[:, None, None] + if self.config.regression_loss: + # Hack to compare regression to flow matching + time = torch.zeros_like(time, dtype=time.dtype, device=time.device) + x_t = torch.zeros_like(actions, dtype=actions.dtype, device=actions.device) + u_t = actions + else: + x_t = time_expanded * noise + (1 - time_expanded) * actions + u_t = noise - actions + prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( + images, img_masks, lang_tokens, lang_masks, state=state + ) + suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time) + + pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) + att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) + + att_2d_masks = make_att_2d_masks(pad_masks, att_masks) + position_ids = torch.cumsum(pad_masks, dim=1) - 1 + (_, suffix_out), _ = self.vlm_with_expert.forward( + attention_mask=att_2d_masks, + position_ids=position_ids, + past_key_values=None, + inputs_embeds=[prefix_embs, suffix_embs], + use_cache=False, + fill_kv_cache=False, + ) + suffix_out = suffix_out[:, -self.config.chunk_size :] + # Original openpi code, upcast attention output + suffix_out = suffix_out.to(dtype=torch.float32) + v_t = self.action_out_proj(suffix_out) + if self.config.regression_loss: + losses = F.l1_loss(u_t, v_t, reduction="none") + else: + losses = F.mse_loss(u_t, v_t, reduction="none") + return losses + + def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: + """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" + bsize = state.shape[0] + device = state.device + + if noise is None: + actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) + noise = self.sample_noise(actions_shape, device) + + prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( + images, img_masks, lang_tokens, lang_masks, state=state + ) + prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) + prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 + # Compute image and language key value cache + _, past_key_values = self.vlm_with_expert.forward( + attention_mask=prefix_att_2d_masks, + position_ids=prefix_position_ids, + past_key_values=None, + inputs_embeds=[prefix_embs, None], + use_cache=self.config.use_cache, + fill_kv_cache=True, + ) + if self.config.regression_loss: + x_t = torch.zeros_like(noise, dtype=torch.float32, device=device) + expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device) + x_t = self.denoise_step( + state, + prefix_pad_masks, + past_key_values, + x_t, + expanded_time, + ) + else: + dt = -1.0 / self.config.num_steps + dt = torch.tensor(dt, dtype=torch.float32, device=device) + + x_t = noise + time = torch.tensor(1.0, dtype=torch.float32, device=device) + while time >= -dt / 2: + expanded_time = time.expand(bsize) + v_t = self.denoise_step( + state, + prefix_pad_masks, + past_key_values, + x_t, + expanded_time, + ) + + # Euler step + x_t += dt * v_t + time += dt + return x_t + + def denoise_step( + self, + state, + prefix_pad_masks, + past_key_values, + x_t, + timestep, + ): + """Apply one denoising step of the noise `x_t` at a given timestep.""" + suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, timestep) + + suffix_len = suffix_pad_masks.shape[1] + batch_size = prefix_pad_masks.shape[0] + prefix_len = prefix_pad_masks.shape[1] + prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) + + suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) + + full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) + prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] + position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 + + outputs_embeds, _ = self.vlm_with_expert.forward( + attention_mask=full_att_2d_masks, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=[None, suffix_embs], + use_cache=self.config.use_cache, + fill_kv_cache=False, + ) + suffix_out = outputs_embeds[1] + suffix_out = suffix_out[:, -self.config.chunk_size :] + suffix_out = suffix_out.to(dtype=torch.float32) + v_t = self.action_out_proj(suffix_out) + return v_t diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py new file mode 100644 index 000000000..b910679f1 --- /dev/null +++ b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py @@ -0,0 +1,824 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union +from functools import partial +import copy + +import torch +import torch.version +import torch.nn.functional as F # noqa: N812 +from peft import LoraConfig, TaskType, get_peft_model +from pytest import Cache +from torch import nn +from transformers import ( + AutoConfig, + GemmaForCausalLM, + AutoModelForImageTextToText, + AutoProcessor, + PretrainedConfig, + PreTrainedModel, + SmolVLMForConditionalGeneration, + AutoModel, + AutoModelForVision2Seq, +) +from transformers.models.auto import CONFIG_MAPPING +from transformers import SmolVLMModel, SmolVLMConfig +from lerobot.policies.smolpi0.flex_attention import flex_attention_forward + +def _round_up_to_multiple(x, multiple): + return (x + multiple - 1) // multiple * multiple + + +def apply_rope(x, positions, max_wavelength=10_000): + """ + Applies RoPE positions [B, L] to x [B, L, H, D]. + """ + d_half = x.shape[-1] // 2 + device = x.device + dtype = x.dtype + x = x.to(torch.float32) + + freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device) + timescale = max_wavelength**freq_exponents + radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32) + + radians = radians[..., None, :] + + sin = torch.sin(radians) # .to(dtype=dtype) + cos = torch.cos(radians) # .to(dtype=dtype) + + x1, x2 = x.split(d_half, dim=-1) + res = torch.empty_like(x) + res[..., :d_half] = x1 * cos - x2 * sin + res[..., d_half:] = x2 * cos + x1 * sin + + return res.to(dtype) + + +# class SmolVLMWithExpertConfig(PretrainedConfig): +# model_type = "SmolVLMWithExpertModel" +# sub_configs = {"smolvlm_config": AutoConfig, "lm_expert_config": AutoConfig} + +# def __init__( +# self, +# smolvlm_config: dict | None = None, +# lm_expert_config: dict | None = None, +# freeze_vision_encoder: bool = True, +# train_expert_only: bool = True, +# attention_implementation: str = "eager", +# load_vlm_weights: bool = False, +# **kwargs, +# ): +# self.load_vlm_weights = load_vlm_weights +# self.freeze_vision_encoder = freeze_vision_encoder +# self.train_expert_only = train_expert_only +# self.attention_implementation = attention_implementation + +# if smolvlm_config is None: +# # Default config from Pi0 +# self.smolvlm_config = CONFIG_MAPPING["smolvlm"]( +# transformers_version="4.48.1", +# _vocab_size=257152, +# bos_token_id=2, +# eos_token_id=1, +# hidden_size=2048, +# image_token_index=257152, +# model_type="smolvlm", +# pad_token_id=0, +# projection_dim=2048, +# text_config={ +# "hidden_activation": "gelu_pytorch_tanh", +# "hidden_size": 2048, +# "intermediate_size": 16384, +# "model_type": "gemma", +# "num_attention_heads": 8, +# "num_hidden_layers": 18, +# "num_image_tokens": 256, +# "num_key_value_heads": 1, +# "torch_dtype": "float32", +# "vocab_size": 257152, +# }, +# vision_config={ +# "hidden_size": 1152, +# "intermediate_size": 4304, +# "model_type": "siglip_vision_model", +# "num_attention_heads": 16, +# "num_hidden_layers": 27, +# "num_image_tokens": 256, +# "patch_size": 14, +# "projection_dim": 2048, +# "projector_hidden_act": "gelu_fast", +# "torch_dtype": "float32", +# "vision_use_head": False, +# }, +# ) +# elif isinstance(self.paligemma_config, dict): +# # Override Pi0 default config for PaliGemma +# if "model_type" not in gemma_expert_config: +# paligemma_config["model_type"] = "paligemma" + +# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] +# self.paligemma_config = cfg_cls(**paligemma_config) + +# if gemma_expert_config is None: +# # Default config from Pi0 +# self.gemma_expert_config = CONFIG_MAPPING["gemma"]( +# attention_bias=False, +# attention_dropout=0.0, +# bos_token_id=2, +# eos_token_id=1, +# head_dim=256, +# hidden_act="gelu_pytorch_tanh", +# hidden_activation="gelu_pytorch_tanh", +# hidden_size=1024, +# initializer_range=0.02, +# intermediate_size=4096, +# max_position_embeddings=8192, +# model_type="gemma", +# num_attention_heads=8, +# num_hidden_layers=18, +# num_key_value_heads=1, +# pad_token_id=0, +# rms_norm_eps=1e-06, +# rope_theta=10000.0, +# torch_dtype="float32", +# transformers_version="4.48.1", +# use_cache=True, +# vocab_size=257152, +# ) +# elif isinstance(self.gemma_expert_config, dict): +# # Override Pi0 default config for Gemma Expert +# if "model_type" not in gemma_expert_config: +# gemma_expert_config["model_type"] = "gemma" + +# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] +# self.gemma_expert_config = cfg_cls(**gemma_expert_config) + +# super().__init__(**kwargs) + +# def __post_init__(self): +# super().__post_init__() +# if self.train_expert_only and not self.freeze_vision_encoder: +# raise ValueError( +# "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible." +# ) + +# if self.attention_implementation not in ["eager", "fa2", "flex"]: +# raise ValueError( +# f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'." +# ) + +def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256): + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + return hidden_dim + + +class SmolVLMWithExpertModel(nn.Module): + # config_class = PaliGemmaWithExpertConfig + + def __init__(self, model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + load_vlm_weights: bool = True, train_expert_only: bool = True, freeze_vision_encoder: bool = False, + attention_implementation: str = "eager", attention_mode: str = "self_attn", num_expert_layers: int = -1, + num_vlm_layers: int = -1, self_attn_every_n_layers: int = -1, expert_width_multiplier: float = 0.5, self_attn_only_actions: bool = False): + super().__init__() + if load_vlm_weights: + print(f"Loading {model_id} weights ...") + if "SmolVLM-" in model_id: + self.vlm = AutoModelForVision2Seq.from_pretrained( + model_id, + device_map="cuda", + torch_dtype="bfloat16", + low_cpu_mem_usage=True, + ) + else: + # model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + self.vlm = AutoModelForImageTextToText.from_pretrained( + model_id, + device_map="cuda", + torch_dtype="bfloat16", + low_cpu_mem_usage=True, + # attn_implementation="eager", + # attn_implementation="flash_attention_2" + ) + config = self.vlm.config + else: + config = AutoConfig.from_pretrained(model_id) + self.vlm = SmolVLMForConditionalGeneration(config=config) + self.processor = AutoProcessor.from_pretrained(model_id) + if num_vlm_layers > 0: + print(f"Reducing the number of VLM layers to {num_vlm_layers} ...") + self.get_vlm_model().text_model.layers = self.get_vlm_model().text_model.layers[:num_vlm_layers] + self.num_vlm_layers = len(self.get_vlm_model().text_model.layers) + self.config = config + # Smaller lm expert + lm_expert_config = copy.deepcopy(config.text_config) + hidden_size = lm_expert_config.hidden_size + lm_expert_config.hidden_size = int(hidden_size*expert_width_multiplier) #hidden_size // 2 + lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size*expert_width_multiplier)) + lm_expert_config.num_hidden_layers = self.num_vlm_layers + if num_expert_layers > 0 : + assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}" + lm_expert_config.num_hidden_layers = num_expert_layers + # lm_expert_config.head_dim = lm_expert_config.head_dim * 2 + self.lm_expert = AutoModel.from_config(lm_expert_config) + + self.num_expert_layers = len(self.lm_expert.layers) + self.self_attn_every_n_layers = self_attn_every_n_layers + self.self_attn_only_actions = self_attn_only_actions + if "cross" in attention_mode: + # Reshape qkv projections to have the same input dimension as the vlm + for layer_idx in range(len(self.lm_expert.layers)): + if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0: + continue + self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear( + config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + ) + self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear( + config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + ) + # Remove unused embed_tokens + self.lm_expert.embed_tokens = None + + self.num_attention_heads = self.config.text_config.num_attention_heads + self.num_key_value_heads = self.config.text_config.num_key_value_heads + + self.freeze_vision_encoder = freeze_vision_encoder + self.train_expert_only = train_expert_only + self.attention_implementation = attention_implementation + self.attention_mode = attention_mode + self.expert_hidden_size = lm_expert_config.hidden_size + # self.to_bfloat16_like_physical_intelligence() + self.set_requires_grad() + + def configure_peft(self, config): + # return model + self.peft_method = config.peft_method + self.peft_target_model = config.peft_target_model + if "lora" in self.peft_method: + peft_config = config.peft_config + target_modules = peft_config.target_modules + if not isinstance(target_modules, list): + target_modules = target_modules.split(",") + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) + r=peft_config.r, # The rank of the low-rank adaptation + lora_alpha=peft_config.lora_alpha, # Scaling factor + lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers + target_modules=target_modules, # The components where LoRA is applied + exclude_modules=[ + "lm_expert", + "model.lm_expert.model.layers", + ], # FIXME(mshukor): this does not work for now + ) + self.lora_config = lora_config + # Apply LoRA and ensure only LoRA parameters are trainable + if "text" in self.peft_target_model: + self.get_vlm_model().text_model = get_peft_model(self.get_vlm_model().text_model, lora_config) + else: + self.vlm = get_peft_model(self.vlm, lora_config) + # assert config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? + for name, param in self.vlm.named_parameters(): + if ( + "lora" in name and "text_model.model.layers.17" not in name + ): # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer + param.requires_grad = True + else: + param.requires_grad = False + + def merge_lora_weights(self): + """ + Merge LoRA weights into the base model. + """ + if "text" in self.peft_target_model: + self.get_vlm_model().text_model = self.get_vlm_model().text_model.merge_and_unload() + else: + self.vlm = self.vlm.merge_and_unload() + + def get_vlm_model(self,): + if hasattr(self.vlm.model, "model"): # When using peft + return self.vlm.model.model + else: + return self.vlm.model + + def set_requires_grad(self): + if self.freeze_vision_encoder: + self.get_vlm_model().vision_model.eval() + for params in self.get_vlm_model().vision_model.parameters(): + params.requires_grad = False + if self.train_expert_only: + self.vlm.eval() + for params in self.vlm.parameters(): + params.requires_grad = False + else: + # To avoid unused params issue with distributed training + last_layers = [self.num_vlm_layers - 1] + if self.num_vlm_layers != self.num_expert_layers and self.num_vlm_layers % self.num_expert_layers == 0: + last_layers.append(self.num_vlm_layers - 2) + frozen_layers = [ + "lm_head", + "text_model.model.norm.weight", + ] + for layer in last_layers: + frozen_layers.append(f"text_model.model.layers.{layer}.") + + for name, params in self.vlm.named_parameters(): + if any( + [ + k in name + for k in frozen_layers + ] + ): + params.requires_grad = False + # To avoid unused params issue with distributed training + for name, params in self.lm_expert.named_parameters(): + if any( + [ + k in name + for k in [ + "lm_head", + ] + ] + ): + params.requires_grad = False + + def train(self, mode: bool = True): + super().train(mode) + + if self.freeze_vision_encoder: + self.get_vlm_model().vision_model.eval() + + if self.train_expert_only: + self.vlm.eval() + + # def to_bfloat16_like_physical_intelligence(self): + # self.vlm = self.vlm.to(dtype=torch.bfloat16) + + # params_to_change_dtype = [ + # "language_model.model.layers", + # "gemma_expert.model.layers", + # "vision_tower", + # "multi_modal", + # ] + # for name, param in self.named_parameters(): + # if any(selector in name for selector in params_to_change_dtype): + # param.data = param.data.to(dtype=torch.bfloat16) + + def embed_image(self, image: torch.Tensor): + patch_attention_mask = None + # # FIXME(mshukor): probably not needed as we don't have padded images here + # pixel_values = image.unsqueeze(1) + # batch_size, num_images, num_channels, height, width = pixel_values.shape + # pixel_values = pixel_values + # pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:]) + + # # Remove padding images - padding images are full 0. + # nb_values_per_image = pixel_values.shape[1:].numel() + # real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image + + # if not any(real_images_inds): + # # no images, leave one empty image. + # real_images_inds[0] = True + + # pixel_values = pixel_values[real_images_inds].contiguous() + + # # Handle the vision attention mask + + # pixel_attention_mask = torch.ones( + # size=[pixel_values.shape[i] for i in (0, 2, 3)], + # dtype=torch.bool, + # device=pixel_values.device, + # ) + + # patch_size = self.vlm.config.vision_config.patch_size + # patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size) + # patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) + # patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # FIXME(mshukor): add special image tokens specific to smolvlm + # Get sequence from the vision encoder + image_hidden_states = self.get_vlm_model().vision_model( + pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype), + patch_attention_mask=patch_attention_mask, + ).last_hidden_state + # Modality projection & resampling + image_hidden_states = self.get_vlm_model().connector(image_hidden_states) + return image_hidden_states + + def embed_language_tokens(self, tokens: torch.Tensor): + return self.get_vlm_model().text_model.get_input_embeddings()(tokens) + + def forward_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values=None) -> list[torch.Tensor]: + + query_states = [] + key_states = [] + value_states = [] + for i, hidden_states in enumerate(inputs_embeds): + layer = model_layers[i][layer_idx] + if hidden_states is None or layer is None: + continue + + # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) + # hidden_states = hidden_states * normalizer + hidden_states = layer.input_layernorm(hidden_states) + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) + + hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + query_states.append(query_state) + key_states.append(key_state) + value_states.append(value_state) + + # FIXME(mshukor): self attention always when having only the prefix + # B,L,H,D with L sequence length, H number of heads, D head dim + # concatenate on the number of embeddings/tokens + query_states = torch.cat(query_states, dim=1) + key_states = torch.cat(key_states, dim=1) + value_states = torch.cat(value_states, dim=1) + # FIXME(mshukor): seq should be B, H, L, D ? + seq_len = query_states.shape[1] + if seq_len < position_ids.shape[1]: + _position_ids = position_ids[:, :seq_len] + _attention_mask = attention_mask[:, :seq_len, :seq_len] + else: + _position_ids = position_ids + _attention_mask = attention_mask + + if self.self_attn_only_actions: + attention_mask_ = _attention_mask.clone() + position_ids_ = _position_ids.clone() + if inputs_embeds[1] is not None: + suffix_len = inputs_embeds[1].shape[1] + attention_mask_[:, -suffix_len:, :-suffix_len] = False + position_ids_[:, -suffix_len:] = _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None] + else: + attention_mask_ = _attention_mask + position_ids_ = _position_ids + + query_states = apply_rope(query_states, position_ids_) # FIXME(mshukor): this assumes we have always the vlm features? + key_states = apply_rope(key_states, position_ids_) + + if use_cache and past_key_values is None: + past_key_values = {} + + if use_cache: + if fill_kv_cache: + past_key_values[layer_idx] = { + "key_states": key_states, + "value_states": value_states, + } + else: + # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # the max len, then we (for instance) double the cache size. This implementation already exists + # in `transformers`. (molbap) + key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) + value_states = torch.cat( + [past_key_values[layer_idx]["value_states"], value_states], dim=1 + ) + + attention_interface = self.get_attention_interface() + + att_output = attention_interface( + attention_mask_, batch_size, head_dim, query_states, key_states, value_states + ) + # att_output = att_output.to(dtype=models[i].dtype) + + return [att_output], past_key_values + + + def forward_cross_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values = None) -> list[torch.Tensor]: + + attention_interface = self.get_attention_interface() + + att_outputs = [] + assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}" + + if len(inputs_embeds) == 2 and not past_key_values: + # Prefix attention + seq_len = inputs_embeds[0].shape[1] + position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:] + prefix_attention_mask = attention_mask[:, :seq_len, :seq_len] + + layer = model_layers[0][layer_idx] + + hidden_states = layer.input_layernorm(inputs_embeds[0]) + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) + + hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + # B,L,H,D with L sequence length, H number of heads, D head dim + query_states = apply_rope(query_state, position_id) + key_states = apply_rope(key_state, position_id) + + att_output = attention_interface( + prefix_attention_mask, batch_size, head_dim, query_states, key_states, value_states + ) + att_outputs.append(att_output) + else: + expert_position_id = position_ids + + if use_cache and past_key_values is None: + past_key_values = {} + + if use_cache: + if fill_kv_cache: + past_key_values[layer_idx] = { + "key_states": key_states, + "value_states": value_states, + } + else: + # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # the max len, then we (for instance) double the cache size. This implementation already exists + # in `transformers`. (molbap) + key_states = past_key_values[layer_idx]["key_states"] + value_states = past_key_values[layer_idx]["value_states"] + + + # Expert + expert_layer = model_layers[1][layer_idx] + if expert_layer is not None: + expert_hidden_states = expert_layer.input_layernorm(inputs_embeds[1]) + + expert_input_shape = expert_hidden_states.shape[:-1] + expert_hidden_shape = (*expert_input_shape, -1, expert_layer.self_attn.head_dim) + + expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype) + expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape) + + + _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(*key_states.shape[:2], -1) + expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(*_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim) # k_proj should have same dim as kv + + _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(*value_states.shape[:2], -1) + expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(*_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim) + + expert_position_id = expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values # start from 0 + expert_attention_mask = attention_mask[:, -inputs_embeds[1].shape[1]:, :expert_key_states.shape[1]:] # take into account kv + + expert_query_states = apply_rope(expert_query_state, expert_position_id) + # expert_key_states = apply_rope(expert_key_state, expert_position_id) + + att_output = attention_interface( + expert_attention_mask, batch_size, head_dim, expert_query_states, expert_key_states, expert_value_states + ) + att_outputs.append(att_output) + else: + att_outputs.append(None) + + # att_output = att_output.to(dtype=models[i].dtype) + return att_outputs, past_key_values + + def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient? + vlm_layers = [] + expert_layers = [] + multiple_of = self.num_vlm_layers // self.num_expert_layers + for i in range(self.num_vlm_layers): + if multiple_of > 0 and i > 0 and i % multiple_of != 0: + expert_layer = None + else: + expert_layer_index = i // multiple_of if multiple_of > 0 else i + expert_layer = models[1].layers[expert_layer_index] + vlm_layers.append(models[0].layers[i]) + expert_layers.append(expert_layer) + return [vlm_layers, expert_layers] + # TODO: break down this huge forward into modules or functions + def forward( + self, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, + inputs_embeds: List[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + fill_kv_cache: Optional[bool] = None, + ): + models = [self.get_vlm_model().text_model, self.lm_expert] + model_layers = self.get_model_layers(models) + for hidden_states in inputs_embeds: + # TODO this is very inefficient + # dtype is always the same, batch size too (if > 1 len) + # device could be trickier in multi gpu edge cases but that's it + if hidden_states is None: + continue + batch_size = hidden_states.shape[0] + + # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train + if self.attention_implementation == "flex": + if inputs_embeds[0] is not None and inputs_embeds[1] is not None and attention_mask.shape[-1] == attention_mask.shape[-2] and past_key_values is None: # Now only during training + seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1] + padded_seq_len = _round_up_to_multiple(seq_len, 128) # FIXME(mshukor): more efficient to have a fixed seq len? + b_mask, q_len, kv_len = attention_mask.shape # The shape of your mask + pad = padded_seq_len - q_len + attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True) + inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0) + position_ids = F.pad(position_ids, (0, pad), value=0) + + + # RMSNorm + num_layers = self.num_vlm_layers + head_dim = self.vlm.config.text_config.head_dim + for layer_idx in range(num_layers): + if fill_kv_cache or "cross" not in self.attention_mode or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0): + att_outputs, past_key_values = self.forward_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + else: + att_outputs, past_key_values = self.forward_cross_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + # query_states = [] + # key_states = [] + # value_states = [] + # for i, hidden_states in enumerate(inputs_embeds): + # if hidden_states is None: + # continue + # layer = models[i].layers[layer_idx] + # # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) + # # hidden_states = hidden_states * normalizer + # hidden_states = layer.input_layernorm(hidden_states) + + # input_shape = hidden_states.shape[:-1] + # hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) + + # hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) + # query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) + # key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) + # value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) + + # query_states.append(query_state) + # key_states.append(key_state) + # value_states.append(value_state) + + # # FIXME(mshukor): self attention always when having only the prefix + # # B,L,H,D with L sequence length, H number of heads, D head dim + # # concatenate on the number of embeddings/tokens + # query_states = torch.cat(query_states, dim=1) + # key_states = torch.cat(key_states, dim=1) + # value_states = torch.cat(value_states, dim=1) + # # FIXME(mshukor): seq should be B, H, L, D ? + # query_states = apply_rope(query_states, position_ids) + # key_states = apply_rope(key_states, position_ids) + + # if use_cache and past_key_values is None: + # past_key_values = {} + + # if use_cache: + # if fill_kv_cache: + # past_key_values[layer_idx] = { + # "key_states": key_states, + # "value_states": value_states, + # } + # else: + # # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. + # # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach + # # the max len, then we (for instance) double the cache size. This implementation already exists + # # in `transformers`. (molbap) + # key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) + # value_states = torch.cat( + # [past_key_values[layer_idx]["value_states"], value_states], dim=1 + # ) + + # attention_interface = self.get_attention_interface() + # att_output = attention_interface( + # attention_mask, batch_size, head_dim, query_states, key_states, value_states + # ) + + + # att_output = att_output.to(dtype=models[i].dtype) + + # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len]) + outputs_embeds = [] + start = 0 + for i, hidden_states in enumerate(inputs_embeds): + # layer = models[i].layers[layer_idx] + layer = model_layers[i][layer_idx] + att_output = att_outputs[i] if i < len(att_outputs) else att_outputs[0] # in case of self_attn + if hidden_states is not None: + if layer is None: + outputs_embeds.append(hidden_states) + continue + end = start + hidden_states.shape[1] + + if att_output.dtype != layer.self_attn.o_proj.weight.dtype: + att_output = att_output.to(layer.self_attn.o_proj.weight.dtype) + att_out = att_output[:, start:end] + out_emb = layer.self_attn.o_proj(att_out) + + # TODO: first dropout (by default 0.0) + # first residual + out_emb += hidden_states + after_first_residual = out_emb.clone() + + out_emb = layer.post_attention_layernorm(out_emb) + out_emb = layer.mlp(out_emb) + + # TODO: second dropout (by default 0.0) + + # second residual + out_emb += after_first_residual + + outputs_embeds.append(out_emb) + + start = end if len(att_outputs) == 1 else 0 + else: + outputs_embeds.append(None) + + inputs_embeds = outputs_embeds + + # final norm + outputs_embeds = [] + for i, hidden_states in enumerate(inputs_embeds): + if hidden_states is not None: + out_emb = models[i].norm(hidden_states) + outputs_embeds.append(out_emb) + else: + outputs_embeds.append(None) + return outputs_embeds, past_key_values + + def get_attention_interface(self): + if self.attention_implementation == "fa2": + attention_interface = self.flash_attention_forward + elif self.attention_implementation == "flex": + attention_interface = partial(flex_attention_forward, num_att_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads) + else: + attention_interface = self.eager_attention_forward + return attention_interface + + def flash_attention_forward( + self, attention_mask, batch_size, head_dim, query_states, key_states, value_states + ): + raise NotImplementedError("FA2 is not implemented (yet)") + + def eager_attention_forward( + self, attention_mask, batch_size, head_dim, query_states, key_states, value_states + ): + num_att_heads = self.num_attention_heads + num_key_value_heads = self.num_key_value_heads + num_key_value_groups = num_att_heads // num_key_value_heads + + # query_states: batch_size, sequence_length, num_att_head, head_dim + # key_states: batch_size, sequence_length, num_key_value_head, head_dim + # value_states: batch_size, sequence_length, num_key_value_head, head_dim + sequence_length = key_states.shape[1] + + key_states = key_states[:, :, :, None, :].expand( + batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim + ) + key_states = key_states.reshape( + batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim + ) + + value_states = value_states[:, :, :, None, :].expand( + batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim + ) + value_states = value_states.reshape( + batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim + ) + + # Attention here is upcasted to float32 to match the original eager implementation. + + query_states = query_states.to(dtype=torch.float32) + key_states = key_states.to(dtype=torch.float32) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + + att_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + att_weights *= head_dim**-0.5 + + att_weights = att_weights.to(dtype=torch.float32) + big_neg = torch.finfo(att_weights.dtype).min #-2.3819763e38 # See gemma/modules.py + masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg) + probs = nn.functional.softmax(masked_att_weights, dim=-1) + probs = probs.to(dtype=value_states.dtype) + + # probs: batch_size, num_key_value_head, num_att_head, sequence_length, sequence_length + # value_states: batch_size, sequence_length, num_att_heads, head_dim + + att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3)) + + att_output = att_output.permute(0, 2, 1, 3) + # we use -1 because sequence length can change + att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) + + return att_output diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 95ed993d2..9b7e3520a 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -1,3 +1,955 @@ +# #!/usr/bin/env python + +# # Copyright 2025 HuggingFace Inc. team. All rights reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +# """ +# SmolVLA: + +# [Paper](https://huggingface.co/papers/2506.01844) + +# Designed by Hugging Face. + +# Install smolvla extra dependencies: +# ```bash +# pip install -e ".[smolvla]" +# ``` + +# Example of finetuning the smolvla pretrained model (`smolvla_base`): +# ```bash +# lerobot-train \ +# --policy.path=lerobot/smolvla_base \ +# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ +# --batch_size=64 \ +# --steps=200000 +# ``` + +# Example of finetuning a smolVLA. SmolVLA is composed of a pretrained VLM, +# and an action expert. +# ```bash +# lerobot-train \ +# --policy.type=smolvla \ +# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ +# --batch_size=64 \ +# --steps=200000 +# ``` + +# Example of using the smolvla pretrained model outside LeRobot training framework: +# ```python +# policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base") +# ``` + +# """ + +# import math +# import os +# import re +# from collections import deque + +# import safetensors +# import torch +# import torch.nn.functional as F # noqa: N812 +# from torch import Tensor, nn +# from transformers import AutoProcessor + +# from lerobot.constants import ACTION +# from lerobot.policies.normalize import ( +# Normalize, +# Unnormalize, +# ) +# from lerobot.policies.pretrained import PreTrainedPolicy +# from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig +# from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel +# from lerobot.policies.utils import ( +# populate_queues, +# ) +# from lerobot.utils.utils import get_safe_dtype +# OBS_STATE = 'state' +# ACTION = 'actions' +# # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker +# _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") + + +# def canonicalise(k: str) -> str: +# """ +# Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a +# normalisation-buffer key. +# """ +# return _VARIANT_RE.sub(".buffer_", k) + + +# def standardise_state_dict( +# checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True +# ) -> tuple[dict[str, torch.Tensor], list[str]]: +# """ +# β€’ Re-keys `checkpoint ` so that every entry matches the *reference* key set. +# β€’ If several variant keys collapse to the same canonical name we keep the +# first one and log the collision. +# β€’ Returns the new dict + a list of entries that could not be matched. +# """ +# out, collisions, unmatched = {}, {}, [] + +# for k, v in checkpoint.items(): +# canon = canonicalise(k) +# if canon in ref_keys: +# if canon in out: # duplicate after collapsing +# collisions.setdefault(canon, []).append(k) +# else: +# out[canon] = v +# else: +# unmatched.append(k) + +# if verbose: +# for canon, variants in collisions.items(): +# print(f"[standardise_state_dict] '{canon}' ← {variants}") +# if unmatched: +# print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys") + +# out.update({k: checkpoint[k] for k in unmatched}) +# return out, unmatched + + +# def rename_checkpoint_keys(checkpoint: dict, rename_str: str): +# """ +# Renames keys in a checkpoint dictionary based on the given rename string. + +# Args: +# checkpoint (dict): The checkpoint dictionary. +# rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2". + +# Returns: +# dict: The modified checkpoint with renamed keys. +# """ + +# rename_dict = dict(pair.split("//") for pair in rename_str.split(",")) + +# new_checkpoint = {} +# for k, v in checkpoint.items(): +# for old_key, new_key in rename_dict.items(): +# if old_key in k: +# k = k.replace(old_key, new_key) +# new_checkpoint[k] = v +# return new_checkpoint + + +# def load_smolvla( +# model: torch.nn.Module, +# filename: str | os.PathLike, +# *, +# device: str = "cpu", +# checkpoint_keys_mapping: str = "", +# ) -> torch.nn.Module: +# state_dict = safetensors.torch.load_file(filename, device=device) + +# # Optional user-supplied renames (e.g. "model._orig_mod.//model.") +# if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping: +# state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping) + +# state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys())) + +# # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset +# norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs") +# state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)} + +# missing, unexpected = model.load_state_dict(state_dict, strict=False) + +# if not all(key.startswith(norm_keys) for key in missing) or unexpected: +# raise RuntimeError( +# "SmolVLA %d missing / %d unexpected keys", +# len(missing), +# len(unexpected), +# ) + +# return model + + +# def create_sinusoidal_pos_embedding( +# time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" +# ) -> Tensor: +# """Computes sine-cosine positional embedding vectors for scalar positions.""" +# if dimension % 2 != 0: +# raise ValueError(f"dimension ({dimension}) must be divisible by 2") + +# if time.ndim != 1: +# raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") + +# dtype = get_safe_dtype(torch.float64, device.type) +# fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) +# period = min_period * (max_period / min_period) ** fraction + +# # Compute the outer product +# scaling_factor = 1.0 / period * 2 * math.pi +# sin_input = scaling_factor[None, :] * time[:, None] +# pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) +# return pos_emb + + +# def make_att_2d_masks(pad_masks, att_masks): +# """Copied from big_vision. + +# Tokens can attend to valid inputs tokens which have a cumulative mask_ar +# smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to +# setup several types of attention, for example: + +# [[1 1 1 1 1 1]]: pure causal attention. + +# [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between +# themselves and the last 3 tokens have a causal attention. The first +# entry could also be a 1 without changing behaviour. + +# [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a +# block can attend all previous blocks and all tokens on the same block. + +# Args: +# input_mask: bool[B, N] true if its part of the input, false if padding. +# mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on +# it and 0 where it shares the same attention mask as the previous token. +# """ +# if att_masks.ndim != 2: +# raise ValueError(att_masks.ndim) +# if pad_masks.ndim != 2: +# raise ValueError(pad_masks.ndim) + +# cumsum = torch.cumsum(att_masks, dim=1) +# att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] +# pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] +# att_2d_masks = att_2d_masks & pad_2d_masks +# return att_2d_masks + + +# def resize_with_pad(img, width, height, pad_value=-1): +# # assume no-op when width height fits already +# if img.ndim != 4: +# raise ValueError(f"(b,c,h,w) expected, but {img.shape}") + +# cur_height, cur_width = img.shape[2:] + +# ratio = max(cur_width / width, cur_height / height) +# resized_height = int(cur_height / ratio) +# resized_width = int(cur_width / ratio) +# resized_img = F.interpolate( +# img, size=(resized_height, resized_width), mode="bilinear", align_corners=False +# ) + +# pad_height = max(0, int(height - resized_height)) +# pad_width = max(0, int(width - resized_width)) + +# # pad on left and top of image +# padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) +# return padded_img + + +# def pad_vector(vector, new_dim): +# """Can be (batch_size x sequence_length x features_dimension) +# or (batch_size x features_dimension) +# """ +# if vector.shape[-1] == new_dim: +# return vector +# shape = list(vector.shape) +# current_dim = shape[-1] +# shape[-1] = new_dim +# new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) +# new_vector[..., :current_dim] = vector +# return new_vector + + +# def normalize(x, min_val, max_val): +# return (x - min_val) / (max_val - min_val) + + +# def unnormalize(x, min_val, max_val): +# return x * (max_val - min_val) + min_val + + +# def safe_arcsin(value): +# # This ensures that the input stays within +# # [βˆ’1,1] to avoid invalid values for arcsin +# return torch.arcsin(torch.clamp(value, -1.0, 1.0)) + + +# def aloha_gripper_to_angular(value): +# # Aloha transforms the gripper positions into a linear space. The following code +# # reverses this transformation to be consistent with smolvla which is pretrained in +# # angular space. +# # +# # These values are coming from the Aloha code: +# # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED +# value = unnormalize(value, min_val=0.01844, max_val=0.05800) + +# # This is the inverse of the angular to linear transformation inside the Interbotix code. +# def linear_to_radian(linear_position, arm_length, horn_radius): +# value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) +# return safe_arcsin(value) + +# # The constants are taken from the Interbotix code. +# value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) + +# # Normalize to [0, 1]. +# # The values 0.4 and 1.5 were measured on an actual Trossen robot. +# return normalize(value, min_val=0.4, max_val=1.5) + + +# def aloha_gripper_from_angular(value): +# # Convert from the gripper position used by smolvla to the gripper position that is used by Aloha. +# # Note that the units are still angular but the range is different. + +# # The values 0.4 and 1.5 were measured on an actual Trossen robot. +# value = unnormalize(value, min_val=0.4, max_val=1.5) + +# # These values are coming from the Aloha code: +# # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE +# return normalize(value, min_val=-0.6213, max_val=1.4910) + + +# def aloha_gripper_from_angular_inv(value): +# # Directly inverts the gripper_from_angular function. +# value = unnormalize(value, min_val=-0.6213, max_val=1.4910) +# return normalize(value, min_val=0.4, max_val=1.5) + + +# class SmolVLAPolicy(PreTrainedPolicy): +# """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" + +# config_class = SmolVLAConfig +# name = "smolvla" + +# def __init__( +# self, +# config: SmolVLAConfig, +# dataset_stats: dict[str, dict[str, Tensor]] | None = None, +# ): +# """ +# Args: +# config: Policy configuration class instance or None, in which case the default instantiation of +# the configuration class is used. +# dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected +# that they will be passed with a call to `load_state_dict` before the policy is used. +# """ + +# super().__init__(config) +# config.validate_features() +# self.config = config +# self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats) +# self.normalize_targets = Normalize( +# config.output_features, config.normalization_mapping, dataset_stats +# ) +# self.unnormalize_outputs = Unnormalize( +# config.output_features, config.normalization_mapping, dataset_stats +# ) + +# self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer +# self.model = VLAFlowMatching(config) +# self.reset() + +# def reset(self): +# """This should be called whenever the environment is reset.""" +# self._queues = { +# ACTION: deque(maxlen=self.config.n_action_steps), +# } + +# # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues +# @classmethod +# def _load_as_safetensor( +# cls, +# model: "SmolVLAPolicy", +# model_file: str, +# map_location: str, +# strict: bool, +# ): +# safetensors.torch.load_model(model, model_file, strict=strict, device=map_location) +# return load_smolvla( +# model, +# model_file, +# device=map_location, +# checkpoint_keys_mapping="model._orig_mod.//model.", +# ) + +# def get_optim_params(self) -> dict: +# return self.parameters() + +# def _get_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# # TODO: Check if this for loop is needed. +# # Context: In fact, self.queues contains only ACTION field, and in inference, we don't have action in the batch +# # In the case of offline inference, we have the action in the batch +# # that why without the k != ACTION check, it will raise an error because we are trying to stack +# # on an empty container. +# for k in batch: +# if k in self._queues and k != ACTION: +# batch[k] = torch.stack(list(self._queues[k]), dim=1) + +# images, img_masks = self.prepare_images(batch) +# state = self.prepare_state(batch) +# lang_tokens, lang_masks = self.prepare_language(batch) + +# actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise) + +# # Unpad actions +# original_action_dim = self.config.action_feature.shape[0] +# actions = actions[:, :, :original_action_dim] + +# actions = self.unnormalize_outputs({ACTION: actions})[ACTION] + +# if self.config.adapt_to_pi_aloha: +# actions = self._pi_aloha_encode_actions(actions) + +# return actions + +# def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: +# if self.config.adapt_to_pi_aloha: +# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) + +# batch = self.normalize_inputs(batch) + +# return batch + +# @torch.no_grad() +# def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# self.eval() + +# batch = self._prepare_batch(batch) +# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + +# actions = self._get_action_chunk(batch, noise) +# return actions + +# @torch.no_grad() +# def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: +# """Select a single action given environment observations. + +# This method wraps `select_actions` in order to return one action at a time for execution in the +# environment. It works by managing the actions in a queue and only calling `select_actions` when the +# queue is empty. +# """ +# self.eval() +# batch = self._prepare_batch(batch) +# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) + +# # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by +# # querying the policy. +# if len(self._queues[ACTION]) == 0: +# actions = self._get_action_chunk(batch, noise) + +# # `self.predict_action_chunk` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue +# # effectively has shape (n_action_steps, batch_size, *), hence the transpose. +# self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) + +# return self._queues[ACTION].popleft() + +# def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: +# """Do a full training forward pass to compute the loss""" +# if self.config.adapt_to_pi_aloha: +# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) +# batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) +# batch = self.normalize_inputs(batch) +# batch = self.normalize_targets(batch) +# images, img_masks = self.prepare_images(batch) +# state = self.prepare_state(batch) +# lang_tokens, lang_masks = self.prepare_language(batch) +# actions = self.prepare_action(batch) +# actions_is_pad = batch.get("actions_id_pad") +# loss_dict = {} +# losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) +# loss_dict["losses_after_forward"] = losses.clone() + +# if actions_is_pad is not None: +# in_episode_bound = ~actions_is_pad +# losses = losses * in_episode_bound.unsqueeze(-1) +# loss_dict["losses_after_in_ep_bound"] = losses.clone() + +# # Remove padding +# losses = losses[:, :, : self.config.max_action_dim] +# loss_dict["losses_after_rm_padding"] = losses.clone() + +# # For backward pass +# loss = losses.mean() +# # For backward pass +# loss_dict["loss"] = loss.item() +# return loss, loss_dict + +# def prepare_images(self, batch): +# """Apply SmolVLA preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and +# convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. +# """ +# images = [] +# img_masks = [] +# present_img_keys = [key for key in self.config.image_features if key in batch] +# missing_img_keys = [key for key in self.config.image_features if key not in batch] + +# if len(present_img_keys) == 0: +# raise ValueError( +# f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" +# ) +# # Preprocess image features present in the batch +# for key in present_img_keys: +# img = batch[key][:, -1, :, :, :] if batch[key].ndim == 5 else batch[key] +# if self.config.resize_imgs_with_padding is not None: +# img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) + +# # Normalize from range [0,1] to [-1,1] as expacted by siglip +# img = img * 2.0 - 1.0 + +# bsize = img.shape[0] +# device = img.device +# if f"{key}_padding_mask" in batch: +# mask = batch[f"{key}_padding_mask"].bool() +# else: +# mask = torch.ones(bsize, dtype=torch.bool, device=device) +# images.append(img) +# img_masks.append(mask) + +# # Create image features not present in the batch +# # as fully 0 padded images. +# for num_empty_cameras in range(len(missing_img_keys)): +# if num_empty_cameras >= self.config.empty_cameras: +# break +# img = torch.ones_like(img) * -1 +# mask = torch.zeros_like(mask) +# images.append(img) +# img_masks.append(mask) +# return images, img_masks + +# def prepare_language(self, batch) -> tuple[Tensor, Tensor]: +# """Tokenize the text input""" +# device = batch[OBS_STATE].device +# tasks = batch["task"] +# if isinstance(tasks, str): +# tasks = [tasks] + +# if len(tasks) == 1: +# tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] + +# tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] + +# tokenized_prompt = self.language_tokenizer.__call__( +# tasks, +# padding=self.config.pad_language_to, +# padding_side="right", +# max_length=self.config.tokenizer_max_length, +# return_tensors="pt", +# ) +# lang_tokens = tokenized_prompt["input_ids"].to(device=device) +# lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) + +# return lang_tokens, lang_masks + +# def _pi_aloha_decode_state(self, state): +# # Flip the joints. +# for motor_idx in [1, 2, 8, 9]: +# state[:, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. +# for motor_idx in [6, 13]: +# state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) +# return state + +# def _pi_aloha_encode_actions(self, actions): +# # Flip the joints. +# for motor_idx in [1, 2, 8, 9]: +# actions[:, :, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. +# for motor_idx in [6, 13]: +# actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) +# return actions + +# def _pi_aloha_encode_actions_inv(self, actions): +# # Flip the joints again. +# for motor_idx in [1, 2, 8, 9]: +# actions[:, :, motor_idx] *= -1 +# # Reverse the gripper transformation that is being applied by the Aloha runtime. +# for motor_idx in [6, 13]: +# actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) +# return actions + +# def prepare_state(self, batch): +# """Pad state""" +# state = batch[OBS_STATE][:, -1, :] if batch[OBS_STATE].ndim > 2 else batch[OBS_STATE] +# state = pad_vector(state, self.config.max_state_dim) +# return state + +# def prepare_action(self, batch): +# """Pad action""" +# actions = pad_vector(batch[ACTION], self.config.max_action_dim) +# return actions + + +# def pad_tensor(tensor, max_len, pad_value=0): +# """ +# Efficiently pads a tensor along sequence dimension to match max_len. + +# Args: +# tensor (torch.Tensor): Shape (B, L, ...) or (B, L). +# max_len (int): Fixed sequence length. +# pad_value (int/float): Value for padding. + +# Returns: +# torch.Tensor: Shape (B, max_len, ...) or (B, max_len). +# """ +# b, d = tensor.shape[:2] + +# # Create a padded tensor of max_len and copy the existing values +# padded_tensor = torch.full( +# (b, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device +# ) +# padded_tensor[:, :d] = tensor # Efficient in-place copy + +# return padded_tensor + + +# class VLAFlowMatching(nn.Module): +# """ +# SmolVLA + +# [Paper]() + +# Designed by Hugging Face. +# β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +# β”‚ actions β”‚ +# β”‚ β–² β”‚ +# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€|────┐ β”‚ +# β”‚ | │────► β”‚ β”‚ β”‚ +# β”‚ | β”‚ kv β”‚ β”‚ β”‚ +# β”‚ | │────► β”‚Actionβ”‚ β”‚ +# β”‚ | VLM β”‚cache β”‚Expertβ”‚ | +# β”‚ β”‚ │────► | β”‚ β”‚ +# β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +# β”‚ β””β–²β”€β”€β–²β”€β”€β”€β–²β”€β”˜ β””β”€β”€β”€β–²β”€β”€β”˜ | +# β”‚ β”‚ | | β”‚ | +# β”‚ | | | noise β”‚ +# β”‚ β”‚ β”‚ state β”‚ +# β”‚ β”‚ language tokens β”‚ +# β”‚ image(s) β”‚ +# β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +# """ + +# def __init__(self, config: SmolVLAConfig): +# super().__init__() +# self.config = config + +# self.vlm_with_expert = SmolVLMWithExpertModel( +# model_id=self.config.vlm_model_name, +# freeze_vision_encoder=self.config.freeze_vision_encoder, +# train_expert_only=self.config.train_expert_only, +# load_vlm_weights=self.config.load_vlm_weights, +# attention_mode=self.config.attention_mode, +# num_expert_layers=self.config.num_expert_layers, +# num_vlm_layers=self.config.num_vlm_layers, +# self_attn_every_n_layers=self.config.self_attn_every_n_layers, +# expert_width_multiplier=self.config.expert_width_multiplier, +# ) +# self.state_proj = nn.Linear( +# self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size +# ) +# self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) +# self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) + +# self.action_time_mlp_in = nn.Linear( +# self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size +# ) +# self.action_time_mlp_out = nn.Linear( +# self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size +# ) + +# self.set_requires_grad() +# self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id +# self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id +# self.global_image_start_token = torch.tensor( +# [self.fake_image_token, self.global_image_token], dtype=torch.long +# ) + +# self.add_image_special_tokens = self.config.add_image_special_tokens +# self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) +# self.prefix_length = self.config.prefix_length + +# def set_requires_grad(self): +# for params in self.state_proj.parameters(): +# params.requires_grad = self.config.train_state_proj + +# def sample_noise(self, shape, device): +# noise = torch.normal( +# mean=0.0, +# std=1.0, +# size=shape, +# dtype=torch.float32, +# device=device, +# ) +# return noise + +# def sample_time(self, bsize, device): +# beta_dist = torch.distributions.Beta(concentration1=1.5, concentration0=1.0) +# time_beta = beta_dist.sample((bsize,)).to(device=device, dtype=torch.float32) +# time = time_beta * 0.999 + 0.001 +# return time + +# def embed_prefix( +# self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None +# ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: +# """Embed images with SigLIP and language tokens with embedding layer to prepare +# for SmolVLM transformer processing. +# """ +# embs = [] +# pad_masks = [] +# att_masks = [] +# for _img_idx, ( +# img, +# img_mask, +# ) in enumerate(zip(images, img_masks, strict=False)): +# if self.add_image_special_tokens: +# image_start_token = ( +# self.vlm_with_expert.embed_language_tokens( +# self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device) +# ) +# .unsqueeze(0) +# .expand(img.shape[0], -1, -1) +# ) +# image_start_mask = torch.ones_like( +# image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device +# ) +# att_masks += [0] * (image_start_mask.shape[-1]) +# embs.append(image_start_token) +# pad_masks.append(image_start_mask) + +# img_emb = self.vlm_with_expert.embed_image(img) +# img_emb = img_emb + +# # Normalize image embeddings +# img_emb_dim = img_emb.shape[-1] +# img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) + +# bsize, num_img_embs = img_emb.shape[:2] +# img_mask = img_mask[:, None].expand(bsize, num_img_embs) + +# embs.append(img_emb) +# pad_masks.append(img_mask) + +# att_masks += [0] * (num_img_embs) +# if self.add_image_special_tokens: +# image_end_token = ( +# self.vlm_with_expert.embed_language_tokens( +# self.image_end_token.to(device=self.vlm_with_expert.vlm.device) +# ) +# .unsqueeze(0) +# .expand(img.shape[0], -1, -1) +# ) +# image_end_mask = torch.ones_like( +# image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device +# ) +# embs.append(image_end_token) +# pad_masks.append(image_end_mask) +# att_masks += [0] * (image_end_mask.shape[1]) +# lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) +# # Normalize language embeddings +# lang_emb_dim = lang_emb.shape[-1] +# lang_emb = lang_emb * math.sqrt(lang_emb_dim) + +# embs.append(lang_emb) +# pad_masks.append(lang_masks) + +# num_lang_embs = lang_emb.shape[1] +# att_masks += [0] * num_lang_embs + +# state_emb = self.state_proj(state) +# state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb +# embs.append(state_emb) +# bsize = state_emb.shape[0] +# device = state_emb.device + +# states_seq_len = state_emb.shape[1] +# state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) +# pad_masks.append(state_mask) + +# # Set attention masks so that image and language inputs do not attend to state or actions +# att_masks += [1] * (states_seq_len) +# embs = torch.cat(embs, dim=1) +# pad_masks = torch.cat(pad_masks, dim=1) +# att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) +# att_masks = att_masks[None, :] + +# seq_len = pad_masks.shape[1] +# if seq_len < self.prefix_length: +# embs = pad_tensor(embs, self.prefix_length, pad_value=0) +# pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) +# att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) + +# att_masks = att_masks.expand(bsize, -1) + +# return embs, pad_masks, att_masks + +# def embed_suffix(self, noisy_actions, timestep): +# """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" +# embs = [] +# pad_masks = [] +# att_masks = [] + +# # Fuse timestep + action information using an MLP +# action_emb = self.action_in_proj(noisy_actions) +# device = action_emb.device +# bsize = action_emb.shape[0] +# dtype = action_emb.dtype +# # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] +# time_emb = create_sinusoidal_pos_embedding( +# timestep, +# self.vlm_with_expert.expert_hidden_size, +# self.config.min_period, +# self.config.max_period, +# device=device, +# ) +# time_emb = time_emb.type(dtype=dtype) + +# time_emb = time_emb[:, None, :].expand_as(action_emb) +# action_time_emb = torch.cat([action_emb, time_emb], dim=2) + +# action_time_emb = self.action_time_mlp_in(action_time_emb) +# action_time_emb = F.silu(action_time_emb) # swish == silu +# action_time_emb = self.action_time_mlp_out(action_time_emb) + +# # Add to input tokens +# embs.append(action_time_emb) + +# bsize, action_time_dim = action_time_emb.shape[:2] +# action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) +# pad_masks.append(action_time_mask) + +# # Set attention masks so that image, language and state inputs do not attend to action tokens +# att_masks += [1] * self.config.chunk_size +# embs = torch.cat(embs, dim=1) +# pad_masks = torch.cat(pad_masks, dim=1) +# att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) +# att_masks = att_masks[None, :].expand(bsize, len(att_masks)) +# # added by jade +# seq_len = pad_masks.shape[1] +# if seq_len < self.config.chunk_size: +# embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) +# pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) +# att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) +# return embs, pad_masks, att_masks + +# def forward( +# self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None +# ) -> Tensor: +# """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" +# #added by jade +# if actions.ndim == 2: +# actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) +# if noise is None: +# noise = self.sample_noise(actions.shape, actions.device) + +# if time is None: +# time = self.sample_time(actions.shape[0], actions.device) + +# time_expanded = time[:, None, None] +# x_t = time_expanded * noise + (1 - time_expanded) * actions +# u_t = noise - actions +# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( +# images, img_masks, lang_tokens, lang_masks, state=state +# ) +# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, time) + +# pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) +# att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) + +# att_2d_masks = make_att_2d_masks(pad_masks, att_masks) +# position_ids = torch.cumsum(pad_masks, dim=1) - 1 +# (_, suffix_out), _ = self.vlm_with_expert.forward( +# attention_mask=att_2d_masks, +# position_ids=position_ids, +# past_key_values=None, +# inputs_embeds=[prefix_embs, suffix_embs], +# use_cache=False, +# fill_kv_cache=False, +# ) +# # suffix_out = suffix_out[:, -self.config.chunk_size :] +# suffix_out = suffix_out[:, -self.config.chunk_size:, :] +# # Original openpi code, upcast attention output +# suffix_out = suffix_out.to(dtype=torch.float32) +# v_t = self.action_out_proj(suffix_out) +# losses = F.mse_loss(u_t, v_t, reduction="none") +# return losses + +# def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: +# """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" +# bsize = state.shape[0] +# device = state.device + +# if noise is None: +# actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) +# noise = self.sample_noise(actions_shape, device) + +# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( +# images, img_masks, lang_tokens, lang_masks, state=state +# ) +# prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) +# prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 +# # Compute image and language key value cache +# _, past_key_values = self.vlm_with_expert.forward( +# attention_mask=prefix_att_2d_masks, +# position_ids=prefix_position_ids, +# past_key_values=None, +# inputs_embeds=[prefix_embs, None], +# use_cache=self.config.use_cache, +# fill_kv_cache=True, +# ) +# dt = -1.0 / self.config.num_steps +# dt = torch.tensor(dt, dtype=torch.float32, device=device) + +# x_t = noise +# time = torch.tensor(1.0, dtype=torch.float32, device=device) +# while time >= -dt / 2: +# expanded_time = time.expand(bsize) +# v_t = self.denoise_step( +# prefix_pad_masks, +# past_key_values, +# x_t, +# expanded_time, +# ) +# # Euler step +# x_t += dt * v_t +# time += dt +# return x_t + +# def denoise_step( +# self, +# prefix_pad_masks, +# past_key_values, +# x_t, +# timestep, +# ): +# """Apply one denoising step of the noise `x_t` at a given timestep.""" +# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, timestep) + +# suffix_len = suffix_pad_masks.shape[1] +# batch_size = prefix_pad_masks.shape[0] +# prefix_len = prefix_pad_masks.shape[1] +# prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) + +# suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) + +# full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) +# prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] +# position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 + +# outputs_embeds, _ = self.vlm_with_expert.forward( +# attention_mask=full_att_2d_masks, +# position_ids=position_ids, +# past_key_values=past_key_values, +# inputs_embeds=[None, suffix_embs], +# use_cache=self.config.use_cache, +# fill_kv_cache=False, +# ) +# suffix_out = outputs_embeds[1] +# suffix_out = suffix_out[:, -self.config.chunk_size :] +# suffix_out = suffix_out.to(dtype=torch.float32) +# v_t = self.action_out_proj(suffix_out) +# return v_t #!/usr/bin/env python # Copyright 2025 HuggingFace Inc. team. All rights reserved. @@ -63,7 +1015,7 @@ import torch.nn.functional as F # noqa: N812 from torch import Tensor, nn from transformers import AutoProcessor -from lerobot.constants import ACTION +from lerobot.constants import ACTION, OBS_STATE from lerobot.policies.normalize import ( Normalize, Unnormalize, @@ -75,8 +1027,7 @@ from lerobot.policies.utils import ( populate_queues, ) from lerobot.utils.utils import get_safe_dtype -OBS_STATE = 'state' -ACTION = 'actions' + # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") @@ -825,21 +1776,12 @@ class VLAFlowMatching(nn.Module): pad_masks = torch.cat(pad_masks, dim=1) att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) att_masks = att_masks[None, :].expand(bsize, len(att_masks)) - # added by jade - seq_len = pad_masks.shape[1] - if seq_len < self.config.chunk_size: - embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) - pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) - att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) return embs, pad_masks, att_masks def forward( self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None ) -> Tensor: """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" - #added by jade - if actions.ndim == 2: - actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) if noise is None: noise = self.sample_noise(actions.shape, actions.device) @@ -867,8 +1809,7 @@ class VLAFlowMatching(nn.Module): use_cache=False, fill_kv_cache=False, ) - # suffix_out = suffix_out[:, -self.config.chunk_size :] - suffix_out = suffix_out[:, -self.config.chunk_size:, :] + suffix_out = suffix_out[:, -self.config.chunk_size :] # Original openpi code, upcast attention output suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) @@ -949,4 +1890,4 @@ class VLAFlowMatching(nn.Module): suffix_out = suffix_out[:, -self.config.chunk_size :] suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) - return v_t + return v_t \ No newline at end of file diff --git a/src/lerobot/policies/smolvla/modeling_smolvla_v2.py b/src/lerobot/policies/smolvla/modeling_smolvla_v2.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt new file mode 100644 index 000000000..3410062ba --- /dev/null +++ b/src/lerobot/policies/smolvla/saver.txt @@ -0,0 +1 @@ +c \ No newline at end of file diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py index f6a49dccf..e4cd7acac 100644 --- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py @@ -77,8 +77,8 @@ class SmolVLMWithExpertModel(nn.Module): self.vlm = AutoModelForImageTextToText.from_pretrained( model_id, device_map="auto", - # torch_dtype="bfloat16", - torch_dtype=torch.float16, + torch_dtype="bfloat16", + # torch_dtype=torch.float16, low_cpu_mem_usage=True, ) config = self.vlm.config @@ -547,4 +547,4 @@ class SmolVLMWithExpertModel(nn.Module): # we use -1 because sequence length can change att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) - return att_output + return att_output \ No newline at end of file diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 3145bed35..92a3bf833 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -458,6 +458,43 @@ def _compile_episode_data( data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1) return data_dict +from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy +from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata +def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): + """Recreate normalization layers with proper stats from the dataset.""" + from lerobot.policies.normalize import Normalize, Unnormalize + + # Convert numpy stats to the format expected by normalization layers + stats = {} + for key, stat_dict in dataset_meta.stats.items(): + stats[key] = { + stat_type: torch.from_numpy(stat_array) if isinstance(stat_array, np.ndarray) else stat_array + for stat_type, stat_array in stat_dict.items() + } + + # Recreate normalization layers with proper stats + normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) + + normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) + + unnormalize_outputs = Unnormalize( + policy.config.output_features, policy.config.normalization_mapping, stats + ) + + # Replace the normalization layers on the policy + policy.normalize_inputs = normalize_inputs + policy.normalize_targets = normalize_targets + policy.unnormalize_outputs = unnormalize_outputs + + print("Normalization layers recreated with dataset stats.") + + +def load_smolvla(cfg, dataset_repo: str): + from lerobot.datasets.lerobot_dataset import LeRobotDataset + dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/') + policy = make_policy(cfg=cfg, ds_meta=dataset.meta) + _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta) # only needed if stats are missing + return policy, dataset @parser.wrap() @@ -466,7 +503,9 @@ def eval_main(cfg: EvalPipelineConfig): # Check device is available device = get_safe_torch_device(cfg.policy.device, log=True) - + #login to hf + from huggingface_hub import login + login() torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True set_seed(cfg.seed) @@ -481,6 +520,9 @@ def eval_main(cfg: EvalPipelineConfig): cfg=cfg.policy, env_cfg=cfg.env, ) + # breakpoint() + load_smolvla(cfg.policy, "physical-intelligence/libero") + # breakpoint() policy.eval() with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): if cfg.env.multitask_eval: diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index d5d562518..74219fc38 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -104,6 +104,32 @@ def update_policy( train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict +# def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): +# """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" +# from lerobot.policies.normalize import Normalize, Unnormalize + +# if not hasattr(dataset_meta, "stats") or not dataset_meta.stats: +# print("⚠️ Dataset has no stats, skipping normalization injection.") +# return + +# stats = {} +# for key, stat_dict in dataset_meta.stats.items(): +# stats[key] = { +# stat_type: torch.as_tensor(stat_array) +# if isinstance(stat_array, np.ndarray) +# else stat_array +# for stat_type, stat_array in stat_dict.items() +# } + +# normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) +# normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) +# unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats) + +# policy.normalize_inputs = normalize_inputs +# policy.normalize_targets = normalize_targets +# policy.unnormalize_outputs = unnormalize_outputs + +# print("βœ… Normalization layers injected with dataset stats.") @parser.wrap() def train(cfg: TrainPipelineConfig): @@ -126,7 +152,6 @@ def train(cfg: TrainPipelineConfig): logging.info("Creating dataset") dataset = make_dataset(cfg) - # Create environment used for evaluating checkpoints during training on simulation data. # On real-world data, no need to create an environment as evaluations are done outside train.py, # using the eval.py instead, with gym_dora environment and dora-rs. @@ -140,7 +165,6 @@ def train(cfg: TrainPipelineConfig): cfg=cfg.policy, ds_meta=dataset.meta, ) - logging.info("Creating optimizer and scheduler") optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) @@ -203,7 +227,6 @@ def train(cfg: TrainPipelineConfig): start_time = time.perf_counter() batch = next(dl_iter) train_tracker.dataloading_s = time.perf_counter() - start_time - for key in batch: if isinstance(batch[key], torch.Tensor): batch[key] = batch[key].to(device, non_blocking=device.type == "cuda") diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py new file mode 100644 index 000000000..26a9e7aea --- /dev/null +++ b/src/lerobot/scripts/train_2.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import time +from contextlib import nullcontext +from pprint import pformat +from typing import Any + +import torch +from termcolor import colored +from torch.amp import GradScaler +from torch.optim import Optimizer + +from lerobot.configs import parser +from lerobot.configs.train import TrainPipelineConfig +from lerobot.datasets.factory import make_dataset +from lerobot.datasets.sampler import EpisodeAwareSampler +from lerobot.datasets.utils import cycle +from lerobot.envs.factory import make_env +from lerobot.optim.factory import make_optimizer_and_scheduler +from lerobot.policies.factory import make_policy +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.policies.utils import get_device_from_parameters +from lerobot.scripts.eval import eval_policy, eval_policy_multitask +from lerobot.utils.logging_utils import AverageMeter, MetricsTracker +from lerobot.utils.random_utils import set_seed +from lerobot.utils.train_utils import ( + get_step_checkpoint_dir, + get_step_identifier, + load_training_state, + save_checkpoint, + update_last_checkpoint, +) +from lerobot.utils.utils import ( + format_big_number, + get_safe_torch_device, + has_method, + init_logging, +) +from lerobot.utils.wandb_utils import WandBLogger + + +def update_policy( + train_metrics: MetricsTracker, + policy: PreTrainedPolicy, + batch: Any, + optimizer: Optimizer, + grad_clip_norm: float, + grad_scaler: GradScaler, + lr_scheduler=None, + use_amp: bool = False, + lock=None, +) -> tuple[MetricsTracker, dict]: + start_time = time.perf_counter() + device = get_device_from_parameters(policy) + policy.train() + with torch.autocast(device_type=device.type) if use_amp else nullcontext(): + loss, output_dict = policy.forward(batch) + # TODO(rcadene): policy.unnormalize_outputs(out_dict) + grad_scaler.scale(loss).backward() + + # Unscale the gradient of the optimizer's assigned params in-place **prior to gradient clipping**. + grad_scaler.unscale_(optimizer) + + grad_norm = torch.nn.utils.clip_grad_norm_( + policy.parameters(), + grad_clip_norm, + error_if_nonfinite=False, + ) + + # Optimizer's gradients are already unscaled, so scaler.step does not unscale them, + # although it still skips optimizer.step() if the gradients contain infs or NaNs. + with lock if lock is not None else nullcontext(): + grad_scaler.step(optimizer) + # Updates the scale for next iteration. + grad_scaler.update() + + optimizer.zero_grad() + + # Step through pytorch scheduler at every batch instead of epoch + if lr_scheduler is not None: + lr_scheduler.step() + + if has_method(policy, "update"): + # To possibly update an internal buffer (for instance an Exponential Moving Average like in TDMPC). + policy.update() + + train_metrics.loss = loss.item() + train_metrics.grad_norm = grad_norm.item() + train_metrics.lr = optimizer.param_groups[0]["lr"] + train_metrics.update_s = time.perf_counter() - start_time + return train_metrics, output_dict + +def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): + """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" + from lerobot.policies.normalize import Normalize, Unnormalize + + if not hasattr(dataset_meta, "stats") or not dataset_meta.stats: + print("⚠️ Dataset has no stats, skipping normalization injection.") + return + + stats = {} + for key, stat_dict in dataset_meta.stats.items(): + stats[key] = { + stat_type: torch.as_tensor(stat_array) + if isinstance(stat_array, np.ndarray) + else stat_array + for stat_type, stat_array in stat_dict.items() + } + + normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) + normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) + unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats) + + policy.normalize_inputs = normalize_inputs + policy.normalize_targets = normalize_targets + policy.unnormalize_outputs = unnormalize_outputs + + print("βœ… Normalization layers injected with dataset stats.") + +@parser.wrap() +def train(cfg: TrainPipelineConfig): + cfg.validate() + logging.info(pformat(cfg.to_dict())) + + if cfg.wandb.enable and cfg.wandb.project: + wandb_logger = WandBLogger(cfg) + else: + wandb_logger = None + logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"])) + + if cfg.seed is not None: + set_seed(cfg.seed) + + # Check device is available + device = get_safe_torch_device(cfg.policy.device, log=True) + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = True + + logging.info("Creating dataset") + dataset = make_dataset(cfg) + + # Create environment used for evaluating checkpoints during training on simulation data. + # On real-world data, no need to create an environment as evaluations are done outside train.py, + # using the eval.py instead, with gym_dora environment and dora-rs. + eval_env = None + if cfg.eval_freq > 0 and cfg.env is not None: + logging.info("Creating env") + eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + + logging.info("Creating policy") + policy = make_policy( + cfg=cfg.policy, + ds_meta=dataset.meta, + ) + logging.info("Creating optimizer and scheduler") + optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) + grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) + + step = 0 # number of policy updates (forward + backward + optim) + + if cfg.resume: + step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) + + num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) + num_total_params = sum(p.numel() for p in policy.parameters()) + + logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") + if cfg.env is not None: + logging.info(f"{cfg.env.task=}") + logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") + logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") + logging.info(f"{dataset.num_episodes=}") + logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") + logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") + + # create dataloader for offline training + if hasattr(cfg.policy, "drop_n_last_frames"): + shuffle = False + sampler = EpisodeAwareSampler( + dataset.episode_data_index, + drop_n_last_frames=cfg.policy.drop_n_last_frames, + shuffle=True, + ) + else: + shuffle = True + sampler = None + + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.num_workers, + batch_size=cfg.batch_size, + shuffle=shuffle, + sampler=sampler, + pin_memory=device.type == "cuda", + drop_last=False, + ) + dl_iter = cycle(dataloader) + + policy.train() + train_metrics = { + "loss": AverageMeter("loss", ":.3f"), + "grad_norm": AverageMeter("grdn", ":.3f"), + "lr": AverageMeter("lr", ":0.1e"), + "update_s": AverageMeter("updt_s", ":.3f"), + "dataloading_s": AverageMeter("data_s", ":.3f"), + } + + train_tracker = MetricsTracker( + cfg.batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, initial_step=step + ) + + logging.info("Start offline training on a fixed dataset") + for _ in range(step, cfg.steps): + start_time = time.perf_counter() + batch = next(dl_iter) + train_tracker.dataloading_s = time.perf_counter() - start_time + + for key in batch: + if isinstance(batch[key], torch.Tensor): + batch[key] = batch[key].to(device, non_blocking=device.type == "cuda") + + train_tracker, output_dict = update_policy( + train_tracker, + policy, + batch, + optimizer, + cfg.optimizer.grad_clip_norm, + grad_scaler=grad_scaler, + lr_scheduler=lr_scheduler, + use_amp=cfg.policy.use_amp, + ) + + # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we + # increment `step` here. + step += 1 + train_tracker.step() + is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 + is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps + is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 + + if is_log_step: + logging.info(train_tracker) + if wandb_logger: + wandb_log_dict = train_tracker.to_dict() + if output_dict: + wandb_log_dict.update(output_dict) + wandb_logger.log_dict(wandb_log_dict, step) + train_tracker.reset_averages() + + if cfg.save_checkpoint and is_saving_step: + logging.info(f"Checkpoint policy after step {step}") + checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) + save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler) + update_last_checkpoint(checkpoint_dir) + if wandb_logger: + wandb_logger.log_policy(checkpoint_dir) + + if cfg.env and is_eval_step: + step_id = get_step_identifier(step, cfg.steps) + logging.info(f"Eval policy at step {step}") + with ( + torch.no_grad(), + torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(), + ): + if cfg.env.multitask_eval: + eval_info = eval_policy_multitask( + eval_env, + policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + max_parallel_tasks=cfg.env.max_parallel_tasks, + ) + aggregated = eval_info["overall"]["aggregated"] + # Print per-suite stats, log? + for task_group, task_group_info in eval_info.items(): + if task_group == "overall": + continue # Skip the overall stats since we already printed it + print(f"\nAggregated Metrics for {task_group}:") + print(task_group_info["aggregated"]) + else: + eval_info = eval_policy( + eval_env, + policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + ) + aggregated = eval_info["aggregated"] + + eval_metrics = { + "avg_sum_reward": AverageMeter("βˆ‘rwrd", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), + } + eval_tracker = MetricsTracker( + cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step + ) + eval_tracker.eval_s = aggregated.pop("eval_s") + eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward") + eval_tracker.pc_success = aggregated.pop("pc_success") + logging.info(eval_tracker) + if wandb_logger: + wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} + wandb_logger.log_dict(wandb_log_dict, step, mode="eval") + wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval") + + if eval_env: + if cfg.env.multitask_eval: + for _task_group, envs_dict in eval_env.items(): + for _idx, env in envs_dict.items(): + env.close() + else: + eval_env.close() + logging.info("End of training") + + if cfg.policy.push_to_hub: + policy.push_model_to_hub(cfg) + + +def main(): + init_logging() + train() + + +if __name__ == "__main__": + main() diff --git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py new file mode 100644 index 000000000..e205f138f --- /dev/null +++ b/src/lerobot/scripts/train_accelerate.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import time +from pprint import pformat +from typing import Any + +import torch +from accelerate import Accelerator +from accelerate.utils import set_seed as accelerate_set_seed +from termcolor import colored +from torch.optim import Optimizer + +from lerobot.datasets.factory import make_dataset +from lerobot.datasets.sampler import EpisodeAwareSampler +from lerobot.envs.factory import make_env +from lerobot.optim.factory import make_optimizer_and_scheduler +from lerobot.policies.factory import make_policy +from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.utils.logging_utils import AverageMeter, MetricsTracker +from lerobot.utils.train_utils import ( + get_step_checkpoint_dir, + get_step_identifier, + load_training_state, + save_checkpoint, + update_last_checkpoint, +) +from lerobot.utils.utils import ( + format_big_number, + has_method, + init_logging, +) +from lerobot.configs import parser +from lerobot.configs.train import TrainPipelineConfig +from lerobot.scripts.eval import eval_policy + + +def update_policy( + train_metrics: MetricsTracker, + policy: PreTrainedPolicy, + batch: Any, + optimizer: Optimizer, + grad_clip_norm: float, + accelerator: Accelerator, + lr_scheduler=None, +) -> tuple[MetricsTracker, dict]: + start_time = time.perf_counter() + policy.train() + + # Use accelerator's autocast context if mixed precision is enabled + with accelerator.autocast(): + loss, output_dict = policy.forward(batch) + # TODO(rcadene): policy.unnormalize_outputs(out_dict) + + # Use accelerator for backward pass + accelerator.backward(loss) + + # Gradient clipping - accelerator handles unscaling automatically + if accelerator.sync_gradients and grad_clip_norm > 0: + grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm) + else: + grad_norm = torch.tensor(0.0) + + optimizer.step() + lr_scheduler.step() if lr_scheduler is not None else None + optimizer.zero_grad() + + # Update policy-specific buffers if needed + if has_method(policy, "update"): + policy.update() + + # Gather metrics across all processes + loss_value = accelerator.gather(loss.detach()).mean().item() + grad_norm_value = accelerator.gather(grad_norm).mean().item() + + train_metrics.loss = loss_value + train_metrics.grad_norm = grad_norm_value + train_metrics.lr = optimizer.param_groups[0]["lr"] + train_metrics.update_s = time.perf_counter() - start_time + return train_metrics, output_dict + + +@parser.wrap() +def train(cfg: TrainPipelineConfig): + cfg.validate() + logging.info(pformat(cfg.to_dict())) + + # Initialize accelerator + from accelerate.utils import DistributedDataParallelKwargs + # added by jade 2 lines + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False) + accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs]) + + from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem + + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator( + mixed_precision="bf16" if cfg.policy.use_amp else "no", + gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps, + log_with="wandb" if cfg.wandb.enable else None, + kwargs_handlers=[ddp_kwargs], + project_dir=cfg.output_dir, + ) + + accelerator.init_trackers( + project_name=cfg.wandb.project, + init_kwargs={ + "wandb": { + "entity": cfg.wandb.entity, + "name": cfg.job_name, + "notes": cfg.wandb.notes, + "tags": cfg_to_group(cfg, return_list=True), + "dir": cfg.output_dir, + "config": cfg.to_dict(), + "save_code": False, + "job_type": "train_eval", + "mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online", + "resume": "must" if cfg.resume else None, + "id": cfg.wandb.run_id + if cfg.wandb.run_id + else (get_wandb_run_id_from_filesystem(cfg.output_dir) if cfg.resume else None), + } + }, + ) + + # Set seed for reproducibility + if cfg.seed is not None: + accelerate_set_seed(cfg.seed) + + # Setup device - accelerator handles device placement + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = True + + # Create dataset + if accelerator.is_main_process: + logging.info("Creating dataset") + dataset = make_dataset(cfg) + print("c") + # Create evaluation environment (only on main process) + eval_env = None + if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process: + logging.info("Creating env") + eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + + # Create policy + if accelerator.is_main_process: + logging.info("Creating policy") + + # Use accelerator's device instead of cfg.policy.device + with accelerator.main_process_first(): + policy = make_policy( + cfg=cfg.policy, + ds_meta=dataset.meta, + ) + + # Create optimizer and scheduler + if accelerator.is_main_process: + logging.info("Creating optimizer and scheduler") + optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) + + step = 0 # number of policy updates + + if cfg.resume: + step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) + + # Prepare dataloader + if hasattr(cfg.policy, "drop_n_last_frames"): + shuffle = False + sampler = EpisodeAwareSampler( + dataset.episode_data_index, + drop_n_last_frames=cfg.policy.drop_n_last_frames, + shuffle=True, + ) + else: + shuffle = True + sampler = None + + dataloader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.num_workers, + batch_size=cfg.batch_size, + shuffle=shuffle, + sampler=sampler, + pin_memory=True, + drop_last=True, # Important for distributed training + ) + + # Prepare for distributed training + policy, optimizer, dataloader, lr_scheduler = accelerator.prepare( + policy, optimizer, dataloader, lr_scheduler + ) + + # Log training info (only on main process) + if accelerator.is_main_process: + num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) + num_total_params = sum(p.numel() for p in policy.parameters()) + + logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") + if cfg.env is not None: + logging.info(f"{cfg.env.task=}") + logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") + logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") + logging.info(f"{dataset.num_episodes=}") + logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") + logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") + logging.info(f"Number of processes: {accelerator.num_processes}") + logging.info(f"Device: {accelerator.device}") + logging.info(f"Mixed precision: {accelerator.mixed_precision}") + + # Create metrics trackers + train_metrics = { + "loss": AverageMeter("loss", ":.3f"), + "grad_norm": AverageMeter("grdn", ":.3f"), + "lr": AverageMeter("lr", ":0.1e"), + "update_s": AverageMeter("updt_s", ":.3f"), + "dataloading_s": AverageMeter("data_s", ":.3f"), + } + + train_tracker = MetricsTracker( + cfg.batch_size * accelerator.num_processes, # Account for all processes + dataset.num_frames, + dataset.num_episodes, + train_metrics, + initial_step=step, + ) + + # Training loop + policy.train() + if accelerator.is_main_process: + logging.info("Start offline training on a fixed dataset") + + # Create iterator from dataloader + dl_iter = iter(dataloader) + + for current_step in range(step, cfg.steps): + start_time = time.perf_counter() + # Get next batch, cycling through dataloader if needed + try: + batch = next(dl_iter) + print("data laoder batch keys: ", batch.keys()) + breakpoint() + except StopIteration: + dl_iter = iter(dataloader) + batch = next(dl_iter) + train_tracker.dataloading_s = time.perf_counter() - start_time + # Update policy + train_tracker, output_dict = update_policy( + train_tracker, + policy, + batch, + optimizer, + cfg.optimizer.grad_clip_norm, + accelerator, + lr_scheduler=lr_scheduler, + ) + + # Increment step counter + step += 1 + train_tracker.step() + + # Determine if we should log, save, or evaluate + is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 + is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps + is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 + + # Logging (only on main process) + if is_log_step and accelerator.is_main_process: + logging.info(train_tracker) + wandb_log_dict = train_tracker.to_dict() + if output_dict: + wandb_log_dict.update(output_dict) + for k, v in wandb_log_dict.items(): + accelerator.log({f"{'train'}/{k}": v}, step=step) + train_tracker.reset_averages() + + # Checkpointing (only on main process) + if cfg.save_checkpoint and is_saving_step: + # βœ… all processes wait here + accelerator.wait_for_everyone() + + if accelerator.is_main_process: + logging.info(f"Checkpoint policy after step {step}") + checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) + + unwrapped_policy = accelerator.unwrap_model(policy) + save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler) + update_last_checkpoint(checkpoint_dir) + + # βœ… all processes sync again after saving + accelerator.wait_for_everyone() + + # if wandb_logger: + # wandb_logger.log_policy(checkpoint_dir) + + # Evaluation (only on main process) + if cfg.env and is_eval_step and accelerator.is_main_process: + step_id = get_step_identifier(step, cfg.steps) + logging.info(f"Eval policy at step {step}") + + # Unwrap model for evaluation + unwrapped_policy = accelerator.unwrap_model(policy) + unwrapped_policy.eval() + + with torch.no_grad(): + eval_info = eval_policy( + eval_env, + unwrapped_policy, + cfg.eval.n_episodes, + videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", + max_episodes_rendered=4, + start_seed=cfg.seed, + ) + + eval_metrics = { + "avg_sum_reward": AverageMeter("βˆ‘rwrd", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), + } + eval_tracker = MetricsTracker( + cfg.batch_size * accelerator.num_processes, + dataset.num_frames, + dataset.num_episodes, + eval_metrics, + initial_step=step, + ) + eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s") + eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward") + eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success") + logging.info(eval_tracker) + + wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} + for k, v in wandb_log_dict.items(): + accelerator.log({f"{'eval'}/{k}": v}, step=step) + + # Set back to training mode + policy.train() + + # Wait for all processes to finish + accelerator.wait_for_everyone() + + # Cleanup + if eval_env and accelerator.is_main_process: + eval_env.close() + + if accelerator.is_main_process: + logging.info("End of training") + accelerator.end_training() # added by jade + + +if __name__ == "__main__": + init_logging() + train() diff --git a/tmux_log.txt b/tmux_log.txt new file mode 100644 index 000000000..4936578bc --- /dev/null +++ b/tmux_log.txt @@ -0,0 +1,2008 @@ + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-08 13:23:15 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:23:15 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:23:15 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +INFO 2025-09-08 13:23:15 celerate.py:149 Creating dataset +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 35414.48it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 5660.00it/s] +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 43760.67it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 5629.72it/s] +c +INFO 2025-09-08 13:23:22 celerate.py:160 Creating policy +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +c +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank1]:[W908 13:23:22.785516795 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +Reducing the number of VLM layers to 16 ... +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank0]:[W908 13:23:43.028071493 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +INFO 2025-09-08 13:23:43 celerate.py:171 Creating optimizer and scheduler +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +Reducing the number of VLM layers to 16 ... +INFO 2025-09-08 13:24:04 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla +_lr1e-4bs32steps100000 +INFO 2025-09-08 13:24:04 celerate.py:213 cfg.env.task='libero_spatial' +INFO 2025-09-08 13:24:04 celerate.py:214 cfg.steps=100000 (100K) +INFO 2025-09-08 13:24:04 celerate.py:215 dataset.num_frames=273465 (273K) +INFO 2025-09-08 13:24:04 celerate.py:216 dataset.num_episodes=1693 +INFO 2025-09-08 13:24:04 celerate.py:217 num_learnable_params=99880992 (100M) +INFO 2025-09-08 13:24:04 celerate.py:218 num_total_params=450046220 (450M) +INFO 2025-09-08 13:24:04 celerate.py:219 Number of processes: 2 +INFO 2025-09-08 13:24:04 celerate.py:220 Device: cuda:0 +INFO 2025-09-08 13:24:04 celerate.py:221 Mixed precision: bf16 +INFO 2025-09-08 13:24:04 celerate.py:243 Start offline training on a fixed dataset + +bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp +', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima +ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) +> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() +-> train_tracker, output_dict = update_policy( +(Pdb) +bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp +', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima +ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) +> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() +-> train_tracker, output_dict = update_policy( +(Pdb) batch.keys()[rank0]:[W908 13:24:43.868440913 reducer.cpp:1430] Warning: find_unused_parameters=True was specif +ied in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra tr +aversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never h +as any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false +positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +policy.config.input_features +*** SyntaxError: invalid syntax +(Pdb) policy.config.input_features +*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' +(Pdb) policy +DistributedDataParallel( + (module): SmolVLAPolicy( + (normalize_inputs): Normalize( + (buffer_observation_state): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] + ) + ) + (normalize_targets): Normalize( + (buffer_action): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + ) + ) + (unnormalize_outputs): Unnormalize( + (buffer_action): ParameterDict( + (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] + ) + ) + (model): VLAFlowMatching( + (vlm_with_expert): SmolVLMWithExpertModel( + (vlm): SmolVLMForConditionalGeneration( + (model): SmolVLMModel( + (vision_model): SmolVLMVisionTransformer( + (embeddings): SmolVLMVisionEmbeddings( + (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid) + (position_embedding): Embedding(1024, 768) + ) + (encoder): SmolVLMEncoder( + (layers): ModuleList( + (0-11): 12 x SmolVLMEncoderLayer( + (self_attn): SmolVLMVisionAttention( + (k_proj): Linear(in_features=768, out_features=768, bias=True) + (v_proj): Linear(in_features=768, out_features=768, bias=True) + (q_proj): Linear(in_features=768, out_features=768, bias=True) + (out_proj): Linear(in_features=768, out_features=768, bias=True) + ) + (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): SmolVLMVisionMLP( + (activation_fn): PytorchGELUTanh() + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (fc2): Linear(in_features=3072, out_features=768, bias=True) + ) + (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + ) + ) + ) + (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True) + ) + (connector): SmolVLMConnector( + (modality_projection): SmolVLMSimpleMLP( + (proj): Linear(in_features=12288, out_features=960, bias=False) + ) + ) + (text_model): LlamaModel( + (embed_tokens): Embedding(49280, 960, padding_idx=2) + (layers): ModuleList( + (0-15): 16 x LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=960, out_features=960, bias=False) + (k_proj): Linear(in_features=960, out_features=320, bias=False) + (v_proj): Linear(in_features=960, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=960, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=960, out_features=2560, bias=False) + (up_proj): Linear(in_features=960, out_features=2560, bias=False) + (down_proj): Linear(in_features=2560, out_features=960, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((960,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((960,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + ) + (lm_head): Linear(in_features=960, out_features=49280, bias=False) + ) + (lm_expert): LlamaModel( + (embed_tokens): None + (layers): ModuleList( + (0): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (1): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (2): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (3): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (4): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (5): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (6): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (7): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (8): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (9): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (10): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (11): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (12): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (13): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (14): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=720, out_features=320, bias=False) + (v_proj): Linear(in_features=720, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + (15): LlamaDecoderLayer( + (self_attn): LlamaAttention( + (q_proj): Linear(in_features=720, out_features=960, bias=False) + (k_proj): Linear(in_features=320, out_features=320, bias=False) + (v_proj): Linear(in_features=320, out_features=320, bias=False) + (o_proj): Linear(in_features=960, out_features=720, bias=False) + ) + (mlp): LlamaMLP( + (gate_proj): Linear(in_features=720, out_features=2048, bias=False) + (up_proj): Linear(in_features=720, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=720, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((720,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + ) + (state_proj): Linear(in_features=32, out_features=960, bias=True) + (action_in_proj): Linear(in_features=32, out_features=720, bias=True) + (action_out_proj): Linear(in_features=720, out_features=32, bias=True) + (action_time_mlp_in): Linear(in_features=1440, out_features=720, bias=True) + (action_time_mlp_out): Linear(in_features=720, out_features=720, bias=True) + ) + ) +) +(Pdb) policy.config +*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' +(Pdb) policy.input_features +*** AttributeError: 'DistributedDataParallel' object has no attribute 'input_features' +(Pdb) quit() +[rank1]: Traceback (most recent call last): +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 368, in +[rank1]: train() +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner +[rank1]: response = fn(cfg, *args, **kwargs) +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train +[rank1]: train_tracker, output_dict = update_policy( +[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train +[rank1]: train_tracker, output_dict = update_policy( +[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch +[rank1]: return self.dispatch_line(frame) +[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line +[rank1]: if self.quitting: raise BdbQuit +[rank1]: bdb.BdbQuit +W0908 13:25:34.274000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 776 +663 closing signal SIGTERM +E0908 13:25:34.589000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1 +) local_rank: 1 (pid: 776664) of binary: /home/jade_choghari/miniconda3/envs/lerobot/bin/python +Traceback (most recent call last): + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code + exec(code, run_globals) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1245, in + main() + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1241, in main + launch_command(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 1226, in launch_command + multi_gpu_launcher(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin +e 853, in multi_gpu_launcher + distrib_run.run(args) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/run.py", line 883 +, in run + elastic_launch( + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", + line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", + line 270, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +src/lerobot/scripts/train_accelerate.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2025-09-08_13:25:34 + host : hf-dgx-01 + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 776664) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/7_train_acc.sh +Training dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000 +The following values were not passed to `accelerate launch` and had defaults used instead: + More than one GPU was found, enabling multi-GPU training. + If this was unintended please pass in `--num_processes=1`. + `--dynamo_backend` was set to a value of `'no'` +To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`. +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/utils/launch.py:238: UserWarning +: Port `29522` is already in use. Accelerate will attempt to launch in a standalone-like mode by finding an open por +t automatically for this session. If this current attempt fails, or for more control in future runs, please specify +a different port (e.g., `--main_process_port `) or use `--main_process_port 0` for automatic selec +tion in your launch command or Accelerate config file. + warnings.warn( +INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. +WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. +INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, + 'dataset': {'episodes': None, + 'image_transforms': {'enable': False, + 'max_num_transforms': 3, + 'random_order': False, + 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'contrast': {'kwargs': {'contrast': [0.8, + 1.2]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'hue': {'kwargs': {'hue': [-0.05, + 0.05]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'saturation': {'kwargs': {'saturation': [0.5, + 1.5]}, + 'type': 'ColorJitter', + 'weight': 1.0}, + 'sharpness': {'kwargs': {'sharpness': [0.5, + 1.5]}, + 'type': 'SharpnessJitter', + 'weight': 1.0}}}, + 'repo_id': 'HuggingFaceVLA/libero', + 'revision': None, + 'root': '/raid/jade/.cache/huggingface/datasets', + 'use_imagenet_stats': True, + 'video_backend': 'torchcodec'}, + 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', + 'episode_length': 520, + 'features': {'action': {'shape': [7], + 'type': }, + 'agent_pos': {'shape': [8], + 'type': }, + 'pixels/agentview_image': {'shape': [360, 360, 3], + 'type': }, + 'pixels/robot0_eye_in_hand_image': {'shape': [360, + 360, + 3], + 'type': }}, + 'features_map': {'action': 'action', + 'agent_pos': 'observation.state', + 'pixels/agentview_image': 'observation.images.image', + 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, + 'fps': 30, + 'init_states': True, + 'max_parallel_tasks': 5, + 'multitask_eval': True, + 'obs_type': 'pixels_agent_pos', + 'render_mode': 'rgb_array', + 'task': 'libero_spatial', + 'type': 'libero'}, + 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, + 'eval_freq': 0, + 'job_name': 'libero_smolvla', + 'log_freq': 200, + 'num_workers': 4, + 'optimizer': {'betas': [0.9, 0.95], + 'eps': 1e-08, + 'grad_clip_norm': 10, + 'lr': 0.0001, + 'type': 'adamw', + 'weight_decay': 1e-10}, + 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', + 'policy': {'adapt_to_pi_aloha': False, + 'add_image_special_tokens': False, + 'attention_mode': 'cross_attn', + 'chunk_size': 50, + 'device': 'cuda', + 'empty_cameras': 0, + 'expert_width_multiplier': 0.75, + 'freeze_vision_encoder': True, + 'gradient_accumulation_steps': 1, + 'input_features': {}, + 'license': None, + 'load_vlm_weights': False, + 'max_action_dim': 32, + 'max_period': 4.0, + 'max_state_dim': 32, + 'min_period': 0.004, + 'n_action_steps': 1, + 'n_obs_steps': 1, + 'normalization_mapping': {'ACTION': , + 'STATE': , + 'VISUAL': }, + 'num_expert_layers': -1, + 'num_steps': 10, + 'num_vlm_layers': 16, + 'optimizer_betas': [0.9, 0.95], + 'optimizer_eps': 1e-08, + 'optimizer_grad_clip_norm': 10, + 'optimizer_lr': 0.0001, + 'optimizer_weight_decay': 1e-10, + 'output_features': {}, + 'pad_language_to': 'longest', + 'prefix_length': -1, + 'private': None, + 'push_to_hub': True, + 'repo_id': 'None', + 'resize_imgs_with_padding': [512, 512], + 'scheduler_decay_lr': 2.5e-06, + 'scheduler_decay_steps': 30000, + 'scheduler_warmup_steps': 1000, + 'self_attn_every_n_layers': 2, + 'tags': None, + 'tokenizer_max_length': 48, + 'train_expert_only': True, + 'train_state_proj': True, + 'type': 'smolvla', + 'use_amp': True, + 'use_cache': True, + 'use_delta_joint_actions_aloha': False, + 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, + 'resume': False, + 'save_checkpoint': True, + 'save_freq': 20000, + 'scheduler': {'decay_lr': 2.5e-06, + 'num_decay_steps': 30000, + 'num_warmup_steps': 1000, + 'peak_lr': 0.0001, + 'type': 'cosine_decay_with_warmup'}, + 'seed': 1000, + 'steps': 100000, + 'use_policy_training_preset': True, + 'wandb': {'disable_artifact': False, + 'enable': False, + 'entity': None, + 'mode': None, + 'notes': None, + 'project': 'lerobot', + 'run_id': None}} +WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of + 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher +. +INFO 2025-09-08 13:33:47 celerate.py:149 Creating dataset +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 103295.66it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 5229.81it/s] +Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 360601.09it/s] +Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 4881.54it/s] +c +INFO 2025-09-08 13:33:53 celerate.py:160 Creating policy +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +c +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank1]:[W908 13:33:54.613597516 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +Reducing the number of VLM layers to 16 ... +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +[rank0]:[W908 13:34:15.806448425 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b +y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You + can pecify device_id in init_process_group() to force use of a particular device. +INFO 2025-09-08 13:34:15 celerate.py:171 Creating optimizer and scheduler +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin +g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. + warnings.warn( +Reducing the number of VLM layers to 16 ... +INFO 2025-09-08 13:34:36 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla +_lr1e-4bs32steps100000 +INFO 2025-09-08 13:34:36 celerate.py:213 cfg.env.task='libero_spatial' +INFO 2025-09-08 13:34:36 celerate.py:214 cfg.steps=100000 (100K) +INFO 2025-09-08 13:34:36 celerate.py:215 dataset.num_frames=273465 (273K) +INFO 2025-09-08 13:34:36 celerate.py:216 dataset.num_episodes=1693 +INFO 2025-09-08 13:34:36 celerate.py:217 num_learnable_params=99880992 (100M) +INFO 2025-09-08 13:34:36 celerate.py:218 num_total_params=450046220 (450M) +INFO 2025-09-08 13:34:36 celerate.py:219 Number of processes: 2 +INFO 2025-09-08 13:34:36 celerate.py:220 Device: cuda:0 +INFO 2025-09-08 13:34:36 celerate.py:221 Mixed precision: bf16 +INFO 2025-09-08 13:34:36 celerate.py:243 Start offline training on a fixed dataset +[rank1]:[W908 13:34:39.454560620 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru +ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut +ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para +meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m +odel has flow control causing later iterations to have unused parameters. (function operator()) +[rank0]:[W908 13:34:40.502702504 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru +ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut +ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para +meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m +odel has flow control causing later iterations to have unused parameters. (function operator()) +INFO 2025-09-08 13:36:23 celerate.py:281 step:200 smpl:13K ep:79 epch:0.05 loss:0.963 grdn:2.699 lr:2.0e-05 updt_s:0 +.506 data_s:0.027 +INFO 2025-09-08 13:38:09 celerate.py:281 step:400 smpl:26K ep:158 epch:0.09 loss:0.389 grdn:3.127 lr:6.0e-05 updt_s: +0.525 data_s:0.003 +INFO 2025-09-08 13:39:53 celerate.py:281 step:600 smpl:38K ep:238 epch:0.14 loss:0.261 grdn:2.618 lr:9.5e-05 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 13:41:37 celerate.py:281 step:800 smpl:51K ep:317 epch:0.19 loss:0.231 grdn:1.684 lr:9.9e-05 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 13:43:21 celerate.py:281 step:1K smpl:64K ep:396 epch:0.23 loss:0.211 grdn:1.258 lr:9.9e-05 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 13:45:05 celerate.py:281 step:1K smpl:77K ep:475 epch:0.28 loss:0.198 grdn:1.032 lr:9.9e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 13:46:49 celerate.py:281 step:1K smpl:90K ep:555 epch:0.33 loss:0.182 grdn:0.880 lr:9.8e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 13:48:33 celerate.py:281 step:2K smpl:102K ep:634 epch:0.37 loss:0.167 grdn:0.744 lr:9.8e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:50:17 celerate.py:281 step:2K smpl:115K ep:713 epch:0.42 loss:0.157 grdn:0.680 lr:9.7e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:52:01 celerate.py:281 step:2K smpl:128K ep:792 epch:0.47 loss:0.147 grdn:0.612 lr:9.6e-05 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 13:53:44 celerate.py:281 step:2K smpl:141K ep:872 epch:0.51 loss:0.142 grdn:0.576 lr:9.5e-05 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 13:55:27 celerate.py:281 step:2K smpl:154K ep:951 epch:0.56 loss:0.136 grdn:0.523 lr:9.4e-05 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 13:57:11 celerate.py:281 step:3K smpl:166K ep:1K epch:0.61 loss:0.132 grdn:0.509 lr:9.3e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 13:58:57 celerate.py:281 step:3K smpl:179K ep:1K epch:0.66 loss:0.126 grdn:0.492 lr:9.2e-05 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 14:00:43 celerate.py:281 step:3K smpl:192K ep:1K epch:0.70 loss:0.124 grdn:0.467 lr:9.1e-05 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 14:02:26 celerate.py:281 step:3K smpl:205K ep:1K epch:0.75 loss:0.119 grdn:0.438 lr:9.0e-05 updt_s:0 +.508 data_s:0.003 +INFO 2025-09-08 14:04:27 celerate.py:281 step:3K smpl:218K ep:1K epch:0.80 loss:0.118 grdn:0.426 lr:8.9e-05 updt_s:0 +.564 data_s:0.039 +INFO 2025-09-08 14:06:10 celerate.py:281 step:4K smpl:230K ep:1K epch:0.84 loss:0.116 grdn:0.422 lr:8.7e-05 updt_s:0 +.511 data_s:0.004 +INFO 2025-09-08 14:07:55 celerate.py:281 step:4K smpl:243K ep:2K epch:0.89 loss:0.113 grdn:0.395 lr:8.6e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:09:38 celerate.py:281 step:4K smpl:256K ep:2K epch:0.94 loss:0.111 grdn:0.401 lr:8.5e-05 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 14:11:21 celerate.py:281 step:4K smpl:269K ep:2K epch:0.98 loss:0.110 grdn:0.380 lr:8.3e-05 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 14:13:08 celerate.py:281 step:4K smpl:282K ep:2K epch:1.03 loss:0.109 grdn:0.381 lr:8.2e-05 updt_s:0 +.413 data_s:0.119 +INFO 2025-09-08 14:14:52 celerate.py:281 step:5K smpl:294K ep:2K epch:1.08 loss:0.107 grdn:0.387 lr:8.0e-05 updt_s:0 +.373 data_s:0.146 +INFO 2025-09-08 14:16:36 celerate.py:281 step:5K smpl:307K ep:2K epch:1.12 loss:0.107 grdn:0.366 lr:7.8e-05 updt_s:0 +.446 data_s:0.072 +INFO 2025-09-08 14:18:19 celerate.py:281 step:5K smpl:320K ep:2K epch:1.17 loss:0.105 grdn:0.347 lr:7.6e-05 updt_s:0 +.468 data_s:0.045 +INFO 2025-09-08 14:20:01 celerate.py:281 step:5K smpl:333K ep:2K epch:1.22 loss:0.103 grdn:0.350 lr:7.5e-05 updt_s:0 +.510 data_s:0.003 +INFO 2025-09-08 14:21:46 celerate.py:281 step:5K smpl:346K ep:2K epch:1.26 loss:0.101 grdn:0.336 lr:7.3e-05 updt_s:0 +.512 data_s:0.011 +INFO 2025-09-08 14:23:30 celerate.py:281 step:6K smpl:358K ep:2K epch:1.31 loss:0.102 grdn:0.345 lr:7.1e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 14:25:15 celerate.py:281 step:6K smpl:371K ep:2K epch:1.36 loss:0.100 grdn:0.333 lr:6.9e-05 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 14:26:59 celerate.py:281 step:6K smpl:384K ep:2K epch:1.40 loss:0.100 grdn:0.328 lr:6.7e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 14:28:43 celerate.py:281 step:6K smpl:397K ep:2K epch:1.45 loss:0.099 grdn:0.319 lr:6.5e-05 updt_s:0 +.512 data_s:0.003 +INFO 2025-09-08 14:30:26 celerate.py:281 step:6K smpl:410K ep:3K epch:1.50 loss:0.098 grdn:0.313 lr:6.3e-05 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 14:32:11 celerate.py:281 step:7K smpl:422K ep:3K epch:1.54 loss:0.097 grdn:0.319 lr:6.1e-05 updt_s:0 +.519 data_s:0.004 +INFO 2025-09-08 14:33:55 celerate.py:281 step:7K smpl:435K ep:3K epch:1.59 loss:0.097 grdn:0.312 lr:5.9e-05 updt_s:0 +.506 data_s:0.010 +INFO 2025-09-08 14:35:39 celerate.py:281 step:7K smpl:448K ep:3K epch:1.64 loss:0.097 grdn:0.307 lr:5.7e-05 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 14:37:23 celerate.py:281 step:7K smpl:461K ep:3K epch:1.69 loss:0.095 grdn:0.294 lr:5.5e-05 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 14:39:07 celerate.py:281 step:7K smpl:474K ep:3K epch:1.73 loss:0.095 grdn:0.299 lr:5.3e-05 updt_s:0 +.507 data_s:0.007 +INFO 2025-09-08 14:40:52 celerate.py:281 step:8K smpl:486K ep:3K epch:1.78 loss:0.094 grdn:0.283 lr:5.1e-05 updt_s:0 +.523 data_s:0.003 +INFO 2025-09-08 14:42:36 celerate.py:281 step:8K smpl:499K ep:3K epch:1.83 loss:0.093 grdn:0.284 lr:4.9e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:44:22 celerate.py:281 step:8K smpl:512K ep:3K epch:1.87 loss:0.092 grdn:0.284 lr:4.7e-05 updt_s:0 +.465 data_s:0.060 +INFO 2025-09-08 14:46:06 celerate.py:281 step:8K smpl:525K ep:3K epch:1.92 loss:0.093 grdn:0.292 lr:4.5e-05 updt_s:0 +.456 data_s:0.066 +INFO 2025-09-08 14:47:49 celerate.py:281 step:8K smpl:538K ep:3K epch:1.97 loss:0.093 grdn:0.290 lr:4.3e-05 updt_s:0 +.510 data_s:0.003 +INFO 2025-09-08 14:49:37 celerate.py:281 step:9K smpl:550K ep:3K epch:2.01 loss:0.092 grdn:0.283 lr:4.1e-05 updt_s:0 +.419 data_s:0.117 +INFO 2025-09-08 14:51:20 celerate.py:281 step:9K smpl:563K ep:3K epch:2.06 loss:0.092 grdn:0.275 lr:3.9e-05 updt_s:0 +.463 data_s:0.053 +INFO 2025-09-08 14:53:05 celerate.py:281 step:9K smpl:576K ep:4K epch:2.11 loss:0.090 grdn:0.272 lr:3.7e-05 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 14:54:49 celerate.py:281 step:9K smpl:589K ep:4K epch:2.15 loss:0.090 grdn:0.268 lr:3.5e-05 updt_s:0 +.506 data_s:0.013 +INFO 2025-09-08 14:56:32 celerate.py:281 step:9K smpl:602K ep:4K epch:2.20 loss:0.090 grdn:0.271 lr:3.3e-05 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 14:58:17 celerate.py:281 step:10K smpl:614K ep:4K epch:2.25 loss:0.090 grdn:0.268 lr:3.1e-05 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 15:00:02 celerate.py:281 step:10K smpl:627K ep:4K epch:2.29 loss:0.089 grdn:0.261 lr:3.0e-05 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 15:01:48 celerate.py:281 step:10K smpl:640K ep:4K epch:2.34 loss:0.090 grdn:0.271 lr:2.8e-05 updt_s: +0.526 data_s:0.003 +INFO 2025-09-08 15:03:33 celerate.py:281 step:10K smpl:653K ep:4K epch:2.39 loss:0.089 grdn:0.262 lr:2.6e-05 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:05:18 celerate.py:281 step:10K smpl:666K ep:4K epch:2.43 loss:0.090 grdn:0.264 lr:2.4e-05 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 15:07:32 celerate.py:281 step:11K smpl:678K ep:4K epch:2.48 loss:0.089 grdn:0.255 lr:2.3e-05 updt_s: +0.663 data_s:0.004 +INFO 2025-09-08 15:09:21 celerate.py:281 step:11K smpl:691K ep:4K epch:2.53 loss:0.090 grdn:0.263 lr:2.1e-05 updt_s: +0.514 data_s:0.030 +INFO 2025-09-08 15:11:06 celerate.py:281 step:11K smpl:704K ep:4K epch:2.57 loss:0.088 grdn:0.254 lr:1.9e-05 updt_s: +0.517 data_s:0.006 +INFO 2025-09-08 15:12:51 celerate.py:281 step:11K smpl:717K ep:4K epch:2.62 loss:0.088 grdn:0.252 lr:1.8e-05 updt_s: +0.517 data_s:0.005 +INFO 2025-09-08 15:14:38 celerate.py:281 step:11K smpl:730K ep:5K epch:2.67 loss:0.088 grdn:0.251 lr:1.6e-05 updt_s: +0.532 data_s:0.003 +INFO 2025-09-08 15:16:23 celerate.py:281 step:12K smpl:742K ep:5K epch:2.71 loss:0.088 grdn:0.253 lr:1.5e-05 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 15:18:08 celerate.py:281 step:12K smpl:755K ep:5K epch:2.76 loss:0.087 grdn:0.244 lr:1.4e-05 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:19:54 celerate.py:281 step:12K smpl:768K ep:5K epch:2.81 loss:0.088 grdn:0.247 lr:1.2e-05 updt_s: +0.524 data_s:0.003 +INFO 2025-09-08 15:21:39 celerate.py:281 step:12K smpl:781K ep:5K epch:2.86 loss:0.087 grdn:0.242 lr:1.1e-05 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 15:23:32 celerate.py:281 step:12K smpl:794K ep:5K epch:2.90 loss:0.088 grdn:0.243 lr:1.0e-05 updt_s: +0.560 data_s:0.003 +INFO 2025-09-08 15:25:48 celerate.py:281 step:13K smpl:806K ep:5K epch:2.95 loss:0.087 grdn:0.240 lr:9.0e-06 updt_s: +0.674 data_s:0.005 +INFO 2025-09-08 15:28:02 celerate.py:281 step:13K smpl:819K ep:5K epch:3.00 loss:0.088 grdn:0.245 lr:8.0e-06 updt_s: +0.662 data_s:0.004 +INFO 2025-09-08 15:31:06 celerate.py:281 step:13K smpl:832K ep:5K epch:3.04 loss:0.086 grdn:0.236 lr:7.1e-06 updt_s: +0.688 data_s:0.231 +INFO 2025-09-08 15:32:52 celerate.py:281 step:13K smpl:845K ep:5K epch:3.09 loss:0.087 grdn:0.231 lr:6.3e-06 updt_s: +0.521 data_s:0.003 +INFO 2025-09-08 15:35:46 celerate.py:281 step:13K smpl:858K ep:5K epch:3.14 loss:0.088 grdn:0.235 lr:5.6e-06 updt_s: +0.637 data_s:0.232 +INFO 2025-09-08 15:37:34 celerate.py:281 step:14K smpl:870K ep:5K epch:3.18 loss:0.087 grdn:0.238 lr:4.9e-06 updt_s: +0.514 data_s:0.025 +INFO 2025-09-08 15:39:18 celerate.py:281 step:14K smpl:883K ep:5K epch:3.23 loss:0.087 grdn:0.226 lr:4.3e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:41:02 celerate.py:281 step:14K smpl:896K ep:6K epch:3.28 loss:0.087 grdn:0.230 lr:3.8e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:42:45 celerate.py:281 step:14K smpl:909K ep:6K epch:3.32 loss:0.086 grdn:0.229 lr:3.4e-06 updt_s: +0.507 data_s:0.008 +INFO 2025-09-08 15:44:29 celerate.py:281 step:14K smpl:922K ep:6K epch:3.37 loss:0.087 grdn:0.229 lr:3.0e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 15:46:12 celerate.py:281 step:15K smpl:934K ep:6K epch:3.42 loss:0.087 grdn:0.228 lr:2.8e-06 updt_s: +0.502 data_s:0.011 +INFO 2025-09-08 15:47:56 celerate.py:281 step:15K smpl:947K ep:6K epch:3.46 loss:0.086 grdn:0.232 lr:2.6e-06 updt_s: +0.515 data_s:0.004 +INFO 2025-09-08 15:49:39 celerate.py:281 step:15K smpl:960K ep:6K epch:3.51 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: +0.365 data_s:0.147 +INFO 2025-09-08 15:51:22 celerate.py:281 step:15K smpl:973K ep:6K epch:3.56 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: +0.333 data_s:0.179 +INFO 2025-09-08 15:53:07 celerate.py:281 step:15K smpl:986K ep:6K epch:3.60 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s: +0.357 data_s:0.164 +INFO 2025-09-08 15:54:50 celerate.py:281 step:16K smpl:998K ep:6K epch:3.65 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s: +0.365 data_s:0.151 +INFO 2025-09-08 15:56:35 celerate.py:281 step:16K smpl:1M ep:6K epch:3.70 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. +450 data_s:0.071 +INFO 2025-09-08 15:58:19 celerate.py:281 step:16K smpl:1M ep:6K epch:3.74 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0. +495 data_s:0.023 +INFO 2025-09-08 16:00:04 celerate.py:281 step:16K smpl:1M ep:6K epch:3.79 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0. +393 data_s:0.131 +INFO 2025-09-08 16:01:48 celerate.py:281 step:16K smpl:1M ep:6K epch:3.84 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +515 data_s:0.003 +INFO 2025-09-08 16:03:32 celerate.py:281 step:17K smpl:1M ep:7K epch:3.88 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +518 data_s:0.003 +INFO 2025-09-08 16:05:17 celerate.py:281 step:17K smpl:1M ep:7K epch:3.93 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +397 data_s:0.125 +INFO 2025-09-08 16:12:41 celerate.py:281 step:17K smpl:1M ep:7K epch:3.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:1. +496 data_s:0.719 +INFO 2025-09-08 16:14:28 celerate.py:281 step:17K smpl:1M ep:7K epch:4.03 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. +413 data_s:0.124 +INFO 2025-09-08 16:16:13 celerate.py:281 step:17K smpl:1M ep:7K epch:4.07 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. +469 data_s:0.050 +INFO 2025-09-08 16:17:57 celerate.py:281 step:18K smpl:1M ep:7K epch:4.12 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. +375 data_s:0.145 +INFO 2025-09-08 16:19:42 celerate.py:281 step:18K smpl:1M ep:7K epch:4.17 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +333 data_s:0.189 +INFO 2025-09-08 16:21:26 celerate.py:281 step:18K smpl:1M ep:7K epch:4.21 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +334 data_s:0.184 +INFO 2025-09-08 16:23:09 celerate.py:281 step:18K smpl:1M ep:7K epch:4.26 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0. +331 data_s:0.185 +INFO 2025-09-08 16:24:53 celerate.py:281 step:18K smpl:1M ep:7K epch:4.31 loss:0.088 grdn:0.236 lr:2.5e-06 updt_s:0. +333 data_s:0.182 +INFO 2025-09-08 16:26:38 celerate.py:281 step:19K smpl:1M ep:7K epch:4.35 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. +337 data_s:0.188 +INFO 2025-09-08 16:28:22 celerate.py:281 step:19K smpl:1M ep:7K epch:4.40 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0. +420 data_s:0.099 +INFO 2025-09-08 16:30:06 celerate.py:281 step:19K smpl:1M ep:8K epch:4.45 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +444 data_s:0.075 +INFO 2025-09-08 16:31:49 celerate.py:281 step:19K smpl:1M ep:8K epch:4.49 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. +475 data_s:0.036 +INFO 2025-09-08 16:33:33 celerate.py:281 step:19K smpl:1M ep:8K epch:4.54 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. +379 data_s:0.139 +INFO 2025-09-08 16:35:17 celerate.py:281 step:20K smpl:1M ep:8K epch:4.59 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0. +348 data_s:0.171 +INFO 2025-09-08 16:37:01 celerate.py:281 step:20K smpl:1M ep:8K epch:4.63 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. +332 data_s:0.185 +/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: + UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u +ser. + warnings.warn( # warn only once +INFO 2025-09-08 16:38:46 celerate.py:281 step:20K smpl:1M ep:8K epch:4.68 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. +486 data_s:0.037 +INFO 2025-09-08 16:38:46 celerate.py:295 Checkpoint policy after step 20000 +INFO 2025-09-08 16:40:30 celerate.py:281 step:20K smpl:1M ep:8K epch:4.73 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +509 data_s:0.003 +INFO 2025-09-08 16:42:16 celerate.py:281 step:20K smpl:1M ep:8K epch:4.77 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0. +527 data_s:0.003 +INFO 2025-09-08 16:44:01 celerate.py:281 step:21K smpl:1M ep:8K epch:4.82 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0. +519 data_s:0.003 +INFO 2025-09-08 16:45:45 celerate.py:281 step:21K smpl:1M ep:8K epch:4.87 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. +504 data_s:0.013 +INFO 2025-09-08 16:47:29 celerate.py:281 step:21K smpl:1M ep:8K epch:4.91 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. +509 data_s:0.011 +INFO 2025-09-08 16:49:19 celerate.py:281 step:21K smpl:1M ep:8K epch:4.96 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0. +544 data_s:0.003 +INFO 2025-09-08 16:51:04 celerate.py:281 step:21K smpl:1M ep:8K epch:5.01 loss:0.086 grdn:0.225 lr:2.5e-06 updt_s:0. +488 data_s:0.039 +INFO 2025-09-08 16:52:51 celerate.py:281 step:22K smpl:1M ep:9K epch:5.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +430 data_s:0.099 +INFO 2025-09-08 16:54:36 celerate.py:281 step:22K smpl:1M ep:9K epch:5.10 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. +521 data_s:0.003 +INFO 2025-09-08 16:56:23 celerate.py:281 step:22K smpl:1M ep:9K epch:5.15 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +521 data_s:0.014 +INFO 2025-09-08 16:58:09 celerate.py:281 step:22K smpl:1M ep:9K epch:5.20 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. +525 data_s:0.003 +INFO 2025-09-08 17:00:04 celerate.py:281 step:22K smpl:1M ep:9K epch:5.24 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +568 data_s:0.003 +INFO 2025-09-08 17:02:00 celerate.py:281 step:23K smpl:1M ep:9K epch:5.29 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0. +575 data_s:0.003 +INFO 2025-09-08 17:03:49 celerate.py:281 step:23K smpl:1M ep:9K epch:5.34 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. +513 data_s:0.030 +INFO 2025-09-08 17:05:39 celerate.py:281 step:23K smpl:1M ep:9K epch:5.38 loss:0.085 grdn:0.227 lr:2.5e-06 updt_s:0. +523 data_s:0.027 +INFO 2025-09-08 17:07:26 celerate.py:281 step:23K smpl:1M ep:9K epch:5.43 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. +529 data_s:0.003 +INFO 2025-09-08 17:09:12 celerate.py:281 step:23K smpl:1M ep:9K epch:5.48 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. +526 data_s:0.003 +INFO 2025-09-08 17:10:55 celerate.py:281 step:24K smpl:2M ep:9K epch:5.52 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:0. +443 data_s:0.072 +INFO 2025-09-08 17:12:40 celerate.py:281 step:24K smpl:2M ep:9K epch:5.57 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0. +518 data_s:0.004 +INFO 2025-09-08 17:14:25 celerate.py:281 step:24K smpl:2M ep:10K epch:5.62 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 17:16:11 celerate.py:281 step:24K smpl:2M ep:10K epch:5.66 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0 +.523 data_s:0.003 +INFO 2025-09-08 17:17:55 celerate.py:281 step:24K smpl:2M ep:10K epch:5.71 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.515 data_s:0.005 +INFO 2025-09-08 17:19:39 celerate.py:281 step:25K smpl:2M ep:10K epch:5.76 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 +.415 data_s:0.106 +INFO 2025-09-08 17:21:24 celerate.py:281 step:25K smpl:2M ep:10K epch:5.80 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.507 data_s:0.016 +INFO 2025-09-08 17:23:08 celerate.py:281 step:25K smpl:2M ep:10K epch:5.85 loss:0.085 grdn:0.229 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 17:24:54 celerate.py:281 step:25K smpl:2M ep:10K epch:5.90 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0 +.518 data_s:0.008 +INFO 2025-09-08 17:26:41 celerate.py:281 step:25K smpl:2M ep:10K epch:5.94 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.529 data_s:0.003 +INFO 2025-09-08 17:28:24 celerate.py:281 step:26K smpl:2M ep:10K epch:5.99 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 17:30:11 celerate.py:281 step:26K smpl:2M ep:10K epch:6.04 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.370 data_s:0.164 +INFO 2025-09-08 17:31:55 celerate.py:281 step:26K smpl:2M ep:10K epch:6.08 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.385 data_s:0.132 +INFO 2025-09-08 17:33:39 celerate.py:281 step:26K smpl:2M ep:10K epch:6.13 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.450 data_s:0.069 +INFO 2025-09-08 17:35:24 celerate.py:281 step:26K smpl:2M ep:10K epch:6.18 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.468 data_s:0.052 +INFO 2025-09-08 17:37:07 celerate.py:281 step:27K smpl:2M ep:11K epch:6.23 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.004 +INFO 2025-09-08 17:38:52 celerate.py:281 step:27K smpl:2M ep:11K epch:6.27 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.519 data_s:0.003 +INFO 2025-09-08 17:40:40 celerate.py:281 step:27K smpl:2M ep:11K epch:6.32 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.534 data_s:0.003 +INFO 2025-09-08 17:42:57 celerate.py:281 step:27K smpl:2M ep:11K epch:6.37 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.678 data_s:0.007 +INFO 2025-09-08 17:46:13 celerate.py:281 step:27K smpl:2M ep:11K epch:6.41 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.968 data_s:0.009 +INFO 2025-09-08 17:49:20 celerate.py:281 step:28K smpl:2M ep:11K epch:6.46 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.895 data_s:0.037 +INFO 2025-09-08 17:51:22 celerate.py:281 step:28K smpl:2M ep:11K epch:6.51 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.604 data_s:0.003 +INFO 2025-09-08 17:53:07 celerate.py:281 step:28K smpl:2M ep:11K epch:6.55 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 17:54:51 celerate.py:281 step:28K smpl:2M ep:11K epch:6.60 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 17:56:36 celerate.py:281 step:28K smpl:2M ep:11K epch:6.65 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.519 data_s:0.003 +INFO 2025-09-08 17:58:21 celerate.py:281 step:29K smpl:2M ep:11K epch:6.69 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 18:00:06 celerate.py:281 step:29K smpl:2M ep:11K epch:6.74 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.513 data_s:0.011 +INFO 2025-09-08 18:01:50 celerate.py:281 step:29K smpl:2M ep:11K epch:6.79 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.476 data_s:0.041 +INFO 2025-09-08 18:03:34 celerate.py:281 step:29K smpl:2M ep:12K epch:6.83 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.506 data_s:0.012 +INFO 2025-09-08 18:05:21 celerate.py:281 step:29K smpl:2M ep:12K epch:6.88 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 +.455 data_s:0.075 +INFO 2025-09-08 18:07:04 celerate.py:281 step:30K smpl:2M ep:12K epch:6.93 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 18:08:47 celerate.py:281 step:30K smpl:2M ep:12K epch:6.97 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.509 data_s:0.003 +INFO 2025-09-08 18:10:33 celerate.py:281 step:30K smpl:2M ep:12K epch:7.02 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.422 data_s:0.105 +INFO 2025-09-08 18:12:19 celerate.py:281 step:30K smpl:2M ep:12K epch:7.07 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.347 data_s:0.182 +INFO 2025-09-08 18:14:05 celerate.py:281 step:30K smpl:2M ep:12K epch:7.11 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.473 data_s:0.053 +INFO 2025-09-08 18:15:52 celerate.py:281 step:31K smpl:2M ep:12K epch:7.16 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 +.531 data_s:0.005 +INFO 2025-09-08 18:17:37 celerate.py:281 step:31K smpl:2M ep:12K epch:7.21 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.520 data_s:0.003 +INFO 2025-09-08 18:19:22 celerate.py:281 step:31K smpl:2M ep:12K epch:7.26 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.500 data_s:0.020 +INFO 2025-09-08 18:21:06 celerate.py:281 step:31K smpl:2M ep:12K epch:7.30 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 +.511 data_s:0.009 +INFO 2025-09-08 18:22:50 celerate.py:281 step:31K smpl:2M ep:12K epch:7.35 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 18:24:33 celerate.py:281 step:32K smpl:2M ep:13K epch:7.40 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.507 data_s:0.007 +INFO 2025-09-08 18:26:16 celerate.py:281 step:32K smpl:2M ep:13K epch:7.44 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.463 data_s:0.047 +INFO 2025-09-08 18:27:59 celerate.py:281 step:32K smpl:2M ep:13K epch:7.49 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:0 +.509 data_s:0.007 +INFO 2025-09-08 18:29:43 celerate.py:281 step:32K smpl:2M ep:13K epch:7.54 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 18:31:26 celerate.py:281 step:32K smpl:2M ep:13K epch:7.58 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 18:33:11 celerate.py:281 step:33K smpl:2M ep:13K epch:7.63 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.518 data_s:0.003 +INFO 2025-09-08 18:34:57 celerate.py:281 step:33K smpl:2M ep:13K epch:7.68 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.512 data_s:0.016 +INFO 2025-09-08 18:36:41 celerate.py:281 step:33K smpl:2M ep:13K epch:7.72 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 18:38:25 celerate.py:281 step:33K smpl:2M ep:13K epch:7.77 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 18:40:07 celerate.py:281 step:33K smpl:2M ep:13K epch:7.82 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.506 data_s:0.003 +INFO 2025-09-08 18:41:51 celerate.py:281 step:34K smpl:2M ep:13K epch:7.86 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.509 data_s:0.006 +INFO 2025-09-08 18:43:36 celerate.py:281 step:34K smpl:2M ep:13K epch:7.91 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.521 data_s:0.003 +INFO 2025-09-08 18:45:19 celerate.py:281 step:34K smpl:2M ep:13K epch:7.96 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.511 data_s:0.003 +INFO 2025-09-08 18:47:05 celerate.py:281 step:34K smpl:2M ep:14K epch:8.00 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.495 data_s:0.035 +INFO 2025-09-08 18:48:51 celerate.py:281 step:34K smpl:2M ep:14K epch:8.05 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 +.413 data_s:0.112 +INFO 2025-09-08 18:50:34 celerate.py:281 step:35K smpl:2M ep:14K epch:8.10 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 18:52:19 celerate.py:281 step:35K smpl:2M ep:14K epch:8.14 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.520 data_s:0.003 +INFO 2025-09-08 18:54:03 celerate.py:281 step:35K smpl:2M ep:14K epch:8.19 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 18:55:48 celerate.py:281 step:35K smpl:2M ep:14K epch:8.24 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.420 data_s:0.101 +INFO 2025-09-08 18:57:33 celerate.py:281 step:35K smpl:2M ep:14K epch:8.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.506 data_s:0.022 +INFO 2025-09-08 18:59:19 celerate.py:281 step:36K smpl:2M ep:14K epch:8.33 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.525 data_s:0.003 +INFO 2025-09-08 19:01:03 celerate.py:281 step:36K smpl:2M ep:14K epch:8.38 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:02:48 celerate.py:281 step:36K smpl:2M ep:14K epch:8.43 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:04:32 celerate.py:281 step:36K smpl:2M ep:14K epch:8.47 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.505 data_s:0.013 +INFO 2025-09-08 19:06:15 celerate.py:281 step:36K smpl:2M ep:14K epch:8.52 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.384 data_s:0.130 +INFO 2025-09-08 19:07:58 celerate.py:281 step:37K smpl:2M ep:15K epch:8.57 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 +.430 data_s:0.084 +INFO 2025-09-08 19:09:41 celerate.py:281 step:37K smpl:2M ep:15K epch:8.61 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.351 data_s:0.162 +INFO 2025-09-08 19:11:25 celerate.py:281 step:37K smpl:2M ep:15K epch:8.66 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.337 data_s:0.181 +INFO 2025-09-08 19:13:08 celerate.py:281 step:37K smpl:2M ep:15K epch:8.71 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.336 data_s:0.177 +INFO 2025-09-08 19:14:52 celerate.py:281 step:37K smpl:2M ep:15K epch:8.75 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.344 data_s:0.174 +INFO 2025-09-08 19:16:35 celerate.py:281 step:38K smpl:2M ep:15K epch:8.80 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 +.332 data_s:0.182 +INFO 2025-09-08 19:18:18 celerate.py:281 step:38K smpl:2M ep:15K epch:8.85 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.332 data_s:0.182 +INFO 2025-09-08 19:20:01 celerate.py:281 step:38K smpl:2M ep:15K epch:8.89 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 +.337 data_s:0.177 +INFO 2025-09-08 19:21:45 celerate.py:281 step:38K smpl:2M ep:15K epch:8.94 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.495 data_s:0.022 +INFO 2025-09-08 19:23:29 celerate.py:281 step:38K smpl:2M ep:15K epch:8.99 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:25:15 celerate.py:281 step:39K smpl:2M ep:15K epch:9.03 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.436 data_s:0.097 +INFO 2025-09-08 19:26:59 celerate.py:281 step:39K smpl:2M ep:15K epch:9.08 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.378 data_s:0.138 +INFO 2025-09-08 19:28:43 celerate.py:281 step:39K smpl:2M ep:15K epch:9.13 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:0 +.497 data_s:0.023 +INFO 2025-09-08 19:30:27 celerate.py:281 step:39K smpl:3M ep:16K epch:9.17 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:32:14 celerate.py:281 step:39K smpl:3M ep:16K epch:9.22 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.526 data_s:0.003 +INFO 2025-09-08 19:33:57 celerate.py:281 step:40K smpl:3M ep:16K epch:9.27 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 +.506 data_s:0.011 +INFO 2025-09-08 19:35:42 celerate.py:281 step:40K smpl:3M ep:16K epch:9.31 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 +.455 data_s:0.066 +INFO 2025-09-08 19:37:26 celerate.py:281 step:40K smpl:3M ep:16K epch:9.36 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 +.493 data_s:0.026 +INFO 2025-09-08 19:37:26 celerate.py:295 Checkpoint policy after step 40000 +INFO 2025-09-08 19:39:10 celerate.py:281 step:40K smpl:3M ep:16K epch:9.41 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 +.397 data_s:0.114 +INFO 2025-09-08 19:40:53 celerate.py:281 step:40K smpl:3M ep:16K epch:9.45 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 +.344 data_s:0.168 +INFO 2025-09-08 19:42:37 celerate.py:281 step:41K smpl:3M ep:16K epch:9.50 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.480 data_s:0.036 +INFO 2025-09-08 19:44:21 celerate.py:281 step:41K smpl:3M ep:16K epch:9.55 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 19:46:05 celerate.py:281 step:41K smpl:3M ep:16K epch:9.60 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.517 data_s:0.003 +INFO 2025-09-08 19:47:49 celerate.py:281 step:41K smpl:3M ep:16K epch:9.64 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:49:33 celerate.py:281 step:41K smpl:3M ep:16K epch:9.69 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:51:17 celerate.py:281 step:42K smpl:3M ep:16K epch:9.74 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 +.515 data_s:0.003 +INFO 2025-09-08 19:53:00 celerate.py:281 step:42K smpl:3M ep:17K epch:9.78 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 +.513 data_s:0.003 +INFO 2025-09-08 19:54:44 celerate.py:281 step:42K smpl:3M ep:17K epch:9.83 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.516 data_s:0.003 +INFO 2025-09-08 19:56:28 celerate.py:281 step:42K smpl:3M ep:17K epch:9.88 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 +.512 data_s:0.003 +INFO 2025-09-08 19:58:11 celerate.py:281 step:42K smpl:3M ep:17K epch:9.92 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 19:59:55 celerate.py:281 step:43K smpl:3M ep:17K epch:9.97 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 +.514 data_s:0.003 +INFO 2025-09-08 20:01:42 celerate.py:281 step:43K smpl:3M ep:17K epch:10.02 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: +0.476 data_s:0.057 +INFO 2025-09-08 20:03:25 celerate.py:281 step:43K smpl:3M ep:17K epch:10.06 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.471 data_s:0.043 +INFO 2025-09-08 20:05:09 celerate.py:281 step:43K smpl:3M ep:17K epch:10.11 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.515 data_s:0.004 +INFO 2025-09-08 20:06:53 celerate.py:281 step:43K smpl:3M ep:17K epch:10.16 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.013 +INFO 2025-09-08 20:08:36 celerate.py:281 step:44K smpl:3M ep:17K epch:10.20 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 20:10:20 celerate.py:281 step:44K smpl:3M ep:17K epch:10.25 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:12:04 celerate.py:281 step:44K smpl:3M ep:17K epch:10.30 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 20:13:47 celerate.py:281 step:44K smpl:3M ep:18K epch:10.34 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.503 data_s:0.011 +INFO 2025-09-08 20:15:31 celerate.py:281 step:44K smpl:3M ep:18K epch:10.39 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.416 data_s:0.102 +INFO 2025-09-08 20:17:15 celerate.py:281 step:45K smpl:3M ep:18K epch:10.44 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.502 data_s:0.017 +INFO 2025-09-08 20:18:58 celerate.py:281 step:45K smpl:3M ep:18K epch:10.48 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 20:20:41 celerate.py:281 step:45K smpl:3M ep:18K epch:10.53 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.496 data_s:0.017 +INFO 2025-09-08 20:22:24 celerate.py:281 step:45K smpl:3M ep:18K epch:10.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.493 data_s:0.022 +INFO 2025-09-08 20:24:08 celerate.py:281 step:45K smpl:3M ep:18K epch:10.63 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.485 data_s:0.031 +INFO 2025-09-08 20:25:52 celerate.py:281 step:46K smpl:3M ep:18K epch:10.67 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:27:36 celerate.py:281 step:46K smpl:3M ep:18K epch:10.72 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 20:29:19 celerate.py:281 step:46K smpl:3M ep:18K epch:10.77 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 20:31:04 celerate.py:281 step:46K smpl:3M ep:18K epch:10.81 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:32:47 celerate.py:281 step:46K smpl:3M ep:18K epch:10.86 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 20:34:32 celerate.py:281 step:47K smpl:3M ep:18K epch:10.91 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:36:15 celerate.py:281 step:47K smpl:3M ep:19K epch:10.95 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 20:38:01 celerate.py:281 step:47K smpl:3M ep:19K epch:11.00 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.416 data_s:0.114 +INFO 2025-09-08 20:39:45 celerate.py:281 step:47K smpl:3M ep:19K epch:11.05 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.429 data_s:0.086 +INFO 2025-09-08 20:41:28 celerate.py:281 step:47K smpl:3M ep:19K epch:11.09 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: +0.409 data_s:0.106 +INFO 2025-09-08 20:43:12 celerate.py:281 step:48K smpl:3M ep:19K epch:11.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.470 data_s:0.050 +INFO 2025-09-08 20:44:56 celerate.py:281 step:48K smpl:3M ep:19K epch:11.19 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 20:46:41 celerate.py:281 step:48K smpl:3M ep:19K epch:11.23 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 20:48:25 celerate.py:281 step:48K smpl:3M ep:19K epch:11.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 20:50:08 celerate.py:281 step:48K smpl:3M ep:19K epch:11.33 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.505 data_s:0.009 +INFO 2025-09-08 20:51:52 celerate.py:281 step:49K smpl:3M ep:19K epch:11.37 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.449 data_s:0.067 +INFO 2025-09-08 20:53:35 celerate.py:281 step:49K smpl:3M ep:19K epch:11.42 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.420 data_s:0.094 +INFO 2025-09-08 20:55:19 celerate.py:281 step:49K smpl:3M ep:19K epch:11.47 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 20:57:01 celerate.py:281 step:49K smpl:3M ep:19K epch:11.51 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.505 data_s:0.003 +INFO 2025-09-08 20:58:44 celerate.py:281 step:49K smpl:3M ep:20K epch:11.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 21:00:28 celerate.py:281 step:50K smpl:3M ep:20K epch:11.61 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 21:02:11 celerate.py:281 step:50K smpl:3M ep:20K epch:11.65 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.402 data_s:0.110 +INFO 2025-09-08 21:03:54 celerate.py:281 step:50K smpl:3M ep:20K epch:11.70 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.332 data_s:0.184 +INFO 2025-09-08 21:05:37 celerate.py:281 step:50K smpl:3M ep:20K epch:11.75 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: +0.332 data_s:0.182 +INFO 2025-09-08 21:07:21 celerate.py:281 step:50K smpl:3M ep:20K epch:11.80 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.466 data_s:0.049 +INFO 2025-09-08 21:09:05 celerate.py:281 step:51K smpl:3M ep:20K epch:11.84 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:10:49 celerate.py:281 step:51K smpl:3M ep:20K epch:11.89 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.512 data_s:0.004 +INFO 2025-09-08 21:12:32 celerate.py:281 step:51K smpl:3M ep:20K epch:11.94 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.484 data_s:0.032 +INFO 2025-09-08 21:14:17 celerate.py:281 step:51K smpl:3M ep:20K epch:11.98 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.517 data_s:0.004 +INFO 2025-09-08 21:16:03 celerate.py:281 step:51K smpl:3M ep:20K epch:12.03 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.424 data_s:0.105 +INFO 2025-09-08 21:17:46 celerate.py:281 step:52K smpl:3M ep:20K epch:12.08 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.442 data_s:0.073 +INFO 2025-09-08 21:19:30 celerate.py:281 step:52K smpl:3M ep:21K epch:12.12 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.511 data_s:0.007 +INFO 2025-09-08 21:21:15 celerate.py:281 step:52K smpl:3M ep:21K epch:12.17 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.520 data_s:0.003 +INFO 2025-09-08 21:22:59 celerate.py:281 step:52K smpl:3M ep:21K epch:12.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 21:24:43 celerate.py:281 step:52K smpl:3M ep:21K epch:12.26 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 21:26:27 celerate.py:281 step:53K smpl:3M ep:21K epch:12.31 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 21:28:11 celerate.py:281 step:53K smpl:3M ep:21K epch:12.36 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:29:55 celerate.py:281 step:53K smpl:3M ep:21K epch:12.40 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 21:31:39 celerate.py:281 step:53K smpl:3M ep:21K epch:12.45 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:33:23 celerate.py:281 step:53K smpl:3M ep:21K epch:12.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 21:35:08 celerate.py:281 step:54K smpl:3M ep:21K epch:12.54 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-08 21:36:51 celerate.py:281 step:54K smpl:3M ep:21K epch:12.59 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 21:38:36 celerate.py:281 step:54K smpl:3M ep:21K epch:12.64 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:40:18 celerate.py:281 step:54K smpl:3M ep:21K epch:12.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.473 data_s:0.038 +INFO 2025-09-08 21:42:02 celerate.py:281 step:54K smpl:3M ep:22K epch:12.73 loss:0.085 grdn:0.232 lr:2.5e-06 updt_s: +0.408 data_s:0.109 +INFO 2025-09-08 21:43:45 celerate.py:281 step:55K smpl:3M ep:22K epch:12.78 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.377 data_s:0.136 +INFO 2025-09-08 21:45:29 celerate.py:281 step:55K smpl:4M ep:22K epch:12.83 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s: +0.497 data_s:0.022 +INFO 2025-09-08 21:47:12 celerate.py:281 step:55K smpl:4M ep:22K epch:12.87 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 21:48:56 celerate.py:281 step:55K smpl:4M ep:22K epch:12.92 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.429 data_s:0.086 +INFO 2025-09-08 21:50:39 celerate.py:281 step:55K smpl:4M ep:22K epch:12.97 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.454 data_s:0.059 +INFO 2025-09-08 21:52:25 celerate.py:281 step:56K smpl:4M ep:22K epch:13.01 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.459 data_s:0.072 +INFO 2025-09-08 21:54:08 celerate.py:281 step:56K smpl:4M ep:22K epch:13.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.382 data_s:0.132 +INFO 2025-09-08 21:55:51 celerate.py:281 step:56K smpl:4M ep:22K epch:13.11 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.500 data_s:0.016 +INFO 2025-09-08 21:57:36 celerate.py:281 step:56K smpl:4M ep:22K epch:13.15 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 21:59:20 celerate.py:281 step:56K smpl:4M ep:22K epch:13.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 22:01:03 celerate.py:281 step:57K smpl:4M ep:22K epch:13.25 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 22:02:48 celerate.py:281 step:57K smpl:4M ep:23K epch:13.29 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 22:04:32 celerate.py:281 step:57K smpl:4M ep:23K epch:13.34 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:06:14 celerate.py:281 step:57K smpl:4M ep:23K epch:13.39 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.505 data_s:0.005 +bINFO 2025-09-08 22:07:59 celerate.py:281 step:57K smpl:4M ep:23K epch:13.43 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s +:0.496 data_s:0.026 +INFO 2025-09-08 22:09:43 celerate.py:281 step:58K smpl:4M ep:23K epch:13.48 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.438 data_s:0.080 +INFO 2025-09-08 22:11:27 celerate.py:281 step:58K smpl:4M ep:23K epch:13.53 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.444 data_s:0.073 +INFO 2025-09-08 22:13:11 celerate.py:281 step:58K smpl:4M ep:23K epch:13.57 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:14:55 celerate.py:281 step:58K smpl:4M ep:23K epch:13.62 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-08 22:16:39 celerate.py:281 step:58K smpl:4M ep:23K epch:13.67 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:18:22 celerate.py:281 step:59K smpl:4M ep:23K epch:13.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:20:05 celerate.py:281 step:59K smpl:4M ep:23K epch:13.76 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-08 22:21:48 celerate.py:281 step:59K smpl:4M ep:23K epch:13.81 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s: +0.505 data_s:0.008 +INFO 2025-09-08 22:23:31 celerate.py:281 step:59K smpl:4M ep:23K epch:13.85 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.505 data_s:0.008 +INFO 2025-09-08 22:25:15 celerate.py:281 step:59K smpl:4M ep:24K epch:13.90 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:26:58 celerate.py:281 step:60K smpl:4M ep:24K epch:13.95 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.493 data_s:0.022 +INFO 2025-09-08 22:28:42 celerate.py:281 step:60K smpl:4M ep:24K epch:14.00 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:30:29 celerate.py:281 step:60K smpl:4M ep:24K epch:14.04 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.435 data_s:0.101 +INFO 2025-09-08 22:30:29 celerate.py:295 Checkpoint policy after step 60000 +INFO 2025-09-08 22:32:14 celerate.py:281 step:60K smpl:4M ep:24K epch:14.09 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-08 22:33:58 celerate.py:281 step:60K smpl:4M ep:24K epch:14.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 22:35:42 celerate.py:281 step:61K smpl:4M ep:24K epch:14.18 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:37:25 celerate.py:281 step:61K smpl:4M ep:24K epch:14.23 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:39:09 celerate.py:281 step:61K smpl:4M ep:24K epch:14.28 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 22:40:52 celerate.py:281 step:61K smpl:4M ep:24K epch:14.32 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 22:42:35 celerate.py:281 step:61K smpl:4M ep:24K epch:14.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 22:44:18 celerate.py:281 step:62K smpl:4M ep:24K epch:14.42 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 22:46:02 celerate.py:281 step:62K smpl:4M ep:24K epch:14.46 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 22:47:47 celerate.py:281 step:62K smpl:4M ep:25K epch:14.51 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 22:49:30 celerate.py:281 step:62K smpl:4M ep:25K epch:14.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 22:51:14 celerate.py:281 step:62K smpl:4M ep:25K epch:14.60 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 22:52:58 celerate.py:281 step:63K smpl:4M ep:25K epch:14.65 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.484 data_s:0.033 +INFO 2025-09-08 22:54:41 celerate.py:281 step:63K smpl:4M ep:25K epch:14.70 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.501 data_s:0.016 +INFO 2025-09-08 22:56:25 celerate.py:281 step:63K smpl:4M ep:25K epch:14.74 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.436 data_s:0.081 +INFO 2025-09-08 22:58:08 celerate.py:281 step:63K smpl:4M ep:25K epch:14.79 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: +0.436 data_s:0.080 +INFO 2025-09-08 22:59:51 celerate.py:281 step:63K smpl:4M ep:25K epch:14.84 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.344 data_s:0.168 +INFO 2025-09-08 23:01:34 celerate.py:281 step:64K smpl:4M ep:25K epch:14.88 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.479 data_s:0.035 +INFO 2025-09-08 23:03:18 celerate.py:281 step:64K smpl:4M ep:25K epch:14.93 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-08 23:05:02 celerate.py:281 step:64K smpl:4M ep:25K epch:14.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 23:06:47 celerate.py:281 step:64K smpl:4M ep:25K epch:15.02 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.405 data_s:0.119 +INFO 2025-09-08 23:08:30 celerate.py:281 step:64K smpl:4M ep:26K epch:15.07 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: +0.393 data_s:0.121 +INFO 2025-09-08 23:10:13 celerate.py:281 step:65K smpl:4M ep:26K epch:15.12 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.369 data_s:0.142 +INFO 2025-09-08 23:11:56 celerate.py:281 step:65K smpl:4M ep:26K epch:15.17 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: +0.360 data_s:0.156 +INFO 2025-09-08 23:13:40 celerate.py:281 step:65K smpl:4M ep:26K epch:15.21 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.333 data_s:0.182 +INFO 2025-09-08 23:15:23 celerate.py:281 step:65K smpl:4M ep:26K epch:15.26 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.376 data_s:0.136 +INFO 2025-09-08 23:17:05 celerate.py:281 step:65K smpl:4M ep:26K epch:15.31 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: +0.439 data_s:0.069 +INFO 2025-09-08 23:18:49 celerate.py:281 step:66K smpl:4M ep:26K epch:15.35 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.512 data_s:0.006 +INFO 2025-09-08 23:20:32 celerate.py:281 step:66K smpl:4M ep:26K epch:15.40 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-08 23:22:15 celerate.py:281 step:66K smpl:4M ep:26K epch:15.45 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.508 data_s:0.004 +INFO 2025-09-08 23:23:59 celerate.py:281 step:66K smpl:4M ep:26K epch:15.49 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.474 data_s:0.046 +INFO 2025-09-08 23:25:42 celerate.py:281 step:66K smpl:4M ep:26K epch:15.54 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-08 23:27:25 celerate.py:281 step:67K smpl:4M ep:26K epch:15.59 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.487 data_s:0.025 +INFO 2025-09-08 23:29:08 celerate.py:281 step:67K smpl:4M ep:26K epch:15.63 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.005 +INFO 2025-09-08 23:30:50 celerate.py:281 step:67K smpl:4M ep:27K epch:15.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.506 data_s:0.003 +INFO 2025-09-08 23:32:33 celerate.py:281 step:67K smpl:4M ep:27K epch:15.73 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 23:34:16 celerate.py:281 step:67K smpl:4M ep:27K epch:15.77 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.503 data_s:0.014 +INFO 2025-09-08 23:35:59 celerate.py:281 step:68K smpl:4M ep:27K epch:15.82 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 23:37:42 celerate.py:281 step:68K smpl:4M ep:27K epch:15.87 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-08 23:39:26 celerate.py:281 step:68K smpl:4M ep:27K epch:15.91 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-08 23:41:10 celerate.py:281 step:68K smpl:4M ep:27K epch:15.96 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-08 23:42:56 celerate.py:281 step:68K smpl:4M ep:27K epch:16.01 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.469 data_s:0.061 +INFO 2025-09-08 23:44:40 celerate.py:281 step:69K smpl:4M ep:27K epch:16.05 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.339 data_s:0.179 +INFO 2025-09-08 23:46:22 celerate.py:281 step:69K smpl:4M ep:27K epch:16.10 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.386 data_s:0.124 +INFO 2025-09-08 23:48:06 celerate.py:281 step:69K smpl:4M ep:27K epch:16.15 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-08 23:49:50 celerate.py:281 step:69K smpl:4M ep:27K epch:16.20 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 23:51:33 celerate.py:281 step:69K smpl:4M ep:27K epch:16.24 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.501 data_s:0.014 +INFO 2025-09-08 23:53:17 celerate.py:281 step:70K smpl:4M ep:28K epch:16.29 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-08 23:54:59 celerate.py:281 step:70K smpl:4M ep:28K epch:16.34 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.507 data_s:0.005 +INFO 2025-09-08 23:56:43 celerate.py:281 step:70K smpl:4M ep:28K epch:16.38 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-08 23:58:27 celerate.py:281 step:70K smpl:4M ep:28K epch:16.43 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 00:00:12 celerate.py:281 step:70K smpl:5M ep:28K epch:16.48 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.518 data_s:0.003 +INFO 2025-09-09 00:01:55 celerate.py:281 step:71K smpl:5M ep:28K epch:16.52 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:03:37 celerate.py:281 step:71K smpl:5M ep:28K epch:16.57 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 00:05:20 celerate.py:281 step:71K smpl:5M ep:28K epch:16.62 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:07:04 celerate.py:281 step:71K smpl:5M ep:28K epch:16.66 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-09 00:08:47 celerate.py:281 step:71K smpl:5M ep:28K epch:16.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:10:30 celerate.py:281 step:72K smpl:5M ep:28K epch:16.76 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 00:12:13 celerate.py:281 step:72K smpl:5M ep:28K epch:16.80 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:13:58 celerate.py:281 step:72K smpl:5M ep:29K epch:16.85 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.522 data_s:0.003 +INFO 2025-09-09 00:15:40 celerate.py:281 step:72K smpl:5M ep:29K epch:16.90 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.506 data_s:0.003 +INFO 2025-09-09 00:17:23 celerate.py:281 step:72K smpl:5M ep:29K epch:16.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.510 data_s:0.004 +INFO 2025-09-09 00:19:05 celerate.py:281 step:73K smpl:5M ep:29K epch:16.99 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.003 +INFO 2025-09-09 00:20:51 celerate.py:281 step:73K smpl:5M ep:29K epch:17.04 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.437 data_s:0.092 +INFO 2025-09-09 00:22:34 celerate.py:281 step:73K smpl:5M ep:29K epch:17.08 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.489 data_s:0.025 +INFO 2025-09-09 00:24:17 celerate.py:281 step:73K smpl:5M ep:29K epch:17.13 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 00:26:01 celerate.py:281 step:73K smpl:5M ep:29K epch:17.18 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 00:27:45 celerate.py:281 step:74K smpl:5M ep:29K epch:17.22 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 00:29:28 celerate.py:281 step:74K smpl:5M ep:29K epch:17.27 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 00:31:11 celerate.py:281 step:74K smpl:5M ep:29K epch:17.32 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:32:54 celerate.py:281 step:74K smpl:5M ep:29K epch:17.37 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:34:38 celerate.py:281 step:74K smpl:5M ep:29K epch:17.41 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 00:36:21 celerate.py:281 step:75K smpl:5M ep:30K epch:17.46 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 00:38:05 celerate.py:281 step:75K smpl:5M ep:30K epch:17.51 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 00:39:49 celerate.py:281 step:75K smpl:5M ep:30K epch:17.55 loss:0.086 grdn:0.251 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 00:41:32 celerate.py:281 step:75K smpl:5M ep:30K epch:17.60 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.506 data_s:0.005 +INFO 2025-09-09 00:43:14 celerate.py:281 step:75K smpl:5M ep:30K epch:17.65 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.390 data_s:0.122 +INFO 2025-09-09 00:44:58 celerate.py:281 step:76K smpl:5M ep:30K epch:17.69 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.410 data_s:0.107 +INFO 2025-09-09 00:46:41 celerate.py:281 step:76K smpl:5M ep:30K epch:17.74 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.427 data_s:0.085 +INFO 2025-09-09 00:48:24 celerate.py:281 step:76K smpl:5M ep:30K epch:17.79 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: +0.492 data_s:0.025 +INFO 2025-09-09 00:50:08 celerate.py:281 step:76K smpl:5M ep:30K epch:17.83 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 00:51:51 celerate.py:281 step:76K smpl:5M ep:30K epch:17.88 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:53:33 celerate.py:281 step:77K smpl:5M ep:30K epch:17.93 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 00:55:18 celerate.py:281 step:77K smpl:5M ep:30K epch:17.97 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.519 data_s:0.003 +INFO 2025-09-09 00:57:05 celerate.py:281 step:77K smpl:5M ep:31K epch:18.02 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.486 data_s:0.046 +INFO 2025-09-09 00:58:47 celerate.py:281 step:77K smpl:5M ep:31K epch:18.07 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.509 data_s:0.003 +INFO 2025-09-09 01:00:31 celerate.py:281 step:77K smpl:5M ep:31K epch:18.11 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:02:15 celerate.py:281 step:78K smpl:5M ep:31K epch:18.16 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:03:56 celerate.py:281 step:78K smpl:5M ep:31K epch:18.21 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.498 data_s:0.008 +INFO 2025-09-09 01:05:40 celerate.py:281 step:78K smpl:5M ep:31K epch:18.25 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.508 data_s:0.010 +INFO 2025-09-09 01:07:24 celerate.py:281 step:78K smpl:5M ep:31K epch:18.30 loss:0.085 grdn:0.240 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:09:07 celerate.py:281 step:78K smpl:5M ep:31K epch:18.35 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:10:51 celerate.py:281 step:79K smpl:5M ep:31K epch:18.40 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 01:12:35 celerate.py:281 step:79K smpl:5M ep:31K epch:18.44 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:14:17 celerate.py:281 step:79K smpl:5M ep:31K epch:18.49 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 01:16:01 celerate.py:281 step:79K smpl:5M ep:31K epch:18.54 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:17:43 celerate.py:281 step:79K smpl:5M ep:31K epch:18.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: +0.507 data_s:0.003 +INFO 2025-09-09 01:19:26 celerate.py:281 step:80K smpl:5M ep:32K epch:18.63 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:21:09 celerate.py:281 step:80K smpl:5M ep:32K epch:18.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:22:52 celerate.py:281 step:80K smpl:5M ep:32K epch:18.72 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.501 data_s:0.011 +INFO 2025-09-09 01:22:52 celerate.py:295 Checkpoint policy after step 80000 +INFO 2025-09-09 01:24:37 celerate.py:281 step:80K smpl:5M ep:32K epch:18.77 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 01:26:20 celerate.py:281 step:80K smpl:5M ep:32K epch:18.82 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 01:28:05 celerate.py:281 step:81K smpl:5M ep:32K epch:18.86 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 01:29:48 celerate.py:281 step:81K smpl:5M ep:32K epch:18.91 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:31:32 celerate.py:281 step:81K smpl:5M ep:32K epch:18.96 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.517 data_s:0.003 +INFO 2025-09-09 01:33:18 celerate.py:281 step:81K smpl:5M ep:32K epch:19.00 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: +0.487 data_s:0.042 +INFO 2025-09-09 01:35:01 celerate.py:281 step:81K smpl:5M ep:32K epch:19.05 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.461 data_s:0.054 +INFO 2025-09-09 01:36:46 celerate.py:281 step:82K smpl:5M ep:32K epch:19.10 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.450 data_s:0.071 +INFO 2025-09-09 01:38:29 celerate.py:281 step:82K smpl:5M ep:32K epch:19.14 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.451 data_s:0.064 +INFO 2025-09-09 01:40:12 celerate.py:281 step:82K smpl:5M ep:32K epch:19.19 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 01:41:54 celerate.py:281 step:82K smpl:5M ep:33K epch:19.24 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: +0.480 data_s:0.030 +INFO 2025-09-09 01:43:37 celerate.py:281 step:82K smpl:5M ep:33K epch:19.28 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.434 data_s:0.080 +INFO 2025-09-09 01:45:20 celerate.py:281 step:83K smpl:5M ep:33K epch:19.33 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:47:04 celerate.py:281 step:83K smpl:5M ep:33K epch:19.38 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:48:49 celerate.py:281 step:83K smpl:5M ep:33K epch:19.42 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.520 data_s:0.003 +INFO 2025-09-09 01:50:32 celerate.py:281 step:83K smpl:5M ep:33K epch:19.47 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 01:52:15 celerate.py:281 step:83K smpl:5M ep:33K epch:19.52 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:53:59 celerate.py:281 step:84K smpl:5M ep:33K epch:19.57 loss:0.085 grdn:0.236 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 01:55:41 celerate.py:281 step:84K smpl:5M ep:33K epch:19.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.508 data_s:0.003 +INFO 2025-09-09 01:57:24 celerate.py:281 step:84K smpl:5M ep:33K epch:19.66 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: +0.500 data_s:0.013 +INFO 2025-09-09 01:59:07 celerate.py:281 step:84K smpl:5M ep:33K epch:19.71 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 02:00:50 celerate.py:281 step:84K smpl:5M ep:33K epch:19.75 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 02:02:34 celerate.py:281 step:85K smpl:5M ep:34K epch:19.80 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-09 02:04:17 celerate.py:281 step:85K smpl:5M ep:34K epch:19.85 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.511 data_s:0.003 +INFO 2025-09-09 02:05:59 celerate.py:281 step:85K smpl:5M ep:34K epch:19.89 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.507 data_s:0.004 +INFO 2025-09-09 02:07:43 celerate.py:281 step:85K smpl:5M ep:34K epch:19.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.505 data_s:0.011 +INFO 2025-09-09 02:09:26 celerate.py:281 step:85K smpl:5M ep:34K epch:19.99 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.392 data_s:0.124 +INFO 2025-09-09 02:11:14 celerate.py:281 step:86K smpl:5M ep:34K epch:20.03 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.435 data_s:0.100 +INFO 2025-09-09 02:12:57 celerate.py:281 step:86K smpl:5M ep:34K epch:20.08 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 02:14:41 celerate.py:281 step:86K smpl:6M ep:34K epch:20.13 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.516 data_s:0.003 +INFO 2025-09-09 02:16:25 celerate.py:281 step:86K smpl:6M ep:34K epch:20.17 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: +0.514 data_s:0.003 +INFO 2025-09-09 02:18:09 celerate.py:281 step:86K smpl:6M ep:34K epch:20.22 loss:0.085 grdn:0.238 lr:2.5e-06 updt_s: +0.506 data_s:0.010 +INFO 2025-09-09 02:19:53 celerate.py:281 step:87K smpl:6M ep:34K epch:20.27 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.515 data_s:0.003 +INFO 2025-09-09 02:21:35 celerate.py:281 step:87K smpl:6M ep:34K epch:20.31 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.501 data_s:0.007 +INFO 2025-09-09 02:23:17 celerate.py:281 step:87K smpl:6M ep:34K epch:20.36 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: +0.432 data_s:0.080 +INFO 2025-09-09 02:25:01 celerate.py:281 step:87K smpl:6M ep:35K epch:20.41 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.445 data_s:0.073 +INFO 2025-09-09 02:26:43 celerate.py:281 step:87K smpl:6M ep:35K epch:20.45 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.401 data_s:0.107 +INFO 2025-09-09 02:28:26 celerate.py:281 step:88K smpl:6M ep:35K epch:20.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.502 data_s:0.009 +INFO 2025-09-09 02:30:09 celerate.py:281 step:88K smpl:6M ep:35K epch:20.55 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: +0.491 data_s:0.021 +INFO 2025-09-09 02:31:52 celerate.py:281 step:88K smpl:6M ep:35K epch:20.59 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.391 data_s:0.124 +INFO 2025-09-09 02:33:35 celerate.py:281 step:88K smpl:6M ep:35K epch:20.64 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.332 data_s:0.180 +INFO 2025-09-09 02:35:18 celerate.py:281 step:88K smpl:6M ep:35K epch:20.69 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: +0.333 data_s:0.181 +INFO 2025-09-09 02:37:02 celerate.py:281 step:89K smpl:6M ep:35K epch:20.74 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: +0.332 data_s:0.185 +INFO 2025-09-09 02:38:47 celerate.py:281 step:89K smpl:6M ep:35K epch:20.78 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.332 data_s:0.190 +INFO 2025-09-09 02:40:30 celerate.py:281 step:89K smpl:6M ep:35K epch:20.83 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.330 data_s:0.187 +INFO 2025-09-09 02:42:12 celerate.py:281 step:89K smpl:6M ep:35K epch:20.88 loss:0.085 grdn:0.243 lr:2.5e-06 updt_s: +0.331 data_s:0.177 +INFO 2025-09-09 02:43:56 celerate.py:281 step:89K smpl:6M ep:35K epch:20.92 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.330 data_s:0.185 +INFO 2025-09-09 02:45:39 celerate.py:281 step:90K smpl:6M ep:36K epch:20.97 loss:0.087 grdn:0.252 lr:2.5e-06 updt_s: +0.335 data_s:0.182 +INFO 2025-09-09 02:47:25 celerate.py:281 step:90K smpl:6M ep:36K epch:21.02 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.330 data_s:0.197 +INFO 2025-09-09 02:49:08 celerate.py:281 step:90K smpl:6M ep:36K epch:21.06 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.330 data_s:0.182 +INFO 2025-09-09 02:50:51 celerate.py:281 step:90K smpl:6M ep:36K epch:21.11 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.334 data_s:0.181 +INFO 2025-09-09 02:52:34 celerate.py:281 step:90K smpl:6M ep:36K epch:21.16 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.330 data_s:0.183 +INFO 2025-09-09 02:54:17 celerate.py:281 step:91K smpl:6M ep:36K epch:21.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: +0.342 data_s:0.173 +INFO 2025-09-09 02:56:01 celerate.py:281 step:91K smpl:6M ep:36K epch:21.25 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.331 data_s:0.189 +INFO 2025-09-09 02:57:44 celerate.py:281 step:91K smpl:6M ep:36K epch:21.30 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.329 data_s:0.182 +INFO 2025-09-09 02:59:27 celerate.py:281 step:91K smpl:6M ep:36K epch:21.34 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: +0.341 data_s:0.175 +INFO 2025-09-09 03:01:11 celerate.py:281 step:91K smpl:6M ep:36K epch:21.39 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.330 data_s:0.188 +INFO 2025-09-09 03:02:54 celerate.py:281 step:92K smpl:6M ep:36K epch:21.44 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.328 data_s:0.186 +INFO 2025-09-09 03:04:39 celerate.py:281 step:92K smpl:6M ep:36K epch:21.48 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.329 data_s:0.195 +INFO 2025-09-09 03:06:22 celerate.py:281 step:92K smpl:6M ep:36K epch:21.53 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.330 data_s:0.183 +INFO 2025-09-09 03:08:04 celerate.py:281 step:92K smpl:6M ep:37K epch:21.58 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.462 data_s:0.051 +INFO 2025-09-09 03:09:48 celerate.py:281 step:92K smpl:6M ep:37K epch:21.62 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: +0.407 data_s:0.108 +INFO 2025-09-09 03:11:32 celerate.py:281 step:93K smpl:6M ep:37K epch:21.67 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: +0.333 data_s:0.185 +INFO 2025-09-09 03:13:15 celerate.py:281 step:93K smpl:6M ep:37K epch:21.72 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 03:14:58 celerate.py:281 step:93K smpl:6M ep:37K epch:21.77 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.357 data_s:0.156 +INFO 2025-09-09 03:16:41 celerate.py:281 step:93K smpl:6M ep:37K epch:21.81 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.487 data_s:0.027 +INFO 2025-09-09 03:18:25 celerate.py:281 step:93K smpl:6M ep:37K epch:21.86 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 03:20:08 celerate.py:281 step:94K smpl:6M ep:37K epch:21.91 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: +0.512 data_s:0.003 +INFO 2025-09-09 03:21:51 celerate.py:281 step:94K smpl:6M ep:37K epch:21.95 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.508 data_s:0.004 +INFO 2025-09-09 03:23:38 celerate.py:281 step:94K smpl:6M ep:37K epch:22.00 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.429 data_s:0.104 +INFO 2025-09-09 03:25:20 celerate.py:281 step:94K smpl:6M ep:37K epch:22.05 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.328 data_s:0.183 +INFO 2025-09-09 03:27:04 celerate.py:281 step:94K smpl:6M ep:37K epch:22.09 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.329 data_s:0.191 +INFO 2025-09-09 03:28:47 celerate.py:281 step:95K smpl:6M ep:37K epch:22.14 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: +0.329 data_s:0.186 +INFO 2025-09-09 03:30:30 celerate.py:281 step:95K smpl:6M ep:38K epch:22.19 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.346 data_s:0.166 +INFO 2025-09-09 03:32:13 celerate.py:281 step:95K smpl:6M ep:38K epch:22.23 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.334 data_s:0.181 +INFO 2025-09-09 03:33:56 celerate.py:281 step:95K smpl:6M ep:38K epch:22.28 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.402 data_s:0.110 +INFO 2025-09-09 03:35:39 celerate.py:281 step:95K smpl:6M ep:38K epch:22.33 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: +0.353 data_s:0.161 +INFO 2025-09-09 03:37:22 celerate.py:281 step:96K smpl:6M ep:38K epch:22.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: +0.356 data_s:0.158 +INFO 2025-09-09 03:39:04 celerate.py:281 step:96K smpl:6M ep:38K epch:22.42 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.379 data_s:0.131 +INFO 2025-09-09 03:40:49 celerate.py:281 step:96K smpl:6M ep:38K epch:22.47 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: +0.344 data_s:0.175 +INFO 2025-09-09 03:42:32 celerate.py:281 step:96K smpl:6M ep:38K epch:22.51 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: +0.331 data_s:0.185 +INFO 2025-09-09 03:44:15 celerate.py:281 step:96K smpl:6M ep:38K epch:22.56 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: +0.331 data_s:0.183 +INFO 2025-09-09 03:45:58 celerate.py:281 step:97K smpl:6M ep:38K epch:22.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: +0.330 data_s:0.184 +INFO 2025-09-09 03:47:43 celerate.py:281 step:97K smpl:6M ep:38K epch:22.65 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: +0.331 data_s:0.188 +INFO 2025-09-09 03:49:27 celerate.py:281 step:97K smpl:6M ep:38K epch:22.70 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.333 data_s:0.185 +INFO 2025-09-09 03:51:10 celerate.py:281 step:97K smpl:6M ep:39K epch:22.75 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: +0.330 data_s:0.185 +INFO 2025-09-09 03:52:54 celerate.py:281 step:97K smpl:6M ep:39K epch:22.79 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: +0.330 data_s:0.192 +INFO 2025-09-09 03:54:37 celerate.py:281 step:98K smpl:6M ep:39K epch:22.84 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.329 data_s:0.185 +INFO 2025-09-09 03:56:21 celerate.py:281 step:98K smpl:6M ep:39K epch:22.89 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 03:58:04 celerate.py:281 step:98K smpl:6M ep:39K epch:22.94 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.329 data_s:0.185 +INFO 2025-09-09 03:59:46 celerate.py:281 step:98K smpl:6M ep:39K epch:22.98 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: +0.329 data_s:0.183 +INFO 2025-09-09 04:01:32 celerate.py:281 step:98K smpl:6M ep:39K epch:23.03 loss:0.087 grdn:0.250 lr:2.5e-06 updt_s: +0.376 data_s:0.151 +INFO 2025-09-09 04:03:16 celerate.py:281 step:99K smpl:6M ep:39K epch:23.08 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: +0.329 data_s:0.187 +INFO 2025-09-09 04:04:59 celerate.py:281 step:99K smpl:6M ep:39K epch:23.12 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: +0.379 data_s:0.136 +INFO 2025-09-09 04:06:42 celerate.py:281 step:99K smpl:6M ep:39K epch:23.17 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: +0.513 data_s:0.003 +INFO 2025-09-09 04:08:25 celerate.py:281 step:99K smpl:6M ep:39K epch:23.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: +0.510 data_s:0.003 +INFO 2025-09-09 04:10:07 celerate.py:281 step:99K smpl:6M ep:39K epch:23.26 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: +0.470 data_s:0.039 +INFO 2025-09-09 04:11:50 celerate.py:281 step:100K smpl:6M ep:39K epch:23.31 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s +:0.347 data_s:0.169 +INFO 2025-09-09 04:13:34 celerate.py:281 step:100K smpl:6M ep:40K epch:23.36 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s +:0.330 data_s:0.185 +INFO 2025-09-09 04:15:17 celerate.py:281 step:100K smpl:6M ep:40K epch:23.40 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s +:0.356 data_s:0.156 +INFO 2025-09-09 04:15:17 celerate.py:295 Checkpoint policy after step 100000 +INFO 2025-09-09 04:15:18 celerate.py:359 End of training +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear +(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ tmux capture-pane -pS - > tmux_log.txt + + + + + + + + + + From aa40c8c813ee35c8c4289675d00cb75ea7609733 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Wed, 10 Sep 2025 23:24:18 +0200 Subject: [PATCH 03/12] More things --- examples/6_evaluate_libero.sh | 4 +- examples/9_evaluate_must.sh | 2811 +++++++++++++++++ src/lerobot/envs/configs.py | 2 - src/lerobot/envs/factory.py | 47 +- src/lerobot/envs/libero copy.py | 326 ++ src/lerobot/envs/libero.py | 15 +- src/lerobot/envs/libero2.py | 308 ++ src/lerobot/envs/utils.py | 49 + src/lerobot/policies/factory.py | 2 +- .../policies/smolpi0/modeling_smolpi0.py | 110 +- .../policies/smolvla/modeling_smolvla.py | 3 +- src/lerobot/scripts/eval.py | 41 +- 12 files changed, 3670 insertions(+), 48 deletions(-) create mode 100644 examples/9_evaluate_must.sh create mode 100644 src/lerobot/envs/libero copy.py create mode 100644 src/lerobot/envs/libero2.py diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh index ad6ca0f13..c15d71c95 100644 --- a/examples/6_evaluate_libero.sh +++ b/examples/6_evaluate_libero.sh @@ -14,11 +14,11 @@ export HF_DATASETS_OFFLINE=1 export HF_HUB_OFFLINE=1 export TOKENIZERS_PARALLELISM=false export MUJOCO_GL=egl -export CUDA_VISIBLE_DEVICES=3 +export CUDA_VISIBLE_DEVICES=2 # CONFIGURATION POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" -POLICY_PATH="/raid/jade/models/smolvlamust" +POLICY_PATH="/raid/jade/logs/lerobot/lerobot_new_HuggingfaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" TASK=libero_spatial ENV_TYPE="libero" BATCH_SIZE=10 diff --git a/examples/9_evaluate_must.sh b/examples/9_evaluate_must.sh new file mode 100644 index 000000000..534153330 --- /dev/null +++ b/examples/9_evaluate_must.sh @@ -0,0 +1,2811 @@ +#!/bin/bash + +#SBATCH --job-name=lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-node=1 +#SBATCH --mail-type=END,FAIL +#SBATCH --output=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/slurm/lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5.out +###SBATCH --nodelist=jean-zay-a101 +#SBATCH --cpus-per-task=45 +###SBATCH --exclusive +#SBATCH --time=15:00:00 +#SBATCH --mail-user=mustafa.shukor@isir.upmc.fr + +##SBATCH --partition=gpu_p2 +##SBATCH --qos=qos_gpu-t3 +###SBATCH -C v100-32g +##SBATCH -A dyf@v100 + +##SBATCH --partition=gpu_p5 +##SBATCH -C a100 +###SBATCH -A dyf@a100 +##SBATCH -A lqm@a100 +##SBATCH --qos=qos_gpu_a100-dev +##SBATCH --qos=qos_gpu_a100-t3 + +#SBATCH --partition=gpu_p6 +#SBATCH -C h100 +#SBATCH -A lqm@h100 +###SBATCH --qos=qos_gpu_h100-dev +#SBATCH --qos=qos_gpu_h100-t3 + +###SBATCH --begin=now+2hour + +# cd ~/lerobot_pi +# source ~/.bashrc +# source activate lerobot +# export LC_ALL=C + +# rm core-* +export CUDA_VISIBLE_DEVICES=3 +# storage / caches +RAID=/raid/jade +export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers +export HF_HOME=$RAID/.cache/huggingface +export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets +export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot +export WANDB_CACHE_DIR=$RAID/.cache/wandb +export TMPDIR=$RAID/.cache/tmp +mkdir -p $TMPDIR +export WANDB_MODE=offline +export HF_DATASETS_OFFLINE=1 +# export HF_HUB_OFFLINE=1 +export TOKENIZERS_PARALLELISM=false +export MUJOCO_GL=egl +export CUDA_VISIBLE_DEVICES=3 + +PORT=29512 + +## then later +## wandb sync wandb/offline-run-* + + +ENV=libero + +# TASK=libero_10 +TASK=libero_spatial +# TASK=libero_spatial +# TASK=libero_10 +# TASK=libero_spatial + + +POLICY_NAME=smolpi0 + +POLICY=smolpi0 +ENV=libero + + + + + + +CKPT_KEYS_MAPPING=model._orig_mod.//model. +LOAD_VLM_WEIGHTS=true +PEFT_METHOD=freeze +SELF_ATTN_ONLY_ACTIONS=false +CAUSAL_ATTENTION_ON_HISTORY=false + +PREDICT_RELATIVE_ACTIONS=false +RELATIVE_ACTIONS_MODE=first +SHUFFLE_CAMERA_POSITIONS=false + +VLM_IMG_SIZE=-1 +REGRESSION_LOSS=false + + +# ## Baseline for ablation study +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=max_length +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=max_length +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=false +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=max_length +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=self_attn +# STATE_TO_PREFIX=false +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=self_attn +# STATE_TO_PREFIX=false +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2250/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm22b/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest_explay0_vlml0_causalacttrue/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=self_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreffalse/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=false +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0_smolvlm2500/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=self_attn +# STATE_TO_PREFIX=false +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=self_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=8 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml8_causalactfalse_sa0/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=24 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml24_causalactfalse_sa0/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=100 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk100/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=30 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk30/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=10 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk10/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=1 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=16 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay16_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=2 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs2/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=3 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs3_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest/checkpoints/last/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="observation.state" +# N_OBS_STEPS=3 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststates/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="observation.state,image" +# N_OBS_STEPS=3 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststatesimgs/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=1 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.75/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.25 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.25/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="observation.state,image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="observation.state,image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="observation.state,image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1250_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-256M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm12b_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa2_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# PEFT_METHOD=lora +# PEFT_TARGET_MODEL=text +# LORA_TARGET_MODULES=q_proj,v_proj,k_proj +# LORA_R=32 +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# PEFT_METHOD=lora +# PEFT_TARGET_MODEL=text +# LORA_TARGET_MODULES=q_proj,v_proj,k_proj +# LORA_R=32 +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5_loraqkv/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# PEFT_METHOD=lora +# PEFT_TARGET_MODEL=text +# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj +# LORA_R=32 +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# PEFT_METHOD=lora +# PEFT_TARGET_MODEL=text +# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj +# LORA_R=32 +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# PEFT_METHOD=lora +# PEFT_TARGET_MODEL=text +# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj +# LORA_R=32 +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=false +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=true +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saacttrue/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=max_length +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_droptrue_max_length/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=max_length +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=max_length +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptdroidfull/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv3freeze/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2freeze/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans3true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=self_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=self_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=false +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze25_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze50_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze75_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze100_trans1false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans6true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans4true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans7true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans5true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans2true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans8true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans9true/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.25 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=1 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.25 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=1 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="observation.state" +# N_OBS_STEPS=3 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image,observation.state" +# N_OBS_STEPS=3 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image,observation.state" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5100000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image,observation.state" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image,observation.state" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0true_decaylr2.5e-630000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5200000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6200000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6200000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base + + + +# PREDICT_RELATIVE_ACTIONS=true +# RELATIVE_ACTIONS_MODE=relative +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruerelative/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# PREDICT_RELATIVE_ACTIONS=true +# RELATIVE_ACTIONS_MODE=first +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruefirst/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv1v2freezebs64transv0_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# REGRESSION_LOSS=true +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# REGRESSION_LOSS=true +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# REGRESSION_LOSS=true +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + + +# REGRESSION_LOSS=true +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.5 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=0 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=0 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + + +CAUSAL_ATTENTION_ON_HISTORY=true +SELF_ATTN_ONLY_ACTIONS=false +EXPERT_WIDTH_MULTIPLIER=0.75 +PAST_OBS_KEYS="image" +N_OBS_STEPS=1 +NUM_EXPERT_LAYERS=0 +CHUNK_SIZE=50 +NUM_VLM_LAYERS=16 +PAD_LANG_TO=longest +EVAL_CKPT=/raid/jade/models/smolvlamust +ADD_IMAGE_TOKENS=true +ATTN_MODE=cross_attn +STATE_TO_PREFIX=true +CAUSAL_ACTION_ATTENTION_MASK=true +SELF_ATTN_EVERY_N_LAYERS=2 +VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct + + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + +# CAUSAL_ATTENTION_ON_HISTORY=true +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct + + +# # TASK=libero_spatial +# MULTITASK_EVAL=false +# N_EPISODES=50 + +# TASK=libero_spatial,libero_object,libero_goal,libero_10 +MULTITASK_EVAL=true +# N_EPISODES=5 +N_EPISODES=1 + +# MAX_PARRALLEL_TASKS=5 +# MAX_PARRALLEL_TASKS=2 +MAX_PARRALLEL_TASKS=1 + +# NUM_EVALS=2 +# SEEDS=(1000 5000) +SEEDS=(5000) +# ACTION_STEPS_LIST=(1 10 30 50) +ACTION_STEPS_LIST=(1) +# ACTION_STEPS_LIST=(50) +TASK_LIST=(libero_spatial libero_object libero_goal libero_10) +TASK_LIST=(libero_spatial) +for SEED in "${SEEDS[@]}"; do + for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do + for TASK in "${TASK_LIST[@]}"; do + echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS | EVAL SEED=$SEED" + python src/lerobot/scripts/eval.py \ + --output_dir=/raid/jade/logs/lerobot/tmp \ + --env.type=$ENV \ + --env.task=$TASK \ + --eval.batch_size=$N_EPISODES \ + --eval.n_episodes=$N_EPISODES \ + --seed=$SEED \ + --policy.use_amp=false \ + --policy.path=$EVAL_CKPT \ + --policy.n_action_steps=$N_ACTION_STEPS \ + --policy.checkpoint_path=$EVAL_CKPT \ + --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \ + --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ + --policy.attention_mode=$ATTN_MODE \ + --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \ + --policy.state_to_prefix=$STATE_TO_PREFIX \ + --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --policy.pad_language_to=$PAD_LANG_TO \ + --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ + --policy.vlm_model_name=$VLM_NAME \ + --policy.num_vlm_layers=$NUM_VLM_LAYERS \ + --policy.chunk_size=$CHUNK_SIZE \ + --policy.n_obs_steps=$N_OBS_STEPS \ + --policy.past_obs_keys=$PAST_OBS_KEYS \ + --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ + --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ + --policy.peft_method=$PEFT_METHOD \ + --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \ + --policy.causal_attention_on_history=$CAUSAL_ATTENTION_ON_HISTORY \ + --policy.predict_relative_actions=$PREDICT_RELATIVE_ACTIONS --policy.relative_actions_mode=$RELATIVE_ACTIONS_MODE --policy.shuffle_camera_positions=$SHUFFLE_CAMERA_POSITIONS \ + --policy.vlm_img_size=$VLM_IMG_SIZE \ + --policy.regression_loss=$REGRESSION_LOSS + # --policy.peft_config.r=$LORA_R --policy.peft_config.target_modules=$LORA_TARGET_MODULES --policy.peft_method=$PEFT_METHOD --policy.peft_target_model=$PEFT_TARGET_MODEL + + echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS | EVAL SEED=$SEED" + echo "------------------------------------------------------" + done + done +done + + +# ############################################################################################################################################ +# ############################################################################################################################################ +# ############################################################################################################################################ +# ########### Offline eval + + +# # ############################ +# # # Community datasets V1 +# # # REPO_ID=pranavsaroha/so100_legos4,pranavsaroha/so100_onelego2,jpata/so100_pick_place_tangerine,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,HITHY/so100_strawberry,vladfatu/so100_above,koenvanwijk/orange50-1,koenvanwijk/orange50-variation-2,FeiYjf/new_GtoR,CSCSXX/pick_place_cube_1.18,vladfatu/so100_office,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,Ityl/so100_recording2,dragon-95/so100_sorting_2,dragon-95/so100_sorting_3,aractingi/push_cube_offline_data,HITHY/so100_peach3,HITHY/so100_peach4,shreyasgite/so100_legocube_50,shreyasgite/so100_base_env,triton7777/so100_dataset_mix,Deason11/Open_the_drawer_to_place_items,Deason11/PLACE_TAPE_PUSH_DRAWER,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th +# # # Inconsistent actions dim: Deason11/Open_the_drawer_to_place_items, Deason11/PLACE_TAPE_PUSH_DRAWER +# # # Filtered datasets +# # REPO_ID=pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th +# # SAMPLING_WEIGHTS= +# # DATASET_NAME=so100_community_v1 + + +# # # Community datasets V2 +# # # Inconsistent actions: 1g0rrr/sam_openpi_solder1, 1g0rrr/sam_openpi03, 1g0rrr/sam_openpi_solder2 +# # # Other issues: pierfabre/rabbit bensprenger/right_arm_p_brick_in_box_with_y_noise_v0 pierfabre/horse pierfabre/pig2 pierfabre/pig3 pierfabre/cow2,pierfabre/sheep +# # # REPO_ID=Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7 +# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7 +# # SAMPLING_WEIGHTS= +# # DATASET_NAME=so100_community_v2 + +# # Community datasets V1+V2 +# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7,pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th +# REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2 +# SAMPLING_WEIGHTS= + +# # # Community V3 +# # # issues, yskim2025/unitylerobot (version), cranberrysoft/so100 (don't exist),29 datasets different actions: nguyen-v/so100_rotate_red_button satvikahuja/mixer_on_off_new_1 ... +# # REPO_ID=satvikahuja/mixer_on_off_new_1,aergogo/so100_pick_place,andy309/so100_0314_fold_cloths,jchun/so100_pickplace_small_20250323_120056,astroyat/cube,Ofiroz91/so_100_cube2bowl,HappyPablo/dec3_data2,ZCM5115/so100_1210,francescocrivelli/orange_feeding,francescocrivelli/carrot_eating,0x00raghu/toffee_red,0x00raghu/toffee_red_2,0x00raghu/toffee_red_3__,0x00raghu/toffee_blue,0x00raghu/toffee_blue_2,0x00raghu/toffee_to_hand_1,0x00raghu/toffee_to_hand_2,liyitenga/so100_bi_hello,liyitenga/so100_bi_giveme5,ZCM5115/so100_2Arm3cameras_movebox,pranavsaroha/so100_carrot_1,pranavsaroha/so100_carrot_3,pranavsaroha/so100_carrot_4,maximilienroberti/so100_lego_red_box,pranavsaroha/so100_squishy,rabhishek100/so100_train_dataset,pranavsaroha/so100_squishy100,swarajgosavi/kikobot_pusht_real_v2,pandaRQ/pickmed,swarajgosavi/act_kikobot_pusht_real,pranavsaroha/so100_squishy2colors,pranavsaroha/so100_squishy2colors_1,Chojins/chess_game_001_white,jmrog/so100_sweet_pick,Chojins/chess_game_002_white,pranavsaroha/so100_squishy2colors_2_new,Chojins/chess_game_003_white,aractingi/pick_place_lego_cube,Chojins/chess_game_004_white,Chojins/chess_game_005_white,Chojins/chess_game_006_white,Chojins/chess_game_007_white,koenvanwijk/blue2,jlitch/so100multicam3,koenvanwijk/blue52,jlitch/so100multicam6,aractingi/pick_place_lego_cube_1,jlitch/so100multicam7,vladfatu/so100_ds,Chojins/chess_game_000_white,HITHY/so100-kiwi,HITHY/so100_peach1,HITHY/so100_redstrawberry,satvikahuja/orange_mixer_1,satvikahuja/mixer_on_off,satvikahuja/orange_pick_place_new1,satvikahuja/mixer_on_off_new,danmac1/real_real332,FeiYjf/Makalu_push,liyitenga/so100_pick_taffy1,chmadran/so100_dataset04,FeiYjf/Maklu_dataset,FeiYjf/new_Dataset,liyitenga/so100_pick_taffy2,satvikahuja/mixer_on_off_new_4,CSCSXX/pick_place_cube_1.17,liyitenga/so100_pick_taffy3,liyitenga/so100_pick_taffy4,yuz1wan/so100_pick_pink,yuz1wan/so100_pick_wahaha,yuz1wan/so100_pp_pink,yuz1wan/so100_pour_cup,liyitenga/so100_pick_taffy5,liyitenga/so100_pick_taffy6,yuz1wan/so100_button,yuz1wan/so100_pickplace,liyitenga/so100_pick_taffy7,FeiYjf/push_gg,FeiYjf/push_0094,swarajgosavi/act_kikobot_block_real,liyitenga/so100_pick_taffy8,phospho-ai/OrangeBrick3Cameras,vaishanthr/toy_pick_place,SeanLMH/so100_picknplace_v2,pepijn223/yellow_lego_in_box1,DimiSch/so100_50ep_2,DimiSch/so100_50ep_3,SeanLMH/so100_picknplace,nbaron99/so100_pick_and_place2,chmadran/so100_dataset08,vaishanthr/toy_pickplace_50ep,Beegbrain/pick_place_green_block_lr,Ityl/so100_recording1,vaishanthr/toy_pickplace,ad330/so100_box_pickPlace,Beegbrain/so100_put_cube_cup,aractingi/push_green_cube_hf,aractingi/push_green_cube_hf_cropped_resized,carpit680/giraffe_task,carpit680/giraffe_sock_demo_1,DimiSch/so100_terra_50_2,carpit680/giraffe_sock_demo_2,aractingi/push_cube_to_face_reward,aractingi/push_cube_to_face_reward_cropped_resized,aractingi/push_cube_reward_data,aractingi/push_cube_reward_data_cropped_resized,aractingi/push_cube_offline_data_cropped_resized,aractingi/push_cube_front_side_reward,aractingi/push_cube_front_side_reward_cropped_resized,aractingi/push_cube_front_side_reward_long,aractingi/push_cube_front_side_reward_long_cropped_resized,aractingi/push_cube_reward,aractingi/push_cube_reward_cropped_resized,aractingi/push_cube_square_reward_cropped_resized,aractingi/push_cube_square_reward_1,aractingi/push_cube_square_reward_1_cropped_resized,aractingi/push_cube_square_light_reward,aractingi/push_cube_square_light_offline_demo,aractingi/push_cube_square_light_offline_demo_cropped_resized,denghj/dataset_red_tape01,aractingi/push_cube_square_offline_demo,aractingi/push_cube_square_offline_demo_cropped_resized,Beegbrain/stack_two_cubes,FeiYjf/Test_NNNN,LegrandFrederic/Orange-brick-lower-resolution,aractingi/pick_place_lego_cube_cropped_resized,aractingi/push_cube_overfit,aractingi/push_cube_overfit_cropped_resized,HITHY/so100_peach,zaringleb/so100_cube_2,andreasBihlmaier/dual_arm_transfer_2025_02_16,zaringleb/so100_cube_4_binary,1g0rrr/reward_pickplace1,1g0rrr/reward_pickplace1_cropped_resized,FeiYjf/Hold_Pieces,FeiYjf/Grab_Pieces,hegdearyandev/so100_eraser_cup_v1,jbraumann/so100_1902,liyitenga/so100_pick_taffy10,mikechambers/block_cup_5,zaringleb/so100_cube_5_linear,yuz1wan/so100_pickplace_0223_2,yuz1wan/so100_pickplace_0223_3,samsam0510/mj_data_temp,samsam0510/tape_insert_1,samsam0510/tape_insert_2,pengjunkun/so100_push_to_hole,Deason11/Random_Kitchen,1g0rrr/reward_dataset_name2,1g0rrr/reward_dataset_name2_cropped_resized,1g0rrr/offline_dataset_name2,1g0rrr/offline_dataset_name2_cropped_resized,aractingi/push_cube_simp_cropped_resized,danielkr452/so100_work6,Loki0929/so100_100,yuz1wan/so100_fold_0227_1,yuz1wan/so100_fold_0227_2,speedyyoshi/so100_grasp_pink_block,lirislab/stack_two_red_cubes,lirislab/red_cube_into_mug,lirislab/green_lego_block_into_mug,lirislab/green_lego_block_into_mug_easy,kevin510/lerobot-cat-toy-placement,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA_BOX,wangjl1512/pour_water,airthebear/so100_GL,zijian2022/noticehuman1,zijian2022/noticehuman2,kantine/so100_kapla_tower6,zijian2022/noticehuman5,zijian2022/llm40,Ashton3/lerobot-aloha,zijian2022/noticehuman50,AaronNewman/screwdriver_task_batch1,AaronNewman/screwdriver_task_batch2,AaronNewman/screwdriver_task_batch3,zijian2022/noticehuman60,zijian2022/noticehuman70,Bartm3/tape_to_bin,liuhuanjim013/so100_th_1,Pi-robot/barbecue_flip,Pi-robot/barbecue_put,wangjl1512/doll,sshh11/so100_orange_50ep_1,sshh11/so100_orange_50ep_2,DorayakiLin/so100_pick_cube_in_box,Bartm3/tape_to_bin2,luke250305/play_dice_250311.1,andy309/so100_0311_1152,sihyun77/suho_so100,sihyun77/si_so100,shreyasgite/so100_base_left,sihyun77/suho_red,liuhuanjim013/so100_block,andy309/so100_0313_no_wrist_camera,zijian2022/l9,zijian2022/n1_2,DorayakiLin/so100_stack_cube,andy309/so100_0313_no_wrist_camera_with_two_arms_cloths,joaoocruz00/so100_makeitD1,zijian2022/l10_1,zijian2022/l10_5,sihyun77/suho_red2,sihyun77/suho_angel,sihyun77/sihyun_king,acrampette/third_arm_01,Winster/so100_cube,1g0rrr/sam_openpi03,thedevansh/mar16_1336,hkphoooey/throw_stuffie,doujiangwang/task1_10epi_100000step,sihyun77/sihyun_3_17_1,acrampette/third_arm_02,imsyed00/so100_yellowbowl_pickplace_1,kumarhans/so100_tape_task,sihyun77/sihyun_main,doujiangwang/task2_10epi_100000step,kantine/industrial_robothon_buttons_expert,kantine/industrial_robothon_buttons_anomaly,kantine/industrial_robothon_hatchAndProbe_expert,kantine/industrial_robothon_hatchAndProbe_anomaly,Odog16/so100_tea_towel_folding_v1,zijian2022/so100_318,zijian2022/so100_318_1,Congying1112/so100_place_blue_bottle_with_two_cameras,Congying1112/so100_place_blue_bottle_with_two_cameras2,Congying1112/so100_place_blue_bottle_with_single_camera,pietroom/first_task_short,kantine/industrial_screws_sorting_expert,kantine/industrial_screws_sorting_anomaly,pietroom/second_task,zijian2022/c0,doujiangwang/task4_10epi_100000step,Congying1112/so100_switch_with_onhand_camera,HYAIYN/so100_get_orange_10epi,doujiangwang/task5_10epi_100000step,1g0rrr/sam_openpi_cube_low10,1g0rrr/sam_openpi_cube_top10,1g0rrr/sam_openpi_wire10,1g0rrr/sam_openpi_solder1,1g0rrr/sam_openpi_solder2,wcode/so100_put_pen_50,jchun/so100_pickplace_small_20250322_193929,bnarin/so100_tic_tac_toe_we_do_it_live,dc2ac/so100-t5,chmadran/so100_home_dataset,baladhurgesh97/so100_final_picking_3,bnarin/so100_tic_tac_toe_move_0_0,bnarin/so100_tic_tac_toe_move_1_0,bnarin/so100_tic_tac_toe_move_2_1,bnarin/so100_tic_tac_toe_move_4_0,zaringleb/so100_cube_6_2d,andlyu/so100_indoor_0,andlyu/so100_indoor_2,Winster/so100_sim,badwolf256/so100_twin_cam_duck,Congying1112/so100_simplepick_with_2_cameras_from_top,andlyu/so100_indoor_4,Zak-Y/so100_grap_dataset,kantine/domotic_pouringCoffee_expert,kantine/domotic_pouringCoffee_anomaly,lucasngoo/so100_strawberry_grape,kantine/domotic_makingCoffee_expert,kantine/domotic_makingCoffee_anomaly,ZGGZZG/so100_drop1,kantine/industrial_soldering_expert,kantine/industrial_soldering_anomaly,Yotofu/so100_sweeper_shoes,kantine/domotic_dishTidyUp_expert,kantine/domotic_dishTidyUp_anomaly,kantine/domotic_groceriesSorting_expert,kantine/domotic_groceriesSorting_anomaly,badwolf256/so100_twin_cam_duck_v2,kantine/domotic_vegetagblesAndFruitsSorting_expert,kantine/domotic_vegetagblesAndFruitsSorting_anomaly,kantine/domotic_setTheTable_expert,kantine/domotic_setTheTable_anomaly,therarelab/so100_pick_place,abhisb/so100_51_ep,andlyu/so100_indoor_val_0,allenchienxxx/so100Test,lizi178119985/so100_jia,badwolf256/so100_twin_cam_duck_v3,andrewcole712/so100_tape_bin_place,Gano007/so100_lolo,Zak-Y/so100_three_cameras_dataset,Gano007/so100_doliprane,XXRRSSRR/so100_v3_num_episodes_50,zijian2022/assemblyarm2,ganker5/so100_action_20250403,andlyu/so100_indoor_val2,Gano007/so100_gano,paszea/so100_whale_grab,paszea/so100_whale,Clementppr/lerobot_pick_and_place_dataset_world_model,andlyu/so100_indoor_10,RasmusP/so100_dataset50ep_a,RasmusP/so100_dataset50ep,Gano007/so100_second,zaringleb/so100_cude_linear_and_2d_comb,dsfsg/grasp_pens,zijian2022/digitalfix,zijian2022/digitalfix2,zijian2022/digitalfix3,T1g3rGE/so100_pickplace_small_20250407_171912,sihyun77/mond_13,abokinala/sputnik_100_11_pick_place_container,dsfsg/bring_bottle,duthvik/sputnik_100_13_pick_place_container,abokinala/sputnik_100_12_pick_place_container,Mwuqiu/so100_0408,AK51/4090_01,356c/so100_rope_reposition_1,paszea/so100_lego_mix,abokinala/sputnik_100_14_pick_place_container,abokinala/sputnik_100_23_pick_place_surface,jiajun001/eraser00_2,jlesein/TestBoulon2,duthvik/sputnik_100_31_pour_liquid,duthvik/sputnik_100_24_pick_place_surface,duthvik/sputnik_100_25_pick_place_surface,duthvik/sputnik_100_17_pick_place_container,duthvik/sputnik_100_26_pick_place_surface,VoicAndrei/so100_banana_to_plate_rebel_full,isadev/bougies1,danaaubakirova/so100_task_1,danaaubakirova/so100_task_2,danaaubakirova/so100_task_3,danaaubakirova/so100_task_4,sixpigs1/so100_pick_cube_in_box_error,sixpigs1/so100_push_cube_error,sixpigs1/so100_pull_cube_error,isadev/bougies2,therarelab/med_dis_rare_6,duthvik/sputnik_100_27_pick_place_surface,zijian2022/closer3,duthvik/sputnik_100_41_custom_tasks,duthvik/sputnik_100_42_custom_tasks,duthvik/sputnik_100_43_custom_tasks,duthvik/sputnik_100_44_custom_tasks,duthvik/sputnik_100_51_kitchen_tasks,duthvik/sputnik_100_52_kitchen_tasks,duthvik/sputnik_100_53_kitchen_tasks,duthvik/sputnik_100_45_custom_tasks,duthvik/sputnik_100_32_pour_liquid,duthvik/sputnik_100_29_pick_place_surface,duthvik/sputnik_100_18_pick_place_container,sixpigs1/so100_pull_cube_by_tool_error,sixpigs1/so100_insert_cylinder_error,abokinala/sputnik_100_54_kitchen_tasks,abokinala/sputnik_100_55_kitchen_tasks,m1b/so100_bluelego,abokinala/sputnik_100_46_custom_tasks,m1b/so100_bluelego_updt,kantine/flip_A0,kantine/flip_A1,kantine/flip_A2,kantine/flip_A3,lirislab/guess_who_no_cond,kantine/flip_A4,kantine/flip_A5,lirislab/guess_who_lighting,nguyen-v/so100_press_red_button,nguyen-v/so100_bimanual_grab_lemon_put_in_box2,pierfabre/cow,nguyen-v/press_red_button_new,nguyen-v/so100_rotate_red_button,raghav-katta-1/lerobot2,Cidoyi/so100_all_notes,roboticshack/team10-red-block,Cidoyi/so100_all_notes_1,roboticshack/team_5-QuiEstCe_everyBox,roboticshack/team11_pianobot,roboticshack/team2-guess_who_so100,roboticshack/team2-guess_who_so100_light,roboticshack/team2-guess_who_so100_edge_case,roboticshack/team2-guess_who_less_ligth,Cidoyi/so100_all_notes_3,dsfsg/grasp_pen_and_bottle,abokinala/sputnik_100_60_kitchen_tasks,abokinala/sputnik_100_58_kitchen_tasks,danaaubakirova/so100_v2_task_1,danaaubakirova/so100_v2_task_2,danaaubakirova/so100_v2_task_3,danaaubakirova/so100_v2_task_4,zijian2022/force1,zijian2022/force2,zijian2022/force3,jiajun001/eraser00_3,zijian2022/bi2,zijian2022/bi1,zijian2022/hand1,Setchii/so100_grab_ball,MossProphet/so100_square-1-2-3.2 +# # SAMPLING_WEIGHTS= +# # DATASET_NAME=so100_community_v3 + +# ########################## + +# ROBOT=so100 +# export TOKENIZERS_PARALLELISM=false +# export MUJOCO_GL=egl + + + +# SAMPLING_WEIGHTS= +# FEATURES_VERSION=2 +# NUM_IMAGE_TRANSFORMS=10 +# TRAIN_ON_ALL_FEATURES=true +# NORM_PER_ROBOT=true +# USE_IMAGENET_STATS=false + +# MAX_STATE_DIM=6 +# MAX_ACTION_DIM=6 +# MAX_NUM_IMAGES=3 +# MAX_IMAGE_DIM=256 + + +# SEED=5000 +# BATCH_SIZE=32 +# # EVAL_STEPS=1000 +# EVAL_STEPS=100 + + + + + +# SELF_ATTN_ONLY_ACTIONS=false +# EXPERT_WIDTH_MULTIPLIER=0.75 +# PAST_OBS_KEYS="image" +# N_OBS_STEPS=1 +# NUM_EXPERT_LAYERS=0 +# CHUNK_SIZE=50 +# NUM_VLM_LAYERS=16 +# PAD_LANG_TO=longest +# EVAL_CKPT=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/lerobot/lerobot_so100_community_v1_v2_smolpi0_lr1e-4bs64steps200000gpus4freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_feat2_lrvlm1e-4_droptrue_max_length/checkpoints/080000/pretrained_model/ +# ADD_IMAGE_TOKENS=true +# ATTN_MODE=cross_attn +# STATE_TO_PREFIX=true +# CAUSAL_ACTION_ATTENTION_MASK=true +# SELF_ATTN_EVERY_N_LAYERS=2 +# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct + + +# python lerobot/scripts/offline_inference.py \ +# --output_dir=$WORK/logs/lerobot/tmp \ +# --batch_size=$BATCH_SIZE \ +# --seed=$SEED \ +# --eval_steps=$EVAL_STEPS \ +# --use_amp=false \ +# --device=cuda \ +# --dataset.repo_id=$REPO_ID --dataset.local_files_only=true --dataset.sampling_weights=$SAMPLING_WEIGHTS --dataset.use_imagenet_stats=$USE_IMAGENET_STATS --policy.normalize_per_robot_type=$NORM_PER_ROBOT \ +# --dataset.image_transforms.max_num_transforms=$NUM_IMAGE_TRANSFORMS --dataset.image_transforms.enable=true --dataset.train_on_all_features=$TRAIN_ON_ALL_FEATURES \ +# --dataset.max_action_dim=$MAX_ACTION_DIM --dataset.max_state_dim=$MAX_STATE_DIM --dataset.max_num_images=$MAX_NUM_IMAGES --dataset.max_image_dim=$MAX_IMAGE_DIM --dataset.features_version=$FEATURES_VERSION \ +# --policy.type=$POLICY \ +# --policy.checkpoint_path=$EVAL_CKPT \ +# --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \ +# --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ +# --policy.attention_mode=$ATTN_MODE \ +# --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \ +# --policy.state_to_prefix=$STATE_TO_PREFIX \ +# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ +# --policy.vlm_model_name=$VLM_NAME \ +# --policy.pad_language_to=$PAD_LANG_TO \ +# --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ +# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ +# --policy.chunk_size=$CHUNK_SIZE \ +# --policy.n_obs_steps=$N_OBS_STEPS \ +# --policy.past_obs_keys=$PAST_OBS_KEYS \ +# --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ +# --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ +# --policy.peft_method=$PEFT_METHOD \ +# --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \ +# --policy.robot_type=$ROBOT + + + + +# MULTITASK_EVAL=true +# N_EPISODES=5 +# MAX_PARRALLEL_TASKS=1 +# ACTION_STEPS_LIST=(1) +# TASK_LIST=(libero_10) +# for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do +# for TASK in "${TASK_LIST[@]}"; do +# echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS" +# python lerobot/scripts/eval.py \ +# --output_dir=$WORK/logs/lerobot/tmp \ +# --env.type=$ENV \ +# --env.task=$TASK \ +# --eval.batch_size=$N_EPISODES \ +# --eval.n_episodes=$N_EPISODES \ +# --use_amp=false \ +# --device=cuda \ +# --policy.n_action_steps=$N_ACTION_STEPS \ +# --policy.type=$POLICY \ +# --policy.checkpoint_path=$EVAL_CKPT \ +# --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \ +# --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \ +# --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ +# --policy.attention_mode=$ATTN_MODE \ +# --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \ +# --policy.state_to_prefix=$STATE_TO_PREFIX \ +# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ +# --policy.vlm_model_name=$VLM_NAME \ +# --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ +# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ +# --policy.chunk_size=$CHUNK_SIZE + +# echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS" +# echo "------------------------------------------------------" +# done +# done + diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py index 5c648de70..e965446db 100644 --- a/src/lerobot/envs/configs.py +++ b/src/lerobot/envs/configs.py @@ -320,8 +320,6 @@ class LiberoEnv(EnvConfig): @property def gym_kwargs(self) -> dict: return { - # "task": self.task, "obs_type": self.obs_type, "render_mode": self.render_mode, - # "max_episode_steps": self.episode_length, } diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py index be49a9990..211b41714 100644 --- a/src/lerobot/envs/factory.py +++ b/src/lerobot/envs/factory.py @@ -56,37 +56,36 @@ def make_env( names to indexed vectorized environments (when multitask eval is used). """ - if n_envs < 1: - raise ValueError("`n_envs must be at least 1") + if n_envs < 1: + raise ValueError("`n_envs` must be at least 1") - # batched version of the env that returns an observation of shape (b, c) - env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv + env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv - if "libero" in cfg.type: - from lerobot.envs.libero import create_libero_envs + + if "libero" in cfg.type: + from lerobot.envs.libero import create_libero_envs + return create_libero_envs( + task=cfg.task, + n_envs=n_envs, + camera_name=cfg.camera_name, + init_states=cfg.init_states, + gym_kwargs=cfg.gym_kwargs, + env_cls=env_cls, + multitask_eval=cfg.multitask_eval, + ) - env = create_libero_envs( - task=cfg.task, - n_envs=n_envs, - camera_name=cfg.camera_name, - init_states=cfg.init_states, - gym_kwargs=cfg.gym_kwargs, - env_cls=env_cls, - multitask_eval=cfg.multitask_eval, - ) - else: + package_name = f"gym_{cfg.type}" try: importlib.import_module(package_name) except ModuleNotFoundError as e: - print( - f"{package_name} is not installed. Please install it with `pip install 'lerobot[{cfg.type}]'`" - ) - raise e + raise ModuleNotFoundError( + f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\"" + ) from e gym_handle = f"{package_name}/{cfg.task}" - env = env_cls( - [lambda: gym.make(gym_handle, disable_env_checker=True, **cfg.gym_kwargs) for _ in range(n_envs)] - ) + + def _make_one(): + return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {})) - return env + return env_cls([_make_one for _ in range(n_envs)]) diff --git a/src/lerobot/envs/libero copy.py b/src/lerobot/envs/libero copy.py new file mode 100644 index 000000000..83ccd2fb9 --- /dev/null +++ b/src/lerobot/envs/libero copy.py @@ -0,0 +1,326 @@ +import math +import os +from collections import defaultdict +from collections.abc import Callable +from itertools import chain +from typing import Any + +import gymnasium as gym +import numpy as np +import torch +from gymnasium import spaces +from libero.libero import benchmark, get_libero_path +from libero.libero.envs import OffScreenRenderEnv + + +def create_libero_envs( + task: str, + n_envs: int, + gym_kwargs: dict[str, Any] = None, + camera_name: str = "agentview_image,robot0_eye_in_hand_image", + init_states: bool = True, + env_cls: Callable = None, + multitask_eval: bool = True, +) -> dict[str, dict[str, Any]]: + """ + Here n_envs is per task and equal to the number of rollouts. + Returns: + dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs. + """ + print("num envs", n_envs) + print("multitask_eval", multitask_eval) + print("gym_kwargs", gym_kwargs) + if gym_kwargs is None: + gym_kwargs = {} + + if not multitask_eval: + benchmark_dict = benchmark.get_benchmark_dict() + task_suite = benchmark_dict[task]() # can also choose libero_spatial, libero_object, libero_10 etc. + tasks_id = list(range(len(task_suite.tasks))) + episode_indices = [0 for i in range(len(tasks_id))] + if len(tasks_id) == 1: + tasks_id = [tasks_id[0] for _ in range(n_envs)] + episode_indices = list(range(n_envs)) + elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0: + n_repeat = n_envs // len(tasks_id) + print("n_repeat", n_repeat) + episode_indices = [] + for _ in range(len(tasks_id)): + episode_indices.extend(list(range(n_repeat))) + tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id])) + elif n_envs < len(tasks_id): + tasks_id = tasks_id[:n_envs] + episode_indices = list(range(n_envs))[:n_envs] + print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}") + print(f"Creating Libero envs with task ids {tasks_id} from suite {task}") + assert n_envs == len(tasks_id), ( + f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}" + ) + return env_cls( + [ + lambda i=i: LiberoEnv( + task_suite=task_suite, + task_id=tasks_id[i], + task_suite_name=task, + camera_name=camera_name, + init_states=init_states, + episode_index=episode_indices[i], + **gym_kwargs, + ) + for i in range(n_envs) + ] + ) + else: + envs = defaultdict(dict) + benchmark_dict = benchmark.get_benchmark_dict() + task = task.split(",") + for _task in task: + task_suite = benchmark_dict[ + _task + ]() # can also choose libero_spatial, libero_object, libero_10 etc. + tasks_ids = list(range(len(task_suite.tasks))) + for tasks_id in tasks_ids: + episode_indices = list(range(n_envs)) + print( + f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}" + ) + envs_list = [ + ( + lambda i=i, + task_suite=task_suite, + tasks_id=tasks_id, + _task=_task, + episode_indices=episode_indices: LiberoEnv( + task_suite=task_suite, + task_id=tasks_id, + task_suite_name=_task, + camera_name=camera_name, + init_states=init_states, + episode_index=episode_indices[i], + **gym_kwargs, + ) + ) + for i in range(n_envs) + ] + envs[_task][tasks_id] = env_cls(envs_list) + return envs + + +def quat2axisangle(quat): + """ + Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 + + Converts quaternion to axis-angle format. + Returns a unit vector direction scaled by its angle in radians. + + Args: + quat (np.array): (x,y,z,w) vec4 float angles + + Returns: + np.array: (ax,ay,az) axis-angle exponential coordinates + """ + # clip quaternion + if quat[3] > 1.0: + quat[3] = 1.0 + elif quat[3] < -1.0: + quat[3] = -1.0 + + den = np.sqrt(1.0 - quat[3] * quat[3]) + if math.isclose(den, 0.0): + # This is (close to) a zero degree rotation, immediately return + return np.zeros(3) + + return (quat[:3] * 2.0 * math.acos(quat[3])) / den + + +def get_task_init_states(task_suite, i): + init_states_path = os.path.join( + get_libero_path("init_states"), + task_suite.tasks[i].problem_folder, + task_suite.tasks[i].init_states_file, + ) + init_states = torch.load(init_states_path, weights_only=False) # nosec B614 + return init_states + + +def get_libero_dummy_action(): + """Get dummy/no-op action, used to roll out the simulation while the robot does nothing.""" + return [0, 0, 0, 0, 0, 0, -1] + + +OBS_STATE_DIM = 8 +ACTION_DIM = 7 + + +class LiberoEnv(gym.Env): + metadata = {"render_modes": ["rgb_array"], "render_fps": 80} + + def __init__( + self, + task_suite, + task_id, + task_suite_name, + camera_name="agentview_image,robot0_eye_in_hand_image", + obs_type="pixels", + render_mode="rgb_array", + observation_width=256, + observation_height=256, + visualization_width=640, + visualization_height=480, + init_states=True, + episode_index=0, + ): + super().__init__() + self.task_id = task_id + self.obs_type = obs_type + self.render_mode = render_mode + self.observation_width = observation_width + self.observation_height = observation_height + self.visualization_width = visualization_width + self.visualization_height = visualization_height + self.init_states = init_states + self.camera_name = camera_name.split( + "," + ) # agentview_image (main) or robot0_eye_in_hand_image (wrist) + + # Map raw camera names to "image1" and "image2". + # The preprocessing step `preprocess_observation` will then prefix these with `.images.*`, + # following the LeRobot convention (e.g., `observation.images.image`, `observation.images.image2`). + # This ensures the policy consistently receives observations in the + # expected format regardless of the original camera naming. + self.camera_name_mapping = { + "agentview_image": "image", + "robot0_eye_in_hand_image": "image2", + } + + self.num_steps_wait = ( + 10 # Do nothing for the first few timesteps to wait for the simulator drops objects + ) + self.episode_index = episode_index + + self._env = self._make_envs_task(task_suite, self.task_id) + if task_suite_name == "libero_spatial": + max_steps = 220 # longest training demo has 193 steps + elif task_suite_name == "libero_object": + max_steps = 280 # longest training demo has 254 steps + elif task_suite_name == "libero_goal": + max_steps = 300 # longest training demo has 270 steps + elif task_suite_name == "libero_10": + max_steps = 520 # longest training demo has 505 steps + elif task_suite_name == "libero_90": + max_steps = 400 # longest training demo has 373 steps + self._max_episode_steps = max_steps + + images = {} + for cam in self.camera_name: + images[self.camera_name_mapping[cam]] = spaces.Box( + low=0, + high=255, + shape=(self.observation_height, self.observation_width, 3), + dtype=np.uint8, + ) + + if self.obs_type == "state": + raise NotImplementedError() + elif self.obs_type == "pixels": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(images), + } + ) + elif self.obs_type == "pixels_agent_pos": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(images), + "agent_pos": spaces.Box( + low=-1000.0, + high=1000.0, + shape=(OBS_STATE_DIM,), + dtype=np.float64, + ), + } + ) + + self.action_space = spaces.Box(low=-1, high=1, shape=(ACTION_DIM,), dtype=np.float32) + + def render(self): + raw_obs = self._env.env._get_observations() + image = self._format_raw_obs(raw_obs)["pixels"]["image"] + return image + + def _make_envs_task(self, task_suite, task_id: int = 0): + task = task_suite.get_task(task_id) + self.task = task.name + self.task_description = task.language + task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file) + + env_args = { + "bddl_file_name": task_bddl_file, + "camera_heights": self.observation_height, + "camera_widths": self.observation_width, + } + env = OffScreenRenderEnv(**env_args) + env.reset() + if self.init_states: + init_states = get_task_init_states( + task_suite, task_id + ) # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()? + init_state_id = self.episode_index # episode index + env.set_init_state(init_states[init_state_id]) + + return env + + def _format_raw_obs(self, raw_obs): + images = {} + for camera_name in self.camera_name: + image = raw_obs[camera_name] + image = image[::-1, ::-1] # rotate 180 degrees + images[self.camera_name_mapping[camera_name]] = image + state = np.concatenate( + ( + raw_obs["robot0_eef_pos"], + quat2axisangle(raw_obs["robot0_eef_quat"]), + raw_obs["robot0_gripper_qpos"], + ) + ) + agent_pos = state + if self.obs_type == "state": + raise NotImplementedError() + elif self.obs_type == "pixels": + obs = {"pixels": images.copy()} + elif self.obs_type == "pixels_agent_pos": + obs = { + "pixels": images.copy(), + "agent_pos": agent_pos, + } + return obs + + def reset(self, seed=None, **kwargs): + super().reset(seed=seed) + + self._env.seed(seed) + raw_obs = self._env.reset() + # Do nothing for the first few timesteps to wait for the simulator drops objects + for _ in range(self.num_steps_wait): + raw_obs, _, _, _ = self._env.step(get_libero_dummy_action()) + observation = self._format_raw_obs(raw_obs) + info = {"is_success": False} + return observation, info + + def step(self, action): + assert action.ndim == 1 + raw_obs, reward, done, info = self._env.step(action) + + is_success = self._env.check_success() + terminated = done or is_success + info["is_success"] = done # is_success + + observation = self._format_raw_obs(raw_obs) + if done: + self.reset() + print(self.task, self.task_id, done, is_success) + truncated = False + return observation, reward, terminated, truncated, info + + def close(self): + self._env.close() diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index f815228e7..83ccd2fb9 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -245,9 +245,8 @@ class LiberoEnv(gym.Env): def render(self): raw_obs = self._env.env._get_observations() - formatted = self._format_raw_obs(raw_obs) - # grab the "main" camera - return formatted["pixels"]["image"] + image = self._format_raw_obs(raw_obs)["pixels"]["image"] + return image def _make_envs_task(self, task_suite, task_id: int = 0): task = task_suite.get_task(task_id) @@ -277,7 +276,6 @@ class LiberoEnv(gym.Env): image = raw_obs[camera_name] image = image[::-1, ::-1] # rotate 180 degrees images[self.camera_name_mapping[camera_name]] = image - # images = image if len(images) == 1 else images state = np.concatenate( ( raw_obs["robot0_eef_pos"], @@ -311,14 +309,17 @@ class LiberoEnv(gym.Env): def step(self, action): assert action.ndim == 1 - action[-1] = 1.0 - action[-1] raw_obs, reward, done, info = self._env.step(action) + is_success = self._env.check_success() terminated = done or is_success - info["is_success"] = is_success + info["is_success"] = done # is_success + observation = self._format_raw_obs(raw_obs) + if done: + self.reset() + print(self.task, self.task_id, done, is_success) truncated = False - # note if it is unable to complete get libero error after many steps return observation, reward, terminated, truncated, info def close(self): diff --git a/src/lerobot/envs/libero2.py b/src/lerobot/envs/libero2.py new file mode 100644 index 000000000..1e794072c --- /dev/null +++ b/src/lerobot/envs/libero2.py @@ -0,0 +1,308 @@ +import math +import os +from collections import defaultdict +from itertools import chain +from typing import Any, Callable + +import gymnasium as gym +import numpy as np +import torch +from gymnasium import spaces +from libero.libero import benchmark, get_libero_path +from libero.libero.envs import OffScreenRenderEnv + + +OBS_IMAGE = "observation.image" +OBS_IMAGE_2 = "observation.image2" +def create_libero_envs( + task: str, + n_envs: int, + gym_kwargs: dict[str, Any] = None, + camera_name: str = "agentview_image,robot0_eye_in_hand_image", + init_states: bool = True, + env_cls: Callable = None, + multitask_eval: bool = True, +) -> dict[str, dict[str, Any]]: + """ + Here n_envs is per task and equal to the number of rollouts. + Returns: + dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs. + """ + if gym_kwargs is None: + gym_kwargs = {} + + if not multitask_eval: + benchmark_dict = benchmark.get_benchmark_dict() + task_suite = benchmark_dict[task]() # can also choose libero_spatial, libero_object, libero_10 etc. + tasks_id = list(range(len(task_suite.tasks))) + episode_indices = [0 for i in range(len(tasks_id))] + if len(tasks_id) == 1: + tasks_id = [tasks_id[0] for _ in range(n_envs)] + episode_indices = list(range(n_envs)) + elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0: + n_repeat = n_envs // len(tasks_id) + episode_indices = [] + for i in range(len(tasks_id)): + episode_indices.extend(list(range(n_repeat))) + tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id])) + elif n_envs < len(tasks_id): + tasks_id = tasks_id[:n_envs] + episode_indices = list(range(n_envs))[:n_envs] + print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}") + print(f"Creating Libero envs with task ids {tasks_id} from suite {task}") + assert n_envs == len( + tasks_id + ), f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}" + return env_cls( + [ + lambda i=i: LiberoEnv( + task_suite=task_suite, + task_id=tasks_id[i], + task_suite_name=task, + camera_name=camera_name, + init_states=init_states, + episode_index=episode_indices[i], + **gym_kwargs, + ) + for i in range(n_envs) + ] + ) + else: + envs = defaultdict(dict) + benchmark_dict = benchmark.get_benchmark_dict() + task = task.split(",") + for _task in task: + task_suite = benchmark_dict[ + _task + ]() # can also choose libero_spatial, libero_object, libero_10 etc. + tasks_ids = list(range(len(task_suite.tasks))) + # tasks_ids = [0] # FIXME(mshukor): debug + for tasks_id in tasks_ids: + episode_indices = list(range(n_envs)) + print( + f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}" + ) + envs_list = [ + lambda i=i: LiberoEnv( + task_suite=task_suite, + task_id=tasks_id, + task_suite_name=_task, + camera_name=camera_name, + init_states=init_states, + episode_index=episode_indices[i], + **gym_kwargs, + ) + for i in range(n_envs) + ] + envs[_task][tasks_id] = env_cls(envs_list) + return envs + + +def quat2axisangle(quat): + """ + Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 + + Converts quaternion to axis-angle format. + Returns a unit vector direction scaled by its angle in radians. + + Args: + quat (np.array): (x,y,z,w) vec4 float angles + + Returns: + np.array: (ax,ay,az) axis-angle exponential coordinates + """ + # clip quaternion + if quat[3] > 1.0: + quat[3] = 1.0 + elif quat[3] < -1.0: + quat[3] = -1.0 + + den = np.sqrt(1.0 - quat[3] * quat[3]) + if math.isclose(den, 0.0): + # This is (close to) a zero degree rotation, immediately return + return np.zeros(3) + + return (quat[:3] * 2.0 * math.acos(quat[3])) / den + + +def get_task_init_states(task_suite, i): + init_states_path = os.path.join( + get_libero_path("init_states"), + task_suite.tasks[i].problem_folder, + task_suite.tasks[i].init_states_file, + ) + init_states = torch.load(init_states_path, weights_only=False) + return init_states + + +def get_libero_dummy_action(): + """Get dummy/no-op action, used to roll out the simulation while the robot does nothing.""" + return [0, 0, 0, 0, 0, 0, -1] + + +class LiberoEnv(gym.Env): + metadata = {"render_modes": ["rgb_array"], "render_fps": 80} + + def __init__( + self, + task_suite, + task_id, + task_suite_name, + camera_name="agentview_image,robot0_eye_in_hand_image", + obs_type="pixels", + render_mode="rgb_array", + observation_width=256, + observation_height=256, + visualization_width=640, + visualization_height=480, + init_states=True, + episode_index=0, + ): + super().__init__() + self.task_id = task_id + self.obs_type = obs_type + self.render_mode = render_mode + self.observation_width = observation_width + self.observation_height = observation_height + self.visualization_width = visualization_width + self.visualization_height = visualization_height + self.init_states = init_states + self.camera_name = camera_name.split( + "," + ) # agentview_image (main) or robot0_eye_in_hand_image (wrist) + self.camera_name_mapping = { + "agentview_image": OBS_IMAGE, + "robot0_eye_in_hand_image": OBS_IMAGE_2, + } + self.num_steps_wait = ( + 10 # Do nothing for the first few timesteps to wait for the simulator drops objects + ) + self.episode_index = episode_index + + self._env = self._make_envs_task(task_suite, self.task_id) + if task_suite_name == "libero_spatial": + max_steps = 220 # longest training demo has 193 steps + elif task_suite_name == "libero_object": + max_steps = 280 # longest training demo has 254 steps + elif task_suite_name == "libero_goal": + max_steps = 300 # longest training demo has 270 steps + elif task_suite_name == "libero_10": + max_steps = 520 # longest training demo has 505 steps + elif task_suite_name == "libero_90": + max_steps = 400 # longest training demo has 373 steps + self._max_episode_steps = max_steps + + images = {} + for cam in self.camera_name: + images[self.camera_name_mapping[cam]] = spaces.Box( + low=0, + high=255, + shape=(self.observation_height, self.observation_width, 3), + dtype=np.uint8, + ) + + if self.obs_type == "state": + raise NotImplementedError() + elif self.obs_type == "pixels": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(images), + } + ) + elif self.obs_type == "pixels_agent_pos": + self.observation_space = spaces.Dict( + { + "pixels": spaces.Dict(images), + "agent_pos": spaces.Box( + low=-1000.0, + high=1000.0, + shape=(8,), + dtype=np.float64, + ), + } + ) + + self.action_space = spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32) + + def render(self): + raw_obs = self._env.env._get_observations() + image = self._format_raw_obs(raw_obs)["pixels"][OBS_IMAGE] + return image + + def _make_envs_task(self, task_suite, task_id: int = 0): + task = task_suite.get_task(task_id) + self.task = task.name + self.task_description = task.language + task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file) + + env_args = { + "bddl_file_name": task_bddl_file, + "camera_heights": self.observation_height, + "camera_widths": self.observation_width, + } + env = OffScreenRenderEnv(**env_args) + env.reset() + if self.init_states: + init_states = get_task_init_states( + task_suite, task_id + ) # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()? + init_state_id = self.episode_index # episode index + env.set_init_state(init_states[init_state_id]) + + return env + + def _format_raw_obs(self, raw_obs): + images = {} + for camera_name in self.camera_name: + image = raw_obs[camera_name] + image = image[::-1, ::-1] # rotate 180 degrees + images[self.camera_name_mapping[camera_name]] = image + # images = image if len(images) == 1 else images + state = np.concatenate( + ( + raw_obs["robot0_eef_pos"], + quat2axisangle(raw_obs["robot0_eef_quat"]), + raw_obs["robot0_gripper_qpos"], + ) + ) + agent_pos = state + if self.obs_type == "state": + raise NotImplementedError() + elif self.obs_type == "pixels": + obs = {"pixels": images.copy()} + elif self.obs_type == "pixels_agent_pos": + obs = { + "pixels": images.copy(), + "agent_pos": agent_pos, + } + return obs + + def reset(self, seed=None, **kwargs): + super().reset(seed=seed) + + self._env.seed(seed) + raw_obs = self._env.reset() + # Do nothing for the first few timesteps to wait for the simulator drops objects + for _ in range(self.num_steps_wait): + raw_obs, _, _, _ = self._env.step(get_libero_dummy_action()) + observation = self._format_raw_obs(raw_obs) + info = {"is_success": False} + return observation, info + + def step(self, action): + assert action.ndim == 1 + raw_obs, reward, done, info = self._env.step(action) + + is_success = self._env.check_success() + terminated = done or is_success + info["is_success"] = done # is_success + + observation = self._format_raw_obs(raw_obs) + if done: + self.reset() + print(self.task, self.task_id, done, is_success) + truncated = False + return observation, reward, terminated, truncated, info + + def close(self): + self._env.close() diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index 2cf9efcfe..5ae252dbe 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -80,7 +80,56 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten return return_observations +def preprocess_observation1( + observations: dict[str, np.ndarray], cfg: dict[str, Any] = None +) -> dict[str, Tensor]: + # TODO(aliberts, rcadene): refactor this to use features from the environment (no hardcoding) + """Convert environment observation to LeRobot format observation. + Args: + observation: Dictionary of observation batches from a Gym vector environment. + Returns: + Dictionary of observation batches with keys renamed to LeRobot format and values as tensors. + """ + # map to expected inputs for the policy + return_observations = {} + image_key = list(cfg.image_features.keys())[0] if cfg else "observation.image" + state_key = cfg.robot_state_feature_key if cfg else "observation.state" + if "pixels" in observations: + if isinstance(observations["pixels"], dict): + # imgs = {f"{image_key}.{key}": img for key, img in observations["pixels"].items()} + imgs = observations["pixels"] # keys should be OBS_IMAGE, OBS_IMAGE_2, OBS_IMAGE_3 + else: + imgs = {f"{image_key}": observations["pixels"]} + for imgkey, img in imgs.items(): + # TODO(aliberts, rcadene): use transforms.ToTensor()? + img = torch.from_numpy(img) + + # sanity check that images are channel last + _, h, w, c = img.shape + assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}" + + # sanity check that images are uint8 + assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}" + + # convert to channel first of type float32 in range [0,1] + img = einops.rearrange(img, "b h w c -> b c h w").contiguous() + img = img.type(torch.float32) + img /= 255 + + return_observations[imgkey] = img + + if "environment_state" in observations: + return_observations["observation.environment_state"] = torch.from_numpy( + observations["environment_state"] + ).float() + + # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing + # requirement for "agent_pos" + return_observations[state_key] = torch.from_numpy(observations["agent_pos"]).float() + if "task" in observations: + return_observations["task"] = observations["task"] + return return_observations def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]: # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is # (need to also refactor preprocess_observation and externalize normalization from policies) diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 79461d3a9..b7d92d988 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -177,6 +177,6 @@ def make_policy( policy = policy_cls(**kwargs) policy.to(cfg.device) assert isinstance(policy, nn.Module) + breakpoint() # policy = torch.compile(policy, mode="reduce-overhead") - return policy diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py index fa2d3d5a7..9a128f7b6 100644 --- a/src/lerobot/policies/smolpi0/modeling_smolpi0.py +++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py @@ -51,7 +51,9 @@ policy = Pi0Policy.from_pretrained("lerobot/pi0") import math from collections import deque - +import os +import re +import safetensors import torch import torch.nn.functional as F # noqa: N812 from torch import Tensor, nn @@ -169,7 +171,72 @@ def resize_with_pad(img, width, height, pad_value=-1): padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) return padded_img +_VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") +def canonicalise(k: str) -> str: + """ + Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a + normalisation-buffer key. + """ + return _VARIANT_RE.sub(".buffer_", k) +def standardise_state_dict( + checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True +) -> tuple[dict[str, torch.Tensor], list[str]]: + """ + β€’ Re-keys `checkpoint ` so that every entry matches the *reference* key set. + β€’ If several variant keys collapse to the same canonical name we keep the + first one and log the collision. + β€’ Returns the new dict + a list of entries that could not be matched. + """ + out, collisions, unmatched = {}, {}, [] + + for k, v in checkpoint.items(): + canon = canonicalise(k) + if canon in ref_keys: + if canon in out: # duplicate after collapsing + collisions.setdefault(canon, []).append(k) + else: + out[canon] = v + else: + unmatched.append(k) + + if verbose: + for canon, variants in collisions.items(): + print(f"[standardise_state_dict] '{canon}' ← {variants}") + if unmatched: + print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys") + + out.update({k: checkpoint[k] for k in unmatched}) + return out, unmatched + +def load_smolvla( + model: torch.nn.Module, + filename: str | os.PathLike, + *, + device: str = "cpu", + checkpoint_keys_mapping: str = "", +) -> torch.nn.Module: + state_dict = safetensors.torch.load_file(filename, device=device) + + # Optional user-supplied renames (e.g. "model._orig_mod.//model.") + if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping: + state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping) + + state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys())) + + # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset + norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs") + state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)} + + missing, unexpected = model.load_state_dict(state_dict, strict=False) + if not all(key.startswith(norm_keys) for key in missing) or unexpected: + raise RuntimeError( + "SmolVLA %d missing / %d unexpected keys", + len(missing), + len(unexpected), + ) + + return model def pad_vector(vector, new_dim): """Can be (batch_size x sequence_length x features_dimension) or (batch_size x features_dimension) @@ -219,7 +286,27 @@ def aloha_gripper_to_angular(value): # The values 0.4 and 1.5 were measured on an actual Trossen robot. return normalize(value, min_val=0.4, max_val=1.5) +def rename_checkpoint_keys(checkpoint: dict, rename_str: str): + """ + Renames keys in a checkpoint dictionary based on the given rename string. + Args: + checkpoint (dict): The checkpoint dictionary. + rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2". + + Returns: + dict: The modified checkpoint with renamed keys. + """ + + rename_dict = dict(pair.split("//") for pair in rename_str.split(",")) + + new_checkpoint = {} + for k, v in checkpoint.items(): + for old_key, new_key in rename_dict.items(): + if old_key in k: + k = k.replace(old_key, new_key) + new_checkpoint[k] = v + return new_checkpoint def aloha_gripper_from_angular(value): # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha. # Note that the units are still angular but the range is different. @@ -333,7 +420,7 @@ class SMOLPI0Policy(PreTrainedPolicy): self.model.vlm_with_expert.merge_lora_weights() @torch.no_grad - def select_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: + def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: """Select a single action given environment observations. This method wraps `select_actions` in order to return one action at a time for execution in the @@ -364,7 +451,24 @@ class SMOLPI0Policy(PreTrainedPolicy): actions = self._pi_aloha_encode_actions(actions) return actions - + + # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues + @classmethod + def _load_as_safetensor( + cls, + model: "SmolVLAPolicy", + model_file: str, + map_location: str, + strict: bool, + **kwargs, + ): + safetensors.torch.load_model(model, model_file, strict=strict, device=map_location) + return load_smolvla( + model, + model_file, + device=map_location, + checkpoint_keys_mapping="model._orig_mod.//model.", + ) @torch.no_grad def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: """Select a single action given environment observations. diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 9b7e3520a..1c07d98e5 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -1027,7 +1027,7 @@ from lerobot.policies.utils import ( populate_queues, ) from lerobot.utils.utils import get_safe_dtype - +# OBS_STATE = 'state' # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") @@ -1347,6 +1347,7 @@ class SmolVLAPolicy(PreTrainedPolicy): # Unpad actions original_action_dim = self.config.action_feature.shape[0] + original_action_dim = 7 actions = actions[:, :, :original_action_dim] actions = self.unnormalize_outputs({ACTION: actions})[ACTION] diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 92a3bf833..6fbae645b 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -69,7 +69,7 @@ from tqdm import trange from lerobot.configs import parser from lerobot.configs.eval import EvalPipelineConfig from lerobot.envs.factory import make_env -from lerobot.envs.utils import add_envs_task, check_env_attributes_and_types, preprocess_observation +from lerobot.envs.utils import add_envs_task, check_env_attributes_and_types, preprocess_observation, preprocess_observation1 from lerobot.policies.factory import make_policy from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.policies.utils import get_device_from_parameters @@ -125,6 +125,10 @@ def rollout( # Reset the policy and environments. policy.reset() + # added by jade + # for k in list(policy.config.input_features.keys()): + # if k.startswith("observation.image"): + # policy.config.input_features["observation.images." + k.split("observation.", 1)[1]] = policy.config.input_features.pop(k) observation, info = env.reset(seed=seeds) if render_callback is not None: render_callback(env) @@ -149,6 +153,7 @@ def rollout( while not np.all(done) and step < max_steps: # Numpy array to tensor and changing dictionary keys to LeRobot policy format. observation = preprocess_observation(observation) + # observation = preprocess_observation1(observation) if return_observations: all_observations.append(deepcopy(observation)) @@ -159,6 +164,26 @@ def rollout( # Infer "task" from attributes of environments. # TODO: works with SyncVectorEnv but not AsyncVectorEnv observation = add_envs_task(env, observation) + # breakpoint() + # observation = { + # k.replace("observation.images.", "observation.") if k.startswith("observation.images.") else k: v + # for k, v in observation.items() + # # } + # if "observation.image" in observation: + # observation["image"] = observation.pop("observation.image").to( + # device, non_blocking=device.type == "cuda" + # ) + + # if "observation.image2" in observation: + # observation["wrist_image"] = observation.pop("observation.image2").to( + # device, non_blocking=device.type == "cuda" + # ) + + # if "observation.state" in observation: + # observation["state"] = observation.pop("observation.state").to( + # device, non_blocking=device.type == "cuda" + # ) + with torch.inference_mode(): action = policy.select_action(observation) # Convert to CPU / numpy. @@ -489,12 +514,11 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData print("Normalization layers recreated with dataset stats.") -def load_smolvla(cfg, dataset_repo: str): +def load_smolvla(cfg, dataset_repo: str, policy): from lerobot.datasets.lerobot_dataset import LeRobotDataset dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/') - policy = make_policy(cfg=cfg, ds_meta=dataset.meta) _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta) # only needed if stats are missing - return policy, dataset + return policy.to("cuda"), dataset @parser.wrap() @@ -505,7 +529,7 @@ def eval_main(cfg: EvalPipelineConfig): device = get_safe_torch_device(cfg.policy.device, log=True) #login to hf from huggingface_hub import login - login() + # login() torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True set_seed(cfg.seed) @@ -520,9 +544,10 @@ def eval_main(cfg: EvalPipelineConfig): cfg=cfg.policy, env_cfg=cfg.env, ) - # breakpoint() - load_smolvla(cfg.policy, "physical-intelligence/libero") - # breakpoint() + breakpoint() + # policy, _ = load_smolvla(cfg.policy, "physical-intelligence/libero", policy) + # rename "image" -> "observation.image" + policy.eval() with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): if cfg.env.multitask_eval: From 96cc634a661283f780f27326c406361c72da7655 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Thu, 11 Sep 2025 12:21:21 +0200 Subject: [PATCH 04/12] add new changes --- examples/6_evaluate_libero.sh | 10 +- examples/8_train_smolvla_must.sh | 10 +- src/lerobot/envs/factory.py | 59 +++--- src/lerobot/envs/libero.py | 351 ++++++++++++++++++++++--------- src/lerobot/envs/utils.py | 44 ++++ src/lerobot/policies/factory.py | 1 - src/lerobot/scripts/eval.py | 261 ++++++++++++----------- src/lerobot/scripts/train.py | 73 +++---- 8 files changed, 505 insertions(+), 304 deletions(-) diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh index c15d71c95..46355dfa1 100644 --- a/examples/6_evaluate_libero.sh +++ b/examples/6_evaluate_libero.sh @@ -18,11 +18,11 @@ export CUDA_VISIBLE_DEVICES=2 # CONFIGURATION POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" -POLICY_PATH="/raid/jade/logs/lerobot/lerobot_new_HuggingfaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" -TASK=libero_spatial +POLICY_PATH="/raid/jade/models/smolvlamust" +TASK=libero_spatial,libero_object ENV_TYPE="libero" -BATCH_SIZE=10 -N_EPISODES=10 +BATCH_SIZE=1 +N_EPISODES=1 # storage / caches RAID=/raid/jade N_ACTION_STEPS=1 @@ -46,7 +46,7 @@ python src/lerobot/scripts/eval.py \ --env.type="$ENV_TYPE" \ --eval.batch_size="$BATCH_SIZE" \ --eval.n_episodes="$N_EPISODES" \ - --env.multitask_eval=False \ + --env.multitask_eval=True \ --env.task=$TASK \ # python examples/evaluate_libero.py \ # --policy_path "$POLICY_PATH" \ diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh index 98029f3ad..828627d85 100644 --- a/examples/8_train_smolvla_must.sh +++ b/examples/8_train_smolvla_must.sh @@ -56,14 +56,16 @@ EXPERT_WIDTH_MULTIPLIER=0.5 # number of gpus to use NUM_PROCESSES=2 NUM_VLM_LAYERS=0 -SELF_ATTN_EVERY_N_LAYERS=2 +SELF_ATTN_EVERY_N_LAYERS=0 CHUNK_SIZE=50 -export CUDA_VISIBLE_DEVICES=0 +export CUDA_VISIBLE_DEVICES=1 PORT=29522 PREFIX_LENGTH=0 LOAD_VLM_WEIGHTS=true +MAX_ACTION_DIM=32 +MAX_STATE_DIM=32 # naming/output dir -TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} +TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_sep11_v2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} echo "Training dir: $TRAIN_DIR" rm -rf "$TRAIN_DIR" @@ -137,5 +139,7 @@ python src/lerobot/scripts/train.py \ --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ + --policy.max_action_dim=$MAX_ACTION_DIM \ + --policy.max_state_dim=$MAX_STATE_DIM \ --seed=$SEED \ --wandb.enable=false diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py index 211b41714..d38b2eed3 100644 --- a/src/lerobot/envs/factory.py +++ b/src/lerobot/envs/factory.py @@ -56,36 +56,41 @@ def make_env( names to indexed vectorized environments (when multitask eval is used). """ - if n_envs < 1: - raise ValueError("`n_envs` must be at least 1") + if n_envs < 1: + raise ValueError("`n_envs` must be at least 1") - env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv + env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv + + + if "libero" in cfg.type: + from lerobot.envs.libero import create_libero_envs + return create_libero_envs( + task=cfg.task, + n_envs=n_envs, + camera_name=cfg.camera_name, + init_states=cfg.init_states, + gym_kwargs=cfg.gym_kwargs, + env_cls=env_cls, + multitask_eval=cfg.multitask_eval, + ) - if "libero" in cfg.type: - from lerobot.envs.libero import create_libero_envs - return create_libero_envs( - task=cfg.task, - n_envs=n_envs, - camera_name=cfg.camera_name, - init_states=cfg.init_states, - gym_kwargs=cfg.gym_kwargs, - env_cls=env_cls, - multitask_eval=cfg.multitask_eval, - ) + package_name = f"gym_{cfg.type}" + try: + importlib.import_module(package_name) + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\"" + ) from e - - package_name = f"gym_{cfg.type}" - try: - importlib.import_module(package_name) - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\"" - ) from e + gym_handle = f"{package_name}/{cfg.task}" + + def _make_one(): + return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {})) - gym_handle = f"{package_name}/{cfg.task}" - - def _make_one(): - return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {})) + vec = env_cls([_make_one for _ in range(n_envs)]) + + # normalize to {suite: {task_id: vec_env}} for consistency + suite_name = cfg.type # e.g., "pusht", "aloha" + return {suite_name: {0: vec}} - return env_cls([_make_one for _ in range(n_envs)]) diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index 83ccd2fb9..ff1574416 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import math import os from collections import defaultdict @@ -12,100 +14,153 @@ from gymnasium import spaces from libero.libero import benchmark, get_libero_path from libero.libero.envs import OffScreenRenderEnv +import logging +from collections import defaultdict +from typing import Any, Callable, Dict, Iterable, List, Mapping, Sequence + + +logger = logging.getLogger(__name__) + +# ---- Helpers ----------------------------------------------------------------- + +def _parse_camera_names(camera_name: str | Sequence[str]) -> List[str]: + """Normalize camera_name into a non-empty list of strings.""" + if isinstance(camera_name, str): + cams = [c.strip() for c in camera_name.split(",") if c.strip()] + elif isinstance(camera_name, (list, tuple)): + cams = [str(c).strip() for c in camera_name if str(c).strip()] + else: + raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}") + if not cams: + raise ValueError("camera_name resolved to an empty list.") + return cams + + +def _get_suite(name: str): + """Instantiate a LIBERO suite by name with clear validation.""" + bench = benchmark.get_benchmark_dict() + if name not in bench: + raise ValueError(f"Unknown LIBERO suite '{name}'. Available: {', '.join(sorted(bench.keys()))}") + suite = bench[name]() + if not getattr(suite, "tasks", None): + raise ValueError(f"Suite '{name}' has no tasks.") + return suite + + +def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> List[int]: + """Validate/normalize task ids. If None β†’ all tasks.""" + if task_ids is None: + return list(range(total_tasks)) + ids = sorted(set(int(t) for t in task_ids)) + for t in ids: + if t < 0 or t >= total_tasks: + raise ValueError(f"task_id {t} out of range [0, {total_tasks-1}].") + return ids + + +def _make_env_fns( + *, + suite, + suite_name: str, + task_id: int, + n_envs: int, + camera_names: List[str], + init_states: bool, + gym_kwargs: Mapping[str, Any], + LiberoEnv: type, # injected to avoid forward ref issues if needed +) -> List[Callable[[], "LiberoEnv"]]: + """Build n_envs factory callables for a single (suite, task_id).""" + joined_cams = ",".join(camera_names) # keep backward-compat: downstream expects a string + fns: List[Callable[[], "LiberoEnv"]] = [] + for i in range(n_envs): + def _mk(i=i, suite=suite, task_id=task_id, suite_name=suite_name, joined_cams=joined_cams, init_states=init_states, gym_kwargs=dict(gym_kwargs)): + return LiberoEnv( + task_suite=suite, + task_id=task_id, + task_suite_name=suite_name, + camera_name=joined_cams, + init_states=init_states, + episode_index=i, + **gym_kwargs, + ) + fns.append(_mk) + return fns + +# ---- Main API ---------------------------------------------------------------- def create_libero_envs( task: str, n_envs: int, - gym_kwargs: dict[str, Any] = None, - camera_name: str = "agentview_image,robot0_eye_in_hand_image", + gym_kwargs: dict[str, Any] | None = None, + camera_name: str | Sequence[str] = "agentview_image,robot0_eye_in_hand_image", init_states: bool = True, - env_cls: Callable = None, - multitask_eval: bool = True, -) -> dict[str, dict[str, Any]]: + env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None, + multitask_eval: bool = True, # kept for signature compatibility; return type is consistent regardless +) -> dict[str, dict[int, Any]]: """ - Here n_envs is per task and equal to the number of rollouts. + Create vectorized LIBERO environments with a consistent return shape. + Returns: - dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs. + dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories) + Notes: + - n_envs is the number of rollouts *per task* (episode_index = 0..n_envs-1). + - `task` can be a single suite or a comma-separated list of suites. + - You may pass `task_ids` (list[int]) inside `gym_kwargs` to restrict tasks per suite. """ - print("num envs", n_envs) - print("multitask_eval", multitask_eval) - print("gym_kwargs", gym_kwargs) - if gym_kwargs is None: - gym_kwargs = {} + if env_cls is None or not callable(env_cls): + raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.") + if not isinstance(n_envs, int) or n_envs <= 0: + raise ValueError(f"n_envs must be a positive int; got {n_envs}.") - if not multitask_eval: - benchmark_dict = benchmark.get_benchmark_dict() - task_suite = benchmark_dict[task]() # can also choose libero_spatial, libero_object, libero_10 etc. - tasks_id = list(range(len(task_suite.tasks))) - episode_indices = [0 for i in range(len(tasks_id))] - if len(tasks_id) == 1: - tasks_id = [tasks_id[0] for _ in range(n_envs)] - episode_indices = list(range(n_envs)) - elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0: - n_repeat = n_envs // len(tasks_id) - print("n_repeat", n_repeat) - episode_indices = [] - for _ in range(len(tasks_id)): - episode_indices.extend(list(range(n_repeat))) - tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id])) - elif n_envs < len(tasks_id): - tasks_id = tasks_id[:n_envs] - episode_indices = list(range(n_envs))[:n_envs] - print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}") - print(f"Creating Libero envs with task ids {tasks_id} from suite {task}") - assert n_envs == len(tasks_id), ( - f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}" - ) - return env_cls( - [ - lambda i=i: LiberoEnv( - task_suite=task_suite, - task_id=tasks_id[i], - task_suite_name=task, - camera_name=camera_name, - init_states=init_states, - episode_index=episode_indices[i], - **gym_kwargs, - ) - for i in range(n_envs) - ] - ) - else: - envs = defaultdict(dict) - benchmark_dict = benchmark.get_benchmark_dict() - task = task.split(",") - for _task in task: - task_suite = benchmark_dict[ - _task - ]() # can also choose libero_spatial, libero_object, libero_10 etc. - tasks_ids = list(range(len(task_suite.tasks))) - for tasks_id in tasks_ids: - episode_indices = list(range(n_envs)) - print( - f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}" - ) - envs_list = [ - ( - lambda i=i, - task_suite=task_suite, - tasks_id=tasks_id, - _task=_task, - episode_indices=episode_indices: LiberoEnv( - task_suite=task_suite, - task_id=tasks_id, - task_suite_name=_task, - camera_name=camera_name, - init_states=init_states, - episode_index=episode_indices[i], - **gym_kwargs, - ) - ) - for i in range(n_envs) - ] - envs[_task][tasks_id] = env_cls(envs_list) - return envs + gym_kwargs = dict(gym_kwargs or {}) + task_ids_filter = gym_kwargs.pop("task_ids", None) # optional: limit to specific tasks + # Avoid circular import/type issues: assume LiberoEnv is defined in this module + try: + LiberoEnv # type: ignore[name-defined] + except NameError: + # If LiberoEnv is in the same file, this won't run. If it's elsewhere, import here. + exit() + # from .libero_env import LiberoEnv # adjust if your class lives in another module + camera_names = _parse_camera_names(camera_name) + suite_names = [s.strip() for s in str(task).split(",") if s.strip()] + if not suite_names: + raise ValueError("`task` must contain at least one LIBERO suite name.") + + logger.info( + "Creating LIBERO envs | suites=%s | n_envs(per task)=%d | init_states=%s | multitask_eval=%s", + suite_names, n_envs, init_states, bool(multitask_eval) + ) + if task_ids_filter is not None: + logger.info("Restricting to task_ids=%s", task_ids_filter) + + out: Dict[str, Dict[int, Any]] = defaultdict(dict) + + for suite_name in suite_names: + suite = _get_suite(suite_name) + total = len(suite.tasks) + selected = _select_task_ids(total, task_ids_filter) + + if not selected: + raise ValueError(f"No tasks selected for suite '{suite_name}' (available: {total}).") + + for tid in selected: + fns = _make_env_fns( + suite=suite, + suite_name=suite_name, + task_id=tid, + n_envs=n_envs, + camera_names=camera_names, + init_states=init_states, + gym_kwargs=gym_kwargs, + LiberoEnv=LiberoEnv, + ) + out[suite_name][tid] = env_cls(fns) + logger.debug("Built vec env | suite=%s | task_id=%d | n_envs=%d", suite_name, tid, n_envs) + + # return plain dicts for predictability + return {suite: dict(task_map) for suite, task_map in out.items()} def quat2axisangle(quat): """ Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 @@ -199,17 +254,15 @@ class LiberoEnv(gym.Env): self.episode_index = episode_index self._env = self._make_envs_task(task_suite, self.task_id) - if task_suite_name == "libero_spatial": - max_steps = 220 # longest training demo has 193 steps - elif task_suite_name == "libero_object": - max_steps = 280 # longest training demo has 254 steps - elif task_suite_name == "libero_goal": - max_steps = 300 # longest training demo has 270 steps - elif task_suite_name == "libero_10": - max_steps = 520 # longest training demo has 505 steps - elif task_suite_name == "libero_90": - max_steps = 400 # longest training demo has 373 steps - self._max_episode_steps = max_steps + TASK_SUITE_MAX_STEPS: dict[str, int] = { + "libero_spatial": 220, # longest training demo has 193 steps + "libero_object": 280, # longest training demo has 254 steps + "libero_goal": 300, # longest training demo has 270 steps + "libero_10": 520, # longest training demo has 505 steps + "libero_90": 400, # longest training demo has 373 steps + } + default_steps = 500 + self._max_episode_steps = TASK_SUITE_MAX_STEPS.get(task_suite_name, default_steps) images = {} for cam in self.camera_name: @@ -221,7 +274,11 @@ class LiberoEnv(gym.Env): ) if self.obs_type == "state": - raise NotImplementedError() + raise NotImplementedError( + "The 'state' observation type is not supported in LiberoEnv. " + "Please switch to an image-based obs_type (e.g. 'pixels', 'pixels_agent_pos')." + ) + elif self.obs_type == "pixels": self.observation_space = spaces.Dict( { @@ -285,7 +342,10 @@ class LiberoEnv(gym.Env): ) agent_pos = state if self.obs_type == "state": - raise NotImplementedError() + raise NotImplementedError( + "The 'state' observation type is not supported in LiberoEnv. " + "Please switch to an image-based obs_type (e.g. 'pixels', 'pixels_agent_pos')." + ) elif self.obs_type == "pixels": obs = {"pixels": images.copy()} elif self.obs_type == "pixels_agent_pos": @@ -308,7 +368,11 @@ class LiberoEnv(gym.Env): return observation, info def step(self, action): - assert action.ndim == 1 + if action.ndim != 1: + raise ValueError( + f"Expected action to be 1-D (shape (action_dim,)), " + f"but got shape {action.shape} with ndim={action.ndim}" + ) raw_obs, reward, done, info = self._env.step(action) is_success = self._env.check_success() @@ -324,3 +388,96 @@ class LiberoEnv(gym.Env): def close(self): self._env.close() + + +def create_libero_envs1( + task: str, + n_envs: int, + gym_kwargs: dict[str, Any] = None, + camera_name: str = "agentview_image,robot0_eye_in_hand_image", + init_states: bool = True, + env_cls: Callable = None, + multitask_eval: bool = True, +) -> dict[str, dict[str, Any]]: + """ + Here n_envs is per task and equal to the number of rollouts. + Returns: + dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs. + """ + print("num envs", n_envs) + print("multitask_eval", multitask_eval) + print("gym_kwargs", gym_kwargs) + if gym_kwargs is None: + gym_kwargs = {} + + if not multitask_eval: + benchmark_dict = benchmark.get_benchmark_dict() + task_suite = benchmark_dict[task]() # can also choose libero_spatial, libero_object, libero_10 etc. + tasks_id = list(range(len(task_suite.tasks))) + episode_indices = [0 for i in range(len(tasks_id))] + if len(tasks_id) == 1: + tasks_id = [tasks_id[0] for _ in range(n_envs)] + episode_indices = list(range(n_envs)) + elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0: + n_repeat = n_envs // len(tasks_id) + print("n_repeat", n_repeat) + episode_indices = [] + for _ in range(len(tasks_id)): + episode_indices.extend(list(range(n_repeat))) + tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id])) + elif n_envs < len(tasks_id): + tasks_id = tasks_id[:n_envs] + episode_indices = list(range(n_envs))[:n_envs] + print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}") + print(f"Creating Libero envs with task ids {tasks_id} from suite {task}") + assert n_envs == len(tasks_id), ( + f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}" + ) + return env_cls( + [ + lambda i=i: LiberoEnv( + task_suite=task_suite, + task_id=tasks_id[i], + task_suite_name=task, + camera_name=camera_name, + init_states=init_states, + episode_index=episode_indices[i], + **gym_kwargs, + ) + for i in range(n_envs) + ] + ) + else: + envs = defaultdict(dict) + benchmark_dict = benchmark.get_benchmark_dict() + task = task.split(",") + for _task in task: + task_suite = benchmark_dict[ + _task + ]() # can also choose libero_spatial, libero_object, libero_10 etc. + tasks_ids = list(range(len(task_suite.tasks))) + for tasks_id in tasks_ids: + episode_indices = list(range(n_envs)) + print( + f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}" + ) + envs_list = [ + ( + lambda i=i, + task_suite=task_suite, + tasks_id=tasks_id, + _task=_task, + episode_indices=episode_indices: LiberoEnv( + task_suite=task_suite, + task_id=tasks_id, + task_suite_name=_task, + camera_name=camera_name, + init_states=init_states, + episode_index=episode_indices[i], + **gym_kwargs, + ) + ) + for i in range(n_envs) + ] + envs[_task][tasks_id] = env_cls(envs_list) + return envs diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index 5ae252dbe..9490f670e 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -182,3 +182,47 @@ def add_envs_task(env: gym.vector.VectorEnv, observation: dict[str, Any]) -> dic num_envs = observation[list(observation.keys())[0]].shape[0] observation["task"] = ["" for _ in range(num_envs)] return observation + +def _close_single_env(env: Any) -> None: + """Try to close a single env object if it exposes .close().""" + try: + close_fn = getattr(env, "close", None) + if callable(close_fn): + close_fn() + except Exception as exc: + # Best-effort close: log but don't raise + LOG.debug("Exception while closing env %s: %s", env, exc) + +def close_envs(env_or_collection: Any) -> None: + """ + Close a single env or any nested structure of envs. + + Accepts: + - a single env with .close() + - a Mapping of things (e.g. dict) + - a Sequence of things (list/tuple) but NOT str/bytes + - nested combinations of the above + + This is intentionally permissive and best-effort: it will swallow exceptions + encountered while closing individual envs and continue. + """ + # Guard: single object with close() + if hasattr(env_or_collection, "close") and not isinstance(env_or_collection, (Mapping, Sequence)): + _close_single_env(env_or_collection) + return + + # Mapping (e.g., {suite: {task_id: vec_env}}) + if isinstance(env_or_collection, Mapping): + for v in env_or_collection.values(): + close_envs(v) + return + + # Sequence (list/tuple) but skip str/bytes + if isinstance(env_or_collection, Sequence) and not isinstance(env_or_collection, (str, bytes)): + for v in env_or_collection: + close_envs(v) + return + + # Fallback: try to close if possible + if hasattr(env_or_collection, "close"): + _close_single_env(env_or_collection) \ No newline at end of file diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index b7d92d988..03fa44a2a 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -177,6 +177,5 @@ def make_policy( policy = policy_cls(**kwargs) policy.to(cfg.device) assert isinstance(policy, nn.Module) - breakpoint() # policy = torch.compile(policy, mode="reduce-overhead") return policy diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 6fbae645b..795ed2b3c 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -45,7 +45,6 @@ Note that in both examples, the repo/folder should contain at least `config.json You can learn about the CLI options for this script in the `EvalPipelineConfig` in lerobot/configs/eval.py """ - import concurrent import json import logging @@ -80,6 +79,9 @@ from lerobot.utils.utils import ( init_logging, inside_slurm, ) +from typing import TypedDict, Dict, List, Tuple, Iterator +from collections import defaultdict +import concurrent.futures as cf def rollout( @@ -537,8 +539,8 @@ def eval_main(cfg: EvalPipelineConfig): logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") logging.info("Making environment.") - env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) - + envs = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) + breakpoint() logging.info("Making policy.") policy = make_policy( cfg=cfg.policy, @@ -550,41 +552,29 @@ def eval_main(cfg: EvalPipelineConfig): policy.eval() with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): - if cfg.env.multitask_eval: - info = eval_policy_multitask( - env, - policy, - cfg.eval.n_episodes, - max_episodes_rendered=10, - videos_dir=Path(cfg.output_dir) / "videos", - start_seed=cfg.seed, - max_parallel_tasks=cfg.env.max_parallel_tasks, - verbose=False, - ) - print("Overall Aggregated Metrics:") - print(info["overall"]["aggregated"]) - - # Print per-suite stats - for task_group, task_group_info in info.items(): - if task_group == "overall": - continue # Skip the overall stats since we already printed it - print(f"\nAggregated Metrics for {task_group}:") - print(task_group_info["aggregated"]) - for _task_group, v in env.items(): - for _env in v.values(): - _env.close() - else: - info = eval_policy( - env, - policy, - cfg.eval.n_episodes, - max_episodes_rendered=10, - videos_dir=Path(cfg.output_dir) / "videos", - start_seed=cfg.seed, - ) - print(info["aggregated"]) - env.close() + info = eval_policy_all( + envs, + policy, + cfg.eval.n_episodes, + max_episodes_rendered=10, + videos_dir=Path(cfg.output_dir) / "videos", + start_seed=cfg.seed, + max_parallel_tasks=cfg.env.max_parallel_tasks, + verbose=False, + ) + print("Overall Aggregated Metrics:") + print(info["overall"]["aggregated"]) + # Print per-suite stats + for task_group, task_group_info in info.items(): + if task_group == "overall": + continue # Skip the overall stats since we already printed it + print(f"\nAggregated Metrics for {task_group}:") + print(task_group_info["aggregated"]) + # Close all vec envs + for _suite, task_map in envs.items(): + for _vec in task_map.values(): + _vec.close() # Save info with open(Path(cfg.output_dir) / "eval_info.json", "w") as f: json.dump(info, f, indent=2) @@ -592,9 +582,19 @@ def eval_main(cfg: EvalPipelineConfig): logging.info("End of eval") -def eval_policy_multitask( - envs: dict[str, dict[str, gym.vector.VectorEnv]], - policy, +# ---- typed payload returned by one task eval ---- +class TaskMetrics(TypedDict): + sum_rewards: List[float] + max_rewards: List[float] + successes: List[bool] + video_paths: List[str] + +ACC_KEYS = ("sum_rewards", "max_rewards", "successes", "video_paths") + + +def eval_policy_all( + envs: dict[str, dict[int, gym.vector.VectorEnv]], + policy: PreTrainedPolicy, n_episodes: int, max_episodes_rendered: int = 0, videos_dir: Path | None = None, @@ -603,129 +603,136 @@ def eval_policy_multitask( max_parallel_tasks: int = 5, verbose: bool = True, ) -> dict: + """ + Evaluate a policy over a dict-of-dicts of vectorized envs: + envs[suite_name][task_id] -> gym.vector.VectorEnv + Returns a dict with per-suite aggregates and an 'overall' block. + """ global_start = time.time() - results = {} - overall_rewards, overall_max_rewards, overall_successes = [], [], [] - overall_video_paths = [] - overall_episode_data = None + # inner: evaluate a single (suite, task) + def eval_one( + task_group: str, + task_id: int, + env: gym.vector.VectorEnv, + *, + policy: PreTrainedPolicy, + n_episodes: int, + max_episodes_rendered: int, + videos_dir: Path | None, + return_episode_data: bool, + start_seed: int | None, + ) -> TaskMetrics: + """Evaluates one task_id of one suite using the provided vec env.""" + if verbose: + print(f"Evaluating: task_group={task_group}, task_id={task_id} ...") - def eval_task(task_group, task_id, env): - """Evaluates a single task in parallel.""" - print(f"Evaluating: task_group: {task_group}, task_id: {task_id} ...") + task_videos_dir = None if videos_dir is not None: task_videos_dir = videos_dir / f"{task_group}_{task_id}" task_videos_dir.mkdir(parents=True, exist_ok=True) + task_result = eval_policy( - env, - policy, - n_episodes, - max_episodes_rendered, - task_videos_dir, - return_episode_data, - start_seed, + env=env, + policy=policy, + n_episodes=n_episodes, + max_episodes_rendered=max_episodes_rendered, + videos_dir=task_videos_dir, + return_episode_data=return_episode_data, + start_seed=start_seed, ) per_episode = task_result["per_episode"] - return { - "task_group": task_group, - "task_id": task_id, - "sum_rewards": [ep["sum_reward"] for ep in per_episode], - "max_rewards": [ep["max_reward"] for ep in per_episode], - "successes": [ep["success"] for ep in per_episode], - "video_paths": task_result.get("video_paths", []), - } + return TaskMetrics( + sum_rewards=[ep["sum_reward"] for ep in per_episode], + max_rewards=[ep["max_reward"] for ep in per_episode], + successes=[ep["success"] for ep in per_episode], + video_paths=task_result.get("video_paths", []), + ) - task_group_results = {} - if max_parallel_tasks == 1: - # sequential mode (safe for colab / EGL) - for task_group, tasks in envs.items(): - for task_id, env in tasks.items(): - task_result = eval_task(task_group, task_id, env) - if task_group not in task_group_results: - task_group_results[task_group] = { - "sum_rewards": [], - "max_rewards": [], - "successes": [], - "video_paths": [], - } - task_group_results[task_group]["sum_rewards"].extend(task_result["sum_rewards"]) - task_group_results[task_group]["max_rewards"].extend(task_result["max_rewards"]) - task_group_results[task_group]["successes"].extend(task_result["successes"]) - task_group_results[task_group]["video_paths"].extend(task_result["video_paths"]) - else: - with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor: - future_to_task = { - executor.submit(eval_task, task_group, task_id, env): (task_group, task_id) - for task_group, tasks in envs.items() - for task_id, env in tasks.items() - } + # result producer: sequential or threaded, same consumer + def iter_task_results() -> Iterator[Tuple[str, int, TaskMetrics]]: + if max_parallel_tasks == 1: + for task_group, tasks in envs.items(): + for task_id, vec in tasks.items(): + yield task_group, task_id, eval_one( + task_group, task_id, vec, + policy=policy, + n_episodes=n_episodes, + max_episodes_rendered=max_episodes_rendered, + videos_dir=videos_dir, + return_episode_data=return_episode_data, + start_seed=start_seed, + ) + else: + with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor: + fut2key: Dict[cf.Future, Tuple[str, int]] = {} + for task_group, tasks in envs.items(): + for task_id, vec in tasks.items(): + fut = executor.submit( + eval_one, task_group, task_id, vec, + policy=policy, + n_episodes=n_episodes, + max_episodes_rendered=max_episodes_rendered, + videos_dir=videos_dir, + return_episode_data=return_episode_data, + start_seed=start_seed, + ) + fut2key[fut] = (task_group, task_id) + for fut in cf.as_completed(fut2key): + task_group, task_id = fut2key[fut] + yield task_group, task_id, fut.result() - task_group_results = {} + # single accumulator path on the main thread + group_acc: Dict[str, Dict[str, List]] = defaultdict(lambda: {k: [] for k in ACC_KEYS}) + overall: Dict[str, List] = {k: [] for k in ACC_KEYS} - for future in concurrent.futures.as_completed(future_to_task): - task_result = future.result() - task_group = task_result["task_group"] + for task_group, task_id, metrics in iter_task_results(): + acc = group_acc[task_group] + for k in ACC_KEYS: + acc[k].extend(metrics[k]) + overall[k].extend(metrics[k]) - if task_group not in task_group_results: - task_group_results[task_group] = { - "sum_rewards": [], - "max_rewards": [], - "successes": [], - "video_paths": [], - } - - task_group_results[task_group]["sum_rewards"].extend(task_result["sum_rewards"]) - task_group_results[task_group]["max_rewards"].extend(task_result["max_rewards"]) - task_group_results[task_group]["successes"].extend(task_result["successes"]) - task_group_results[task_group]["video_paths"].extend(task_result["video_paths"]) - - # Process results per task group - for task_group, data in task_group_results.items(): + # build outputs + results: Dict[str, dict] = {} + for task_group, data in group_acc.items(): suite_rewards = data["sum_rewards"] - suite_max_rewards = data["max_rewards"] - suite_successes = data["successes"] - suite_video_paths = data["video_paths"] + suite_max = data["max_rewards"] + suite_succ = data["successes"] + suite_vids = data["video_paths"] suite_eval_s = time.time() - global_start suite_eval_ep_s = suite_eval_s / max(1, len(suite_rewards)) results[task_group] = { "aggregated": { - "avg_sum_reward": float(np.nanmean(suite_rewards)), - "avg_max_reward": float(np.nanmean(suite_max_rewards)), - "pc_success": float(np.nanmean(suite_successes) * 100), + "avg_sum_reward": float(np.nanmean(suite_rewards)) if suite_rewards else float("nan"), + "avg_max_reward": float(np.nanmean(suite_max)) if suite_max else float("nan"), + "pc_success": float(np.nanmean(suite_succ) * 100) if suite_succ else float("nan"), "eval_s": suite_eval_s, "eval_ep_s": suite_eval_ep_s, }, - "video_paths": suite_video_paths, - "episodes": None, # Modify if episode data is needed + "video_paths": suite_vids, + "episodes": None, } - overall_rewards.extend(suite_rewards) - overall_max_rewards.extend(suite_max_rewards) - overall_successes.extend(suite_successes) - overall_video_paths.extend(suite_video_paths) - - # Global metrics global_eval_s = time.time() - global_start - global_eval_ep_s = global_eval_s / max(1, len(overall_rewards)) - + global_eval_ep_s = global_eval_s / max(1, len(overall["sum_rewards"])) results["overall"] = { "aggregated": { - "avg_sum_reward": float(np.nanmean(overall_rewards)), - "avg_max_reward": float(np.nanmean(overall_max_rewards)), - "pc_success": float(np.nanmean(overall_successes) * 100), + "avg_sum_reward": float(np.nanmean(overall["sum_rewards"])) if overall["sum_rewards"] else float("nan"), + "avg_max_reward": float(np.nanmean(overall["max_rewards"])) if overall["max_rewards"] else float("nan"), + "pc_success": float(np.nanmean(overall["successes"]) * 100) if overall["successes"] else float("nan"), "eval_s": global_eval_s, "eval_ep_s": global_eval_ep_s, }, - "video_paths": overall_video_paths, - "episodes": overall_episode_data, + "video_paths": overall["video_paths"], + "episodes": None, } - return results + if __name__ == "__main__": init_logging() eval_main() diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index 74219fc38..3feeb0512 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -30,11 +30,12 @@ from lerobot.datasets.factory import make_dataset from lerobot.datasets.sampler import EpisodeAwareSampler from lerobot.datasets.utils import cycle from lerobot.envs.factory import make_env +from lerobot.envs.utils import close_envs from lerobot.optim.factory import make_optimizer_and_scheduler from lerobot.policies.factory import make_policy from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.policies.utils import get_device_from_parameters -from lerobot.scripts.eval import eval_policy, eval_policy_multitask +from lerobot.scripts.eval import eval_policy_all from lerobot.utils.logging_utils import AverageMeter, MetricsTracker from lerobot.utils.random_utils import set_seed from lerobot.utils.train_utils import ( @@ -270,62 +271,46 @@ def train(cfg: TrainPipelineConfig): if cfg.env and is_eval_step: step_id = get_step_identifier(step, cfg.steps) logging.info(f"Eval policy at step {step}") - with ( - torch.no_grad(), - torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(), - ): - if cfg.env.multitask_eval: - eval_info = eval_policy_multitask( - eval_env, - policy, - cfg.eval.n_episodes, - videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", - max_episodes_rendered=4, - start_seed=cfg.seed, - max_parallel_tasks=cfg.env.max_parallel_tasks, - ) - aggregated = eval_info["overall"]["aggregated"] - # Print per-suite stats, log? - for task_group, task_group_info in eval_info.items(): - if task_group == "overall": - continue # Skip the overall stats since we already printed it - print(f"\nAggregated Metrics for {task_group}:") - print(task_group_info["aggregated"]) - else: - eval_info = eval_policy( - eval_env, - policy, - cfg.eval.n_episodes, - videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", - max_episodes_rendered=4, - start_seed=cfg.seed, - ) - aggregated = eval_info["aggregated"] + with torch.no_grad(), (torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext()): + eval_info = eval_policy_all( + eval_env, # dict[suite][task_id] -> vec_env + policy, + cfg.eval.n_episodes, + videos_dir=videos_dir, + max_episodes_rendered=4, + start_seed=cfg.seed, + max_parallel_tasks=cfg.env.max_parallel_tasks, + verbose=False, + ) + # overall metrics (suite-agnostic) + aggregated = eval_info["overall"]["aggregated"] + + # optional: per-suite logging + for suite, suite_info in eval_info.items(): + if suite == "overall": + continue + logging.info("Suite %s aggregated: %s", suite, suite_info["aggregated"]) + + # meters/tracker eval_metrics = { "avg_sum_reward": AverageMeter("βˆ‘rwrd", ":.3f"), - "pc_success": AverageMeter("success", ":.1f"), - "eval_s": AverageMeter("eval_s", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), } eval_tracker = MetricsTracker( cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step ) - eval_tracker.eval_s = aggregated.pop("eval_s") - eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward") - eval_tracker.pc_success = aggregated.pop("pc_success") - logging.info(eval_tracker) + eval_tracker.eval_s = aggregated.get("eval_s", 0.0) + eval_tracker.avg_sum_reward = aggregated.get("avg_sum_reward", float("nan")) + eval_tracker.pc_success = aggregated.get("pc_success", float("nan")) if wandb_logger: wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} wandb_logger.log_dict(wandb_log_dict, step, mode="eval") wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval") if eval_env: - if cfg.env.multitask_eval: - for _task_group, envs_dict in eval_env.items(): - for _idx, env in envs_dict.items(): - env.close() - else: - eval_env.close() + close_envs(eval_env) logging.info("End of training") if cfg.policy.push_to_hub: From 565c992589ef2bf94a7fda75e857b3852702ee06 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Thu, 11 Sep 2025 13:47:58 +0200 Subject: [PATCH 05/12] iterate on review --- examples/6_evaluate_libero_2.sh | 76 - examples/7_train_acc.sh | 93 - examples/7_train_libero_smolvla.sh | 89 - examples/8_train_smolvla_must.sh | 145 -- examples/9_evaluate_must.sh | 2811 ---------------------------- examples/checker.py | 27 - examples/checker2.py | 35 - examples/convert_data.py | 253 --- examples/convert_libero.py | 126 -- examples/evaluate_libero.py | 255 --- examples/requirements.in | 8 - examples/script2.py | 70 - examples/script3.py | 64 - examples/script4.py | 3 - log_text.txt | 1765 ----------------- src/lerobot/envs/libero copy.py | 326 ---- src/lerobot/envs/libero2.py | 308 --- tmux_log.txt | 2008 -------------------- 18 files changed, 8462 deletions(-) delete mode 100644 examples/6_evaluate_libero_2.sh delete mode 100644 examples/7_train_acc.sh delete mode 100644 examples/7_train_libero_smolvla.sh delete mode 100644 examples/8_train_smolvla_must.sh delete mode 100644 examples/9_evaluate_must.sh delete mode 100644 examples/checker.py delete mode 100644 examples/checker2.py delete mode 100644 examples/convert_data.py delete mode 100644 examples/convert_libero.py delete mode 100644 examples/evaluate_libero.py delete mode 100644 examples/requirements.in delete mode 100644 examples/script2.py delete mode 100644 examples/script3.py delete mode 100644 examples/script4.py delete mode 100644 log_text.txt delete mode 100644 src/lerobot/envs/libero copy.py delete mode 100644 src/lerobot/envs/libero2.py delete mode 100644 tmux_log.txt diff --git a/examples/6_evaluate_libero_2.sh b/examples/6_evaluate_libero_2.sh deleted file mode 100644 index 9d05c0330..000000000 --- a/examples/6_evaluate_libero_2.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash - -# storage / caches -RAID=/raid/jade -export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers -export HF_HOME=$RAID/.cache/huggingface -export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets -export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot -export WANDB_CACHE_DIR=$RAID/.cache/wandb -export TMPDIR=$RAID/.cache/tmp -mkdir -p $TMPDIR -export WANDB_MODE=offline -export HF_DATASETS_OFFLINE=1 -export HF_HUB_OFFLINE=1 -export TOKENIZERS_PARALLELISM=false -export MUJOCO_GL=egl -export CUDA_VISIBLE_DEVICES=3 - -# CONFIGURATION -POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model" -POLICY_PATH="AustineJohnBreaker/smolvla_stratch_libero_spatial" -TASK=libero_spatial -ENV_TYPE="libero" -BATCH_SIZE=10 -N_EPISODES=10 -USE_AMP=false -N_ACTION_STEPS=1 -SELF_ATTN_EVERY_N_LAYERS=2 -VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct -PAD_LANG_TO=longest -LOAD_VLM_WEIGHTS=true -NUM_VLM_LAYERS=16 -CHUNK_SIZE=50 -N_OBS_STEPS=1 -NUM_EXPERT_LAYERS=0 -EXPERT_WIDTH_MULTIPLIER=0.5 - - -# storage / caches -RAID=/raid/jade -export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers -export HF_HOME=$RAID/.cache/huggingface -export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets -export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot -export WANDB_CACHE_DIR=$RAID/.cache/wandb -export TMPDIR=$RAID/.cache/tmp -mkdir -p $TMPDIR -export WANDB_MODE=offline -# export HF_DATASETS_OFFLINE=1 -# export HF_HUB_OFFLINE=1 -export TOKENIZERS_PARALLELISM=false -export MUJOCO_GL=egl -export MUJOCO_GL=egl -ADD_IMAGE_TOKENS=true -unset HF_HUB_OFFLINE -# RUN EVALUATION -python src/lerobot/scripts/eval.py \ - --policy.path="$POLICY_PATH" \ - --env.type="$ENV_TYPE" \ - --eval.batch_size="$BATCH_SIZE" \ - --eval.n_episodes="$N_EPISODES" \ - --env.multitask_eval=False \ - --env.task=$TASK \ - --policy.use_amp=$USE_AMP \ - --policy.n_action_steps=$N_ACTION_STEPS \ - # --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ - --policy.attention_mode=$ATTN_MODE \ - --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ - --policy.vlm_model_name=$VLM_NAME \ - --policy.pad_language_to=$PAD_LANG_TO \ - --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ - --policy.num_vlm_layers=$NUM_VLM_LAYERS \ - --policy.chunk_size=$CHUNK_SIZE \ - --policy.n_obs_steps=$N_OBS_STEPS \ - --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ - --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ diff --git a/examples/7_train_acc.sh b/examples/7_train_acc.sh deleted file mode 100644 index 27f445143..000000000 --- a/examples/7_train_acc.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -# smolvla training with accelerate - -set -euo pipefail - -# repo/env -cd ~/lerobot || exit 1 -# conda activate lerobot -export LC_ALL=C - -rm -f core-* - -# storage / caches -RAID=/raid/jade -export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers -export HF_HOME=$RAID/.cache/huggingface -export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets -export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot -export WANDB_CACHE_DIR=$RAID/.cache/wandb -export TMPDIR=$RAID/.cache/tmp -mkdir -p $TMPDIR -export WANDB_MODE=offline -export HF_DATASETS_OFFLINE=1 -export HF_HUB_OFFLINE=1 -export TOKENIZERS_PARALLELISM=false -export MUJOCO_GL=egl - -# CONFIG -ENV=libero -TASK=libero_spatial -REPO_ID=physical-intelligence/libero - -POLICY=smolvla -VLM=HuggingFaceTB/SmolVLM2-500M-Instruct - -# Optim / scheduling -LR=1e-4 -DECAY_LR=2.5e-6 -DECAY_STEPS=30000 -USE_AMP=true # set to true for mixed precision -TRAIN_EXPERT_ONLY=true -N_ACTION_STEPS=1 -SEED=1000 - -# Training loop -OFFLINE_STEPS=100000 -BATCH_SIZE=32 -EVAL_FREQ=0 -SAVE_FREQ=20000 -EVAL_BATCH_SIZE=1 -NUM_EPISODES=1 - -# number of gpus to use -NUM_PROCESSES=2 -export CUDA_VISIBLE_DEVICES=1,3 -PORT=29522 - -# naming/output dir -TRAIN_DIR=$RAID/logs/lerobot/lerobot_2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} -echo "Training dir: $TRAIN_DIR" - -rm -rf "$TRAIN_DIR" - -# RUN -python -m accelerate.commands.launch \ - --num_processes $NUM_PROCESSES \ - --num_machines 1 \ - --main_process_port $PORT \ - --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ - src/lerobot/scripts/train_accelerate.py \ - --policy.type=$POLICY \ - --policy.use_amp=True \ - --policy.vlm_model_name=$VLM \ - --dataset.repo_id=$REPO_ID \ - --dataset.root=$HF_DATASETS_CACHE \ - --env.type=$ENV \ - --env.task=$TASK \ - --output_dir=$TRAIN_DIR \ - --batch_size=$BATCH_SIZE \ - --steps=$OFFLINE_STEPS \ - --eval_freq=$EVAL_FREQ \ - --save_freq=$SAVE_FREQ \ - --eval.batch_size=$EVAL_BATCH_SIZE \ - --eval.n_episodes=$NUM_EPISODES \ - --policy.optimizer_lr=$LR \ - --policy.repo_id=None \ - --policy.scheduler_decay_lr=$DECAY_LR \ - --policy.scheduler_decay_steps=$DECAY_STEPS \ - --policy.n_action_steps=$N_ACTION_STEPS \ - --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ - --policy.vlm_model_name=$VLM \ - --seed=$SEED \ - --wandb.enable=false diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh deleted file mode 100644 index f0b9de4e5..000000000 --- a/examples/7_train_libero_smolvla.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# smolvla training - -set -euo pipefail - -# repo/env -cd ~/lerobot || exit 1 -# conda activate lerobot -export LC_ALL=C - - -rm -f core-* - -# storage / caches (use RAID to avoid filling $HOME) -RAID=/raid/jade -export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers -export HF_HOME=$RAID/.cache/huggingface -export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets -export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot -export WANDB_CACHE_DIR=$RAID/.cache/wandb -export TMPDIR=$RAID/.cache/tmp -mkdir -p $TMPDIR -export WANDB_MODE=offline -# export HF_DATASETS_OFFLINE=1 -# export HF_HUB_OFFLINE=1 -export TOKENIZERS_PARALLELISM=false -export MUJOCO_GL=egl - -# will only use if accelerate is used -PORT=29522 - -# =================== CONFIG =================== -ENV=libero -TASK=libero_object -REPO_ID=physical-intelligence/libero -ROOT=$RAID -POLICY=smolvla -VLM=HuggingFaceTB/SmolVLM2-500M-Instruct - -# Optim / scheduling -LR=1e-4 -DECAY_LR=2.5e-6 -DECAY_STEPS=30000 -USE_AMP=false -TRAIN_EXPERT_ONLY=true -N_ACTION_STEPS=1 -SEED=1000 - -# Training loop -OFFLINE_STEPS=100000 -BATCH_SIZE=32 -EVAL_FREQ=0 -SAVE_FREQ=300000 -EVAL_BATCH_SIZE=1 -NUM_EPISODES=1 - -# GPU selection 0, 1, 2, 3 -export CUDA_VISIBLE_DEVICES=0 - -# naming/output dir -TRAIN_DIR=$RAID/logs/lerobot/lerobot_solo_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} -echo "Training dir: $TRAIN_DIR" - -# train -rm -rf "$TRAIN_DIR" - -python src/lerobot/scripts/train.py \ - --policy.type=$POLICY \ - --policy.vlm_model_name=$VLM \ - --dataset.repo_id=$REPO_ID \ - --env.type=$ENV \ - --env.task=$TASK \ - --output_dir=$TRAIN_DIR \ - --batch_size=$BATCH_SIZE \ - --steps=$OFFLINE_STEPS \ - --eval_freq=$EVAL_FREQ \ - --save_freq=$SAVE_FREQ \ - --eval.batch_size=$EVAL_BATCH_SIZE \ - --eval.n_episodes=$NUM_EPISODES \ - --policy.use_amp=$USE_AMP \ - --policy.optimizer_lr=$LR \ - --policy.repo_id=None \ - --policy.scheduler_decay_lr=$DECAY_LR \ - --policy.scheduler_decay_steps=$DECAY_STEPS \ - --policy.n_action_steps=$N_ACTION_STEPS \ - --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ - --policy.vlm_model_name=$VLM \ - --seed=$SEED \ - --wandb.enable=false diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh deleted file mode 100644 index 828627d85..000000000 --- a/examples/8_train_smolvla_must.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/bin/bash -# smolvla training with accelerate - -set -euo pipefail - -# repo/env -cd ~/lerobot || exit 1 -# conda activate lerobot -export LC_ALL=C - -rm -f core-* - -# storage / caches -RAID=/raid/jade -export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers -export HF_HOME=$RAID/.cache/huggingface -export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets -export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot -export WANDB_CACHE_DIR=$RAID/.cache/wandb -export TMPDIR=$RAID/.cache/tmp -mkdir -p $TMPDIR -export WANDB_MODE=offline -# export HF_DATASETS_OFFLINE=1 -# export HF_HUB_OFFLINE=1 -export TOKENIZERS_PARALLELISM=false -export MUJOCO_GL=egl - -# CONFIG -ENV=libero -TASK=libero_spatial -REPO_ID=HuggingfaceVLA/libero - -POLICY=smolvla -VLM=HuggingFaceTB/SmolVLM2-500M-Instruct - -# Optim / scheduling -LR=1e-4 -DECAY_LR=2.5e-6 -DECAY_STEPS=30000 -USE_AMP=true # set to true for mixed precision -TRAIN_EXPERT_ONLY=true -N_ACTION_STEPS=1 -SEED=1000 -LOAD_VLM_WEIGHTS=true -# Training loop -OFFLINE_STEPS=100000 -BATCH_SIZE=32 -EVAL_FREQ=0 -SAVE_FREQ=20000 -EVAL_BATCH_SIZE=1 -NUM_EPISODES=1 -ADD_IMAGE_TOKENS=tru -N_OBS_STEPS=1 -ATTN_MODE=cross_attn -EXPERT_WIDTH_MULTIPLIER=0.5 -# number of gpus to use -NUM_PROCESSES=2 -NUM_VLM_LAYERS=0 -SELF_ATTN_EVERY_N_LAYERS=0 -CHUNK_SIZE=50 -export CUDA_VISIBLE_DEVICES=1 -PORT=29522 -PREFIX_LENGTH=0 -LOAD_VLM_WEIGHTS=true -MAX_ACTION_DIM=32 -MAX_STATE_DIM=32 -# naming/output dir -TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_sep11_v2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS} -echo "Training dir: $TRAIN_DIR" - -rm -rf "$TRAIN_DIR" - -# RUN -# python -m accelerate.commands.launch \ -# --num_processes $NUM_PROCESSES \ -# --num_machines 1 \ -# --main_process_port $PORT \ -# --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \ -# src/lerobot/scripts/train_accelerate.py \ -# --policy.type=$POLICY \ -# --policy.use_amp=True \ -# --policy.vlm_model_name=$VLM \ -# --dataset.repo_id=$REPO_ID \ -# --dataset.root=$HF_DATASETS_CACHE \ -# --env.type=$ENV \ -# --env.task=$TASK \ -# --output_dir=$TRAIN_DIR \ -# --batch_size=$BATCH_SIZE \ -# --steps=$OFFLINE_STEPS \ -# --eval_freq=$EVAL_FREQ \ -# --save_freq=$SAVE_FREQ \ -# --eval.batch_size=$EVAL_BATCH_SIZE \ -# --eval.n_episodes=$NUM_EPISODES \ -# --policy.optimizer_lr=$LR \ -# --policy.repo_id=None \ -# --policy.scheduler_decay_lr=$DECAY_LR \ -# --policy.scheduler_decay_steps=$DECAY_STEPS \ -# --policy.n_action_steps=$N_ACTION_STEPS \ -# --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ -# --policy.vlm_model_name=$VLM \ -# --policy.n_obs_steps=$N_OBS_STEPS \ -# --policy.attention_mode=$ATTN_MODE \ -# --policy.prefix_length=$PREFIX_LENGTH \ -# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ -# --policy.chunk_size=$CHUNK_SIZE \ -# --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ -# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ -# --seed=$SEED \ -# --wandb.enable=false - - -python src/lerobot/scripts/train.py \ - --policy.type=$POLICY \ - --policy.use_amp=False \ - --policy.vlm_model_name=$VLM \ - --dataset.repo_id=$REPO_ID \ - --dataset.root='/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data' \ - --env.type=$ENV \ - --env.task=$TASK \ - --output_dir=$TRAIN_DIR \ - --batch_size=$BATCH_SIZE \ - --steps=$OFFLINE_STEPS \ - --eval_freq=$EVAL_FREQ \ - --save_freq=$SAVE_FREQ \ - --eval.batch_size=$EVAL_BATCH_SIZE \ - --eval.n_episodes=$NUM_EPISODES \ - --policy.optimizer_lr=$LR \ - --policy.repo_id=None \ - --policy.scheduler_decay_lr=$DECAY_LR \ - --policy.scheduler_decay_steps=$DECAY_STEPS \ - --policy.n_action_steps=$N_ACTION_STEPS \ - --policy.train_expert_only=$TRAIN_EXPERT_ONLY \ - --policy.vlm_model_name=$VLM \ - --policy.n_obs_steps=$N_OBS_STEPS \ - --policy.attention_mode=$ATTN_MODE \ - --policy.prefix_length=$PREFIX_LENGTH \ - --policy.num_vlm_layers=$NUM_VLM_LAYERS \ - --policy.chunk_size=$CHUNK_SIZE \ - --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ - --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ - --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ - --policy.max_action_dim=$MAX_ACTION_DIM \ - --policy.max_state_dim=$MAX_STATE_DIM \ - --seed=$SEED \ - --wandb.enable=false diff --git a/examples/9_evaluate_must.sh b/examples/9_evaluate_must.sh deleted file mode 100644 index 534153330..000000000 --- a/examples/9_evaluate_must.sh +++ /dev/null @@ -1,2811 +0,0 @@ -#!/bin/bash - -#SBATCH --job-name=lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --gpus-per-node=1 -#SBATCH --mail-type=END,FAIL -#SBATCH --output=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/slurm/lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5.out -###SBATCH --nodelist=jean-zay-a101 -#SBATCH --cpus-per-task=45 -###SBATCH --exclusive -#SBATCH --time=15:00:00 -#SBATCH --mail-user=mustafa.shukor@isir.upmc.fr - -##SBATCH --partition=gpu_p2 -##SBATCH --qos=qos_gpu-t3 -###SBATCH -C v100-32g -##SBATCH -A dyf@v100 - -##SBATCH --partition=gpu_p5 -##SBATCH -C a100 -###SBATCH -A dyf@a100 -##SBATCH -A lqm@a100 -##SBATCH --qos=qos_gpu_a100-dev -##SBATCH --qos=qos_gpu_a100-t3 - -#SBATCH --partition=gpu_p6 -#SBATCH -C h100 -#SBATCH -A lqm@h100 -###SBATCH --qos=qos_gpu_h100-dev -#SBATCH --qos=qos_gpu_h100-t3 - -###SBATCH --begin=now+2hour - -# cd ~/lerobot_pi -# source ~/.bashrc -# source activate lerobot -# export LC_ALL=C - -# rm core-* -export CUDA_VISIBLE_DEVICES=3 -# storage / caches -RAID=/raid/jade -export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers -export HF_HOME=$RAID/.cache/huggingface -export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets -export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot -export WANDB_CACHE_DIR=$RAID/.cache/wandb -export TMPDIR=$RAID/.cache/tmp -mkdir -p $TMPDIR -export WANDB_MODE=offline -export HF_DATASETS_OFFLINE=1 -# export HF_HUB_OFFLINE=1 -export TOKENIZERS_PARALLELISM=false -export MUJOCO_GL=egl -export CUDA_VISIBLE_DEVICES=3 - -PORT=29512 - -## then later -## wandb sync wandb/offline-run-* - - -ENV=libero - -# TASK=libero_10 -TASK=libero_spatial -# TASK=libero_spatial -# TASK=libero_10 -# TASK=libero_spatial - - -POLICY_NAME=smolpi0 - -POLICY=smolpi0 -ENV=libero - - - - - - -CKPT_KEYS_MAPPING=model._orig_mod.//model. -LOAD_VLM_WEIGHTS=true -PEFT_METHOD=freeze -SELF_ATTN_ONLY_ACTIONS=false -CAUSAL_ATTENTION_ON_HISTORY=false - -PREDICT_RELATIVE_ACTIONS=false -RELATIVE_ACTIONS_MODE=first -SHUFFLE_CAMERA_POSITIONS=false - -VLM_IMG_SIZE=-1 -REGRESSION_LOSS=false - - -# ## Baseline for ablation study -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=max_length -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=max_length -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=false -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=max_length -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=self_attn -# STATE_TO_PREFIX=false -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=self_attn -# STATE_TO_PREFIX=false -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2250/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm22b/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest_explay0_vlml0_causalacttrue/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=self_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreffalse/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=false -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0_smolvlm2500/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=self_attn -# STATE_TO_PREFIX=false -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=self_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=8 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml8_causalactfalse_sa0/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=24 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml24_causalactfalse_sa0/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=100 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk100/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=30 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk30/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=10 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk10/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=1 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=16 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay16_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=2 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs2/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=3 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs3_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest/checkpoints/last/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="observation.state" -# N_OBS_STEPS=3 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststates/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="observation.state,image" -# N_OBS_STEPS=3 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststatesimgs/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=1 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.75/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.25 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.25/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="observation.state,image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="observation.state,image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="observation.state,image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1250_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-256M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm12b_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa2_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# PEFT_METHOD=lora -# PEFT_TARGET_MODEL=text -# LORA_TARGET_MODULES=q_proj,v_proj,k_proj -# LORA_R=32 -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# PEFT_METHOD=lora -# PEFT_TARGET_MODEL=text -# LORA_TARGET_MODULES=q_proj,v_proj,k_proj -# LORA_R=32 -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5_loraqkv/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# PEFT_METHOD=lora -# PEFT_TARGET_MODEL=text -# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj -# LORA_R=32 -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# PEFT_METHOD=lora -# PEFT_TARGET_MODEL=text -# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj -# LORA_R=32 -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# PEFT_METHOD=lora -# PEFT_TARGET_MODEL=text -# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj -# LORA_R=32 -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=false -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=true -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saacttrue/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=max_length -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_droptrue_max_length/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=max_length -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=max_length -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptdroidfull/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv3freeze/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2freeze/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans3true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=self_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=self_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=false -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze25_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze50_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze75_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze100_trans1false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans6true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans4true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans7true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans5true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans2true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans8true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans9true/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.25 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=1 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.25 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=1 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="observation.state" -# N_OBS_STEPS=3 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image,observation.state" -# N_OBS_STEPS=3 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image,observation.state" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5100000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image,observation.state" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image,observation.state" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0true_decaylr2.5e-630000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5200000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6200000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6200000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base - - - -# PREDICT_RELATIVE_ACTIONS=true -# RELATIVE_ACTIONS_MODE=relative -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruerelative/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# PREDICT_RELATIVE_ACTIONS=true -# RELATIVE_ACTIONS_MODE=first -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruefirst/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv1v2freezebs64transv0_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# REGRESSION_LOSS=true -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# REGRESSION_LOSS=true -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# REGRESSION_LOSS=true -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - - -# REGRESSION_LOSS=true -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.5 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=0 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=0 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - - -CAUSAL_ATTENTION_ON_HISTORY=true -SELF_ATTN_ONLY_ACTIONS=false -EXPERT_WIDTH_MULTIPLIER=0.75 -PAST_OBS_KEYS="image" -N_OBS_STEPS=1 -NUM_EXPERT_LAYERS=0 -CHUNK_SIZE=50 -NUM_VLM_LAYERS=16 -PAD_LANG_TO=longest -EVAL_CKPT=/raid/jade/models/smolvlamust -ADD_IMAGE_TOKENS=true -ATTN_MODE=cross_attn -STATE_TO_PREFIX=true -CAUSAL_ACTION_ATTENTION_MASK=true -SELF_ATTN_EVERY_N_LAYERS=2 -VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct - - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - -# CAUSAL_ATTENTION_ON_HISTORY=true -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct - - -# # TASK=libero_spatial -# MULTITASK_EVAL=false -# N_EPISODES=50 - -# TASK=libero_spatial,libero_object,libero_goal,libero_10 -MULTITASK_EVAL=true -# N_EPISODES=5 -N_EPISODES=1 - -# MAX_PARRALLEL_TASKS=5 -# MAX_PARRALLEL_TASKS=2 -MAX_PARRALLEL_TASKS=1 - -# NUM_EVALS=2 -# SEEDS=(1000 5000) -SEEDS=(5000) -# ACTION_STEPS_LIST=(1 10 30 50) -ACTION_STEPS_LIST=(1) -# ACTION_STEPS_LIST=(50) -TASK_LIST=(libero_spatial libero_object libero_goal libero_10) -TASK_LIST=(libero_spatial) -for SEED in "${SEEDS[@]}"; do - for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do - for TASK in "${TASK_LIST[@]}"; do - echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS | EVAL SEED=$SEED" - python src/lerobot/scripts/eval.py \ - --output_dir=/raid/jade/logs/lerobot/tmp \ - --env.type=$ENV \ - --env.task=$TASK \ - --eval.batch_size=$N_EPISODES \ - --eval.n_episodes=$N_EPISODES \ - --seed=$SEED \ - --policy.use_amp=false \ - --policy.path=$EVAL_CKPT \ - --policy.n_action_steps=$N_ACTION_STEPS \ - --policy.checkpoint_path=$EVAL_CKPT \ - --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \ - --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ - --policy.attention_mode=$ATTN_MODE \ - --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \ - --policy.state_to_prefix=$STATE_TO_PREFIX \ - --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ - --policy.pad_language_to=$PAD_LANG_TO \ - --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ - --policy.vlm_model_name=$VLM_NAME \ - --policy.num_vlm_layers=$NUM_VLM_LAYERS \ - --policy.chunk_size=$CHUNK_SIZE \ - --policy.n_obs_steps=$N_OBS_STEPS \ - --policy.past_obs_keys=$PAST_OBS_KEYS \ - --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ - --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ - --policy.peft_method=$PEFT_METHOD \ - --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \ - --policy.causal_attention_on_history=$CAUSAL_ATTENTION_ON_HISTORY \ - --policy.predict_relative_actions=$PREDICT_RELATIVE_ACTIONS --policy.relative_actions_mode=$RELATIVE_ACTIONS_MODE --policy.shuffle_camera_positions=$SHUFFLE_CAMERA_POSITIONS \ - --policy.vlm_img_size=$VLM_IMG_SIZE \ - --policy.regression_loss=$REGRESSION_LOSS - # --policy.peft_config.r=$LORA_R --policy.peft_config.target_modules=$LORA_TARGET_MODULES --policy.peft_method=$PEFT_METHOD --policy.peft_target_model=$PEFT_TARGET_MODEL - - echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS | EVAL SEED=$SEED" - echo "------------------------------------------------------" - done - done -done - - -# ############################################################################################################################################ -# ############################################################################################################################################ -# ############################################################################################################################################ -# ########### Offline eval - - -# # ############################ -# # # Community datasets V1 -# # # REPO_ID=pranavsaroha/so100_legos4,pranavsaroha/so100_onelego2,jpata/so100_pick_place_tangerine,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,HITHY/so100_strawberry,vladfatu/so100_above,koenvanwijk/orange50-1,koenvanwijk/orange50-variation-2,FeiYjf/new_GtoR,CSCSXX/pick_place_cube_1.18,vladfatu/so100_office,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,Ityl/so100_recording2,dragon-95/so100_sorting_2,dragon-95/so100_sorting_3,aractingi/push_cube_offline_data,HITHY/so100_peach3,HITHY/so100_peach4,shreyasgite/so100_legocube_50,shreyasgite/so100_base_env,triton7777/so100_dataset_mix,Deason11/Open_the_drawer_to_place_items,Deason11/PLACE_TAPE_PUSH_DRAWER,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th -# # # Inconsistent actions dim: Deason11/Open_the_drawer_to_place_items, Deason11/PLACE_TAPE_PUSH_DRAWER -# # # Filtered datasets -# # REPO_ID=pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th -# # SAMPLING_WEIGHTS= -# # DATASET_NAME=so100_community_v1 - - -# # # Community datasets V2 -# # # Inconsistent actions: 1g0rrr/sam_openpi_solder1, 1g0rrr/sam_openpi03, 1g0rrr/sam_openpi_solder2 -# # # Other issues: pierfabre/rabbit bensprenger/right_arm_p_brick_in_box_with_y_noise_v0 pierfabre/horse pierfabre/pig2 pierfabre/pig3 pierfabre/cow2,pierfabre/sheep -# # # REPO_ID=Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7 -# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7 -# # SAMPLING_WEIGHTS= -# # DATASET_NAME=so100_community_v2 - -# # Community datasets V1+V2 -# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7,pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th -# REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2 -# SAMPLING_WEIGHTS= - -# # # Community V3 -# # # issues, yskim2025/unitylerobot (version), cranberrysoft/so100 (don't exist),29 datasets different actions: nguyen-v/so100_rotate_red_button satvikahuja/mixer_on_off_new_1 ... -# # REPO_ID=satvikahuja/mixer_on_off_new_1,aergogo/so100_pick_place,andy309/so100_0314_fold_cloths,jchun/so100_pickplace_small_20250323_120056,astroyat/cube,Ofiroz91/so_100_cube2bowl,HappyPablo/dec3_data2,ZCM5115/so100_1210,francescocrivelli/orange_feeding,francescocrivelli/carrot_eating,0x00raghu/toffee_red,0x00raghu/toffee_red_2,0x00raghu/toffee_red_3__,0x00raghu/toffee_blue,0x00raghu/toffee_blue_2,0x00raghu/toffee_to_hand_1,0x00raghu/toffee_to_hand_2,liyitenga/so100_bi_hello,liyitenga/so100_bi_giveme5,ZCM5115/so100_2Arm3cameras_movebox,pranavsaroha/so100_carrot_1,pranavsaroha/so100_carrot_3,pranavsaroha/so100_carrot_4,maximilienroberti/so100_lego_red_box,pranavsaroha/so100_squishy,rabhishek100/so100_train_dataset,pranavsaroha/so100_squishy100,swarajgosavi/kikobot_pusht_real_v2,pandaRQ/pickmed,swarajgosavi/act_kikobot_pusht_real,pranavsaroha/so100_squishy2colors,pranavsaroha/so100_squishy2colors_1,Chojins/chess_game_001_white,jmrog/so100_sweet_pick,Chojins/chess_game_002_white,pranavsaroha/so100_squishy2colors_2_new,Chojins/chess_game_003_white,aractingi/pick_place_lego_cube,Chojins/chess_game_004_white,Chojins/chess_game_005_white,Chojins/chess_game_006_white,Chojins/chess_game_007_white,koenvanwijk/blue2,jlitch/so100multicam3,koenvanwijk/blue52,jlitch/so100multicam6,aractingi/pick_place_lego_cube_1,jlitch/so100multicam7,vladfatu/so100_ds,Chojins/chess_game_000_white,HITHY/so100-kiwi,HITHY/so100_peach1,HITHY/so100_redstrawberry,satvikahuja/orange_mixer_1,satvikahuja/mixer_on_off,satvikahuja/orange_pick_place_new1,satvikahuja/mixer_on_off_new,danmac1/real_real332,FeiYjf/Makalu_push,liyitenga/so100_pick_taffy1,chmadran/so100_dataset04,FeiYjf/Maklu_dataset,FeiYjf/new_Dataset,liyitenga/so100_pick_taffy2,satvikahuja/mixer_on_off_new_4,CSCSXX/pick_place_cube_1.17,liyitenga/so100_pick_taffy3,liyitenga/so100_pick_taffy4,yuz1wan/so100_pick_pink,yuz1wan/so100_pick_wahaha,yuz1wan/so100_pp_pink,yuz1wan/so100_pour_cup,liyitenga/so100_pick_taffy5,liyitenga/so100_pick_taffy6,yuz1wan/so100_button,yuz1wan/so100_pickplace,liyitenga/so100_pick_taffy7,FeiYjf/push_gg,FeiYjf/push_0094,swarajgosavi/act_kikobot_block_real,liyitenga/so100_pick_taffy8,phospho-ai/OrangeBrick3Cameras,vaishanthr/toy_pick_place,SeanLMH/so100_picknplace_v2,pepijn223/yellow_lego_in_box1,DimiSch/so100_50ep_2,DimiSch/so100_50ep_3,SeanLMH/so100_picknplace,nbaron99/so100_pick_and_place2,chmadran/so100_dataset08,vaishanthr/toy_pickplace_50ep,Beegbrain/pick_place_green_block_lr,Ityl/so100_recording1,vaishanthr/toy_pickplace,ad330/so100_box_pickPlace,Beegbrain/so100_put_cube_cup,aractingi/push_green_cube_hf,aractingi/push_green_cube_hf_cropped_resized,carpit680/giraffe_task,carpit680/giraffe_sock_demo_1,DimiSch/so100_terra_50_2,carpit680/giraffe_sock_demo_2,aractingi/push_cube_to_face_reward,aractingi/push_cube_to_face_reward_cropped_resized,aractingi/push_cube_reward_data,aractingi/push_cube_reward_data_cropped_resized,aractingi/push_cube_offline_data_cropped_resized,aractingi/push_cube_front_side_reward,aractingi/push_cube_front_side_reward_cropped_resized,aractingi/push_cube_front_side_reward_long,aractingi/push_cube_front_side_reward_long_cropped_resized,aractingi/push_cube_reward,aractingi/push_cube_reward_cropped_resized,aractingi/push_cube_square_reward_cropped_resized,aractingi/push_cube_square_reward_1,aractingi/push_cube_square_reward_1_cropped_resized,aractingi/push_cube_square_light_reward,aractingi/push_cube_square_light_offline_demo,aractingi/push_cube_square_light_offline_demo_cropped_resized,denghj/dataset_red_tape01,aractingi/push_cube_square_offline_demo,aractingi/push_cube_square_offline_demo_cropped_resized,Beegbrain/stack_two_cubes,FeiYjf/Test_NNNN,LegrandFrederic/Orange-brick-lower-resolution,aractingi/pick_place_lego_cube_cropped_resized,aractingi/push_cube_overfit,aractingi/push_cube_overfit_cropped_resized,HITHY/so100_peach,zaringleb/so100_cube_2,andreasBihlmaier/dual_arm_transfer_2025_02_16,zaringleb/so100_cube_4_binary,1g0rrr/reward_pickplace1,1g0rrr/reward_pickplace1_cropped_resized,FeiYjf/Hold_Pieces,FeiYjf/Grab_Pieces,hegdearyandev/so100_eraser_cup_v1,jbraumann/so100_1902,liyitenga/so100_pick_taffy10,mikechambers/block_cup_5,zaringleb/so100_cube_5_linear,yuz1wan/so100_pickplace_0223_2,yuz1wan/so100_pickplace_0223_3,samsam0510/mj_data_temp,samsam0510/tape_insert_1,samsam0510/tape_insert_2,pengjunkun/so100_push_to_hole,Deason11/Random_Kitchen,1g0rrr/reward_dataset_name2,1g0rrr/reward_dataset_name2_cropped_resized,1g0rrr/offline_dataset_name2,1g0rrr/offline_dataset_name2_cropped_resized,aractingi/push_cube_simp_cropped_resized,danielkr452/so100_work6,Loki0929/so100_100,yuz1wan/so100_fold_0227_1,yuz1wan/so100_fold_0227_2,speedyyoshi/so100_grasp_pink_block,lirislab/stack_two_red_cubes,lirislab/red_cube_into_mug,lirislab/green_lego_block_into_mug,lirislab/green_lego_block_into_mug_easy,kevin510/lerobot-cat-toy-placement,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA_BOX,wangjl1512/pour_water,airthebear/so100_GL,zijian2022/noticehuman1,zijian2022/noticehuman2,kantine/so100_kapla_tower6,zijian2022/noticehuman5,zijian2022/llm40,Ashton3/lerobot-aloha,zijian2022/noticehuman50,AaronNewman/screwdriver_task_batch1,AaronNewman/screwdriver_task_batch2,AaronNewman/screwdriver_task_batch3,zijian2022/noticehuman60,zijian2022/noticehuman70,Bartm3/tape_to_bin,liuhuanjim013/so100_th_1,Pi-robot/barbecue_flip,Pi-robot/barbecue_put,wangjl1512/doll,sshh11/so100_orange_50ep_1,sshh11/so100_orange_50ep_2,DorayakiLin/so100_pick_cube_in_box,Bartm3/tape_to_bin2,luke250305/play_dice_250311.1,andy309/so100_0311_1152,sihyun77/suho_so100,sihyun77/si_so100,shreyasgite/so100_base_left,sihyun77/suho_red,liuhuanjim013/so100_block,andy309/so100_0313_no_wrist_camera,zijian2022/l9,zijian2022/n1_2,DorayakiLin/so100_stack_cube,andy309/so100_0313_no_wrist_camera_with_two_arms_cloths,joaoocruz00/so100_makeitD1,zijian2022/l10_1,zijian2022/l10_5,sihyun77/suho_red2,sihyun77/suho_angel,sihyun77/sihyun_king,acrampette/third_arm_01,Winster/so100_cube,1g0rrr/sam_openpi03,thedevansh/mar16_1336,hkphoooey/throw_stuffie,doujiangwang/task1_10epi_100000step,sihyun77/sihyun_3_17_1,acrampette/third_arm_02,imsyed00/so100_yellowbowl_pickplace_1,kumarhans/so100_tape_task,sihyun77/sihyun_main,doujiangwang/task2_10epi_100000step,kantine/industrial_robothon_buttons_expert,kantine/industrial_robothon_buttons_anomaly,kantine/industrial_robothon_hatchAndProbe_expert,kantine/industrial_robothon_hatchAndProbe_anomaly,Odog16/so100_tea_towel_folding_v1,zijian2022/so100_318,zijian2022/so100_318_1,Congying1112/so100_place_blue_bottle_with_two_cameras,Congying1112/so100_place_blue_bottle_with_two_cameras2,Congying1112/so100_place_blue_bottle_with_single_camera,pietroom/first_task_short,kantine/industrial_screws_sorting_expert,kantine/industrial_screws_sorting_anomaly,pietroom/second_task,zijian2022/c0,doujiangwang/task4_10epi_100000step,Congying1112/so100_switch_with_onhand_camera,HYAIYN/so100_get_orange_10epi,doujiangwang/task5_10epi_100000step,1g0rrr/sam_openpi_cube_low10,1g0rrr/sam_openpi_cube_top10,1g0rrr/sam_openpi_wire10,1g0rrr/sam_openpi_solder1,1g0rrr/sam_openpi_solder2,wcode/so100_put_pen_50,jchun/so100_pickplace_small_20250322_193929,bnarin/so100_tic_tac_toe_we_do_it_live,dc2ac/so100-t5,chmadran/so100_home_dataset,baladhurgesh97/so100_final_picking_3,bnarin/so100_tic_tac_toe_move_0_0,bnarin/so100_tic_tac_toe_move_1_0,bnarin/so100_tic_tac_toe_move_2_1,bnarin/so100_tic_tac_toe_move_4_0,zaringleb/so100_cube_6_2d,andlyu/so100_indoor_0,andlyu/so100_indoor_2,Winster/so100_sim,badwolf256/so100_twin_cam_duck,Congying1112/so100_simplepick_with_2_cameras_from_top,andlyu/so100_indoor_4,Zak-Y/so100_grap_dataset,kantine/domotic_pouringCoffee_expert,kantine/domotic_pouringCoffee_anomaly,lucasngoo/so100_strawberry_grape,kantine/domotic_makingCoffee_expert,kantine/domotic_makingCoffee_anomaly,ZGGZZG/so100_drop1,kantine/industrial_soldering_expert,kantine/industrial_soldering_anomaly,Yotofu/so100_sweeper_shoes,kantine/domotic_dishTidyUp_expert,kantine/domotic_dishTidyUp_anomaly,kantine/domotic_groceriesSorting_expert,kantine/domotic_groceriesSorting_anomaly,badwolf256/so100_twin_cam_duck_v2,kantine/domotic_vegetagblesAndFruitsSorting_expert,kantine/domotic_vegetagblesAndFruitsSorting_anomaly,kantine/domotic_setTheTable_expert,kantine/domotic_setTheTable_anomaly,therarelab/so100_pick_place,abhisb/so100_51_ep,andlyu/so100_indoor_val_0,allenchienxxx/so100Test,lizi178119985/so100_jia,badwolf256/so100_twin_cam_duck_v3,andrewcole712/so100_tape_bin_place,Gano007/so100_lolo,Zak-Y/so100_three_cameras_dataset,Gano007/so100_doliprane,XXRRSSRR/so100_v3_num_episodes_50,zijian2022/assemblyarm2,ganker5/so100_action_20250403,andlyu/so100_indoor_val2,Gano007/so100_gano,paszea/so100_whale_grab,paszea/so100_whale,Clementppr/lerobot_pick_and_place_dataset_world_model,andlyu/so100_indoor_10,RasmusP/so100_dataset50ep_a,RasmusP/so100_dataset50ep,Gano007/so100_second,zaringleb/so100_cude_linear_and_2d_comb,dsfsg/grasp_pens,zijian2022/digitalfix,zijian2022/digitalfix2,zijian2022/digitalfix3,T1g3rGE/so100_pickplace_small_20250407_171912,sihyun77/mond_13,abokinala/sputnik_100_11_pick_place_container,dsfsg/bring_bottle,duthvik/sputnik_100_13_pick_place_container,abokinala/sputnik_100_12_pick_place_container,Mwuqiu/so100_0408,AK51/4090_01,356c/so100_rope_reposition_1,paszea/so100_lego_mix,abokinala/sputnik_100_14_pick_place_container,abokinala/sputnik_100_23_pick_place_surface,jiajun001/eraser00_2,jlesein/TestBoulon2,duthvik/sputnik_100_31_pour_liquid,duthvik/sputnik_100_24_pick_place_surface,duthvik/sputnik_100_25_pick_place_surface,duthvik/sputnik_100_17_pick_place_container,duthvik/sputnik_100_26_pick_place_surface,VoicAndrei/so100_banana_to_plate_rebel_full,isadev/bougies1,danaaubakirova/so100_task_1,danaaubakirova/so100_task_2,danaaubakirova/so100_task_3,danaaubakirova/so100_task_4,sixpigs1/so100_pick_cube_in_box_error,sixpigs1/so100_push_cube_error,sixpigs1/so100_pull_cube_error,isadev/bougies2,therarelab/med_dis_rare_6,duthvik/sputnik_100_27_pick_place_surface,zijian2022/closer3,duthvik/sputnik_100_41_custom_tasks,duthvik/sputnik_100_42_custom_tasks,duthvik/sputnik_100_43_custom_tasks,duthvik/sputnik_100_44_custom_tasks,duthvik/sputnik_100_51_kitchen_tasks,duthvik/sputnik_100_52_kitchen_tasks,duthvik/sputnik_100_53_kitchen_tasks,duthvik/sputnik_100_45_custom_tasks,duthvik/sputnik_100_32_pour_liquid,duthvik/sputnik_100_29_pick_place_surface,duthvik/sputnik_100_18_pick_place_container,sixpigs1/so100_pull_cube_by_tool_error,sixpigs1/so100_insert_cylinder_error,abokinala/sputnik_100_54_kitchen_tasks,abokinala/sputnik_100_55_kitchen_tasks,m1b/so100_bluelego,abokinala/sputnik_100_46_custom_tasks,m1b/so100_bluelego_updt,kantine/flip_A0,kantine/flip_A1,kantine/flip_A2,kantine/flip_A3,lirislab/guess_who_no_cond,kantine/flip_A4,kantine/flip_A5,lirislab/guess_who_lighting,nguyen-v/so100_press_red_button,nguyen-v/so100_bimanual_grab_lemon_put_in_box2,pierfabre/cow,nguyen-v/press_red_button_new,nguyen-v/so100_rotate_red_button,raghav-katta-1/lerobot2,Cidoyi/so100_all_notes,roboticshack/team10-red-block,Cidoyi/so100_all_notes_1,roboticshack/team_5-QuiEstCe_everyBox,roboticshack/team11_pianobot,roboticshack/team2-guess_who_so100,roboticshack/team2-guess_who_so100_light,roboticshack/team2-guess_who_so100_edge_case,roboticshack/team2-guess_who_less_ligth,Cidoyi/so100_all_notes_3,dsfsg/grasp_pen_and_bottle,abokinala/sputnik_100_60_kitchen_tasks,abokinala/sputnik_100_58_kitchen_tasks,danaaubakirova/so100_v2_task_1,danaaubakirova/so100_v2_task_2,danaaubakirova/so100_v2_task_3,danaaubakirova/so100_v2_task_4,zijian2022/force1,zijian2022/force2,zijian2022/force3,jiajun001/eraser00_3,zijian2022/bi2,zijian2022/bi1,zijian2022/hand1,Setchii/so100_grab_ball,MossProphet/so100_square-1-2-3.2 -# # SAMPLING_WEIGHTS= -# # DATASET_NAME=so100_community_v3 - -# ########################## - -# ROBOT=so100 -# export TOKENIZERS_PARALLELISM=false -# export MUJOCO_GL=egl - - - -# SAMPLING_WEIGHTS= -# FEATURES_VERSION=2 -# NUM_IMAGE_TRANSFORMS=10 -# TRAIN_ON_ALL_FEATURES=true -# NORM_PER_ROBOT=true -# USE_IMAGENET_STATS=false - -# MAX_STATE_DIM=6 -# MAX_ACTION_DIM=6 -# MAX_NUM_IMAGES=3 -# MAX_IMAGE_DIM=256 - - -# SEED=5000 -# BATCH_SIZE=32 -# # EVAL_STEPS=1000 -# EVAL_STEPS=100 - - - - - -# SELF_ATTN_ONLY_ACTIONS=false -# EXPERT_WIDTH_MULTIPLIER=0.75 -# PAST_OBS_KEYS="image" -# N_OBS_STEPS=1 -# NUM_EXPERT_LAYERS=0 -# CHUNK_SIZE=50 -# NUM_VLM_LAYERS=16 -# PAD_LANG_TO=longest -# EVAL_CKPT=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/lerobot/lerobot_so100_community_v1_v2_smolpi0_lr1e-4bs64steps200000gpus4freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_feat2_lrvlm1e-4_droptrue_max_length/checkpoints/080000/pretrained_model/ -# ADD_IMAGE_TOKENS=true -# ATTN_MODE=cross_attn -# STATE_TO_PREFIX=true -# CAUSAL_ACTION_ATTENTION_MASK=true -# SELF_ATTN_EVERY_N_LAYERS=2 -# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct - - -# python lerobot/scripts/offline_inference.py \ -# --output_dir=$WORK/logs/lerobot/tmp \ -# --batch_size=$BATCH_SIZE \ -# --seed=$SEED \ -# --eval_steps=$EVAL_STEPS \ -# --use_amp=false \ -# --device=cuda \ -# --dataset.repo_id=$REPO_ID --dataset.local_files_only=true --dataset.sampling_weights=$SAMPLING_WEIGHTS --dataset.use_imagenet_stats=$USE_IMAGENET_STATS --policy.normalize_per_robot_type=$NORM_PER_ROBOT \ -# --dataset.image_transforms.max_num_transforms=$NUM_IMAGE_TRANSFORMS --dataset.image_transforms.enable=true --dataset.train_on_all_features=$TRAIN_ON_ALL_FEATURES \ -# --dataset.max_action_dim=$MAX_ACTION_DIM --dataset.max_state_dim=$MAX_STATE_DIM --dataset.max_num_images=$MAX_NUM_IMAGES --dataset.max_image_dim=$MAX_IMAGE_DIM --dataset.features_version=$FEATURES_VERSION \ -# --policy.type=$POLICY \ -# --policy.checkpoint_path=$EVAL_CKPT \ -# --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \ -# --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ -# --policy.attention_mode=$ATTN_MODE \ -# --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \ -# --policy.state_to_prefix=$STATE_TO_PREFIX \ -# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ -# --policy.vlm_model_name=$VLM_NAME \ -# --policy.pad_language_to=$PAD_LANG_TO \ -# --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ -# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ -# --policy.chunk_size=$CHUNK_SIZE \ -# --policy.n_obs_steps=$N_OBS_STEPS \ -# --policy.past_obs_keys=$PAST_OBS_KEYS \ -# --policy.num_expert_layers=$NUM_EXPERT_LAYERS \ -# --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \ -# --policy.peft_method=$PEFT_METHOD \ -# --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \ -# --policy.robot_type=$ROBOT - - - - -# MULTITASK_EVAL=true -# N_EPISODES=5 -# MAX_PARRALLEL_TASKS=1 -# ACTION_STEPS_LIST=(1) -# TASK_LIST=(libero_10) -# for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do -# for TASK in "${TASK_LIST[@]}"; do -# echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS" -# python lerobot/scripts/eval.py \ -# --output_dir=$WORK/logs/lerobot/tmp \ -# --env.type=$ENV \ -# --env.task=$TASK \ -# --eval.batch_size=$N_EPISODES \ -# --eval.n_episodes=$N_EPISODES \ -# --use_amp=false \ -# --device=cuda \ -# --policy.n_action_steps=$N_ACTION_STEPS \ -# --policy.type=$POLICY \ -# --policy.checkpoint_path=$EVAL_CKPT \ -# --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \ -# --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \ -# --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \ -# --policy.attention_mode=$ATTN_MODE \ -# --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \ -# --policy.state_to_prefix=$STATE_TO_PREFIX \ -# --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \ -# --policy.vlm_model_name=$VLM_NAME \ -# --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \ -# --policy.num_vlm_layers=$NUM_VLM_LAYERS \ -# --policy.chunk_size=$CHUNK_SIZE - -# echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS" -# echo "------------------------------------------------------" -# done -# done - diff --git a/examples/checker.py b/examples/checker.py deleted file mode 100644 index 12377e9b9..000000000 --- a/examples/checker.py +++ /dev/null @@ -1,27 +0,0 @@ -from huggingface_hub import HfApi -api = HfApi() -# api.upload_large_folder( -# repo_id="HuggingFaceVLA/libero", -# repo_type="dataset", -# folder_path="/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero", -# ) -api.upload_large_folder( - repo_id="HuggingFaceVLA/metaworld_mt50", - repo_type="dataset", - folder_path="/raid/jade/.cache/huggingface/lerobot/metaworld_mt50", -) -# repo_id="HuggingFaceVLA/libero" -# # Upload extra files -# api.upload_file( -# repo_id=repo_id, -# repo_type="dataset", -# path_or_fileobj="/raid/jade/libero_converted/README.md", -# path_in_repo="README.md" -# ) - -# api.upload_folder( -# repo_id=repo_id, -# repo_type="dataset", -# folder_path="/raid/jade/libero_converted/meta", -# path_in_repo="meta" -# ) diff --git a/examples/checker2.py b/examples/checker2.py deleted file mode 100644 index a5825d87f..000000000 --- a/examples/checker2.py +++ /dev/null @@ -1,35 +0,0 @@ -import pyarrow.parquet as pq - -# # First parquet (cached HF version) -meta1 = pq.read_metadata("/raid/jade/.cache/huggingface/datasets/data/chunk-000/episode_000000.parquet") -meta1 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") -print("First parquet key_value_metadata:") -print(meta1.metadata) # low-level file metadata -# print() -print("Second") -# Second parquet (your converted version) -meta2 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet") -print("\nSecond parquet key_value_metadata:") -# print(meta2.metadata) - -# from datasets import load_dataset -# root_dir = "/raid/jade/libero_converted" - -# # Load all parquet files under the root_dir recursively -# ds = load_dataset("parquet", data_files=f"{root_dir}/**/*.parquet") - -# print(ds) # prints split info -# print(ds["train"].features) # check schema/features - -# # Peek at one row -# example = ds["train"][0] -# print(example.keys()) -# print(type(example["observation.images.image"])) -# print(type(example["observation.images.image2"])) - -import pyarrow.parquet as pq - -for ep in ["episode_000019.parquet", "episode_000021.parquet", "episode_000026.parquet"]: - path = f"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/{ep}" - schema = pq.read_schema(path) - print(ep, schema.names) diff --git a/examples/convert_data.py b/examples/convert_data.py deleted file mode 100644 index 96ce58cb1..000000000 --- a/examples/convert_data.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert local LeRobot datasets from v2.0 to v2.1 format. -This script adapts the official converter to work with local datasets. -""" - -import sys -import argparse -import logging -from pathlib import Path - -# Add lerobot to path -sys.path.insert(0, '/home/jade_choghari/lerobot/src') - -from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset -from lerobot.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info -from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -def convert_local_dataset( - dataset_path: str, - num_workers: int = 4, - skip_if_converted: bool = True -): - """ - Convert a local dataset from v2.0 to v2.1 format. - - Args: - dataset_path: Path to the local dataset directory - num_workers: Number of workers for parallel processing - skip_if_converted: Skip if already has episodes_stats.jsonl - """ - dataset_path = Path(dataset_path) - - print(f"πŸ”„ Converting local dataset: {dataset_path}") - - # Check if already converted - episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" - if episodes_stats_path.exists() and skip_if_converted: - # Check if file is empty - file_size = episodes_stats_path.stat().st_size - if file_size == 0: - print(f" ⚠️ episodes_stats.jsonl is empty, will regenerate") - else: - # Check if file has content - with open(episodes_stats_path, 'r') as f: - content = f.read().strip() - if not content: - print(f" ⚠️ episodes_stats.jsonl has no content, will regenerate") - else: - print(f" ⏭️ Already has episodes_stats.jsonl, skipping") - return True - - try: - # Check if this is a v2.0 dataset that needs conversion - episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" - stats_path = dataset_path / "meta" / "stats.json" - - if not episodes_stats_path.exists() and stats_path.exists(): - print(f" πŸ”„ Detected v2.0 dataset, creating temporary episodes_stats.jsonl...") - # Create empty episodes_stats.jsonl to allow loading - episodes_stats_path.touch() - created_temp_file = True - else: - created_temp_file = False - - # Load dataset from local path with pyav video backend - print(f" πŸ“‚ Loading dataset from local path...") - # Use a dummy repo_id since we're loading locally - dummy_repo_id = f"{dataset_path.parent.name}/{dataset_path.name}" - dataset = LeRobotDataset( - dummy_repo_id, - root=str(dataset_path), - # video_backend="pyav", - # local_files_only=True - ) - - # Remove temporary file if we created it - if created_temp_file and episodes_stats_path.exists() and episodes_stats_path.stat().st_size == 0: - episodes_stats_path.unlink() - print(f" πŸ—‘οΈ Removed temporary episodes_stats.jsonl") - - # Remove existing episodes_stats if present (ensure clean conversion) - episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl" - if episodes_stats_path.exists(): - episodes_stats_path.unlink() - print(f" πŸ—‘οΈ Removed existing episodes_stats.jsonl") - - # Check if video directory exists before conversion - videos_dir = dataset_path / "videos" - if not videos_dir.exists(): - print(f" ⚠️ No videos directory found - will skip video statistics") - - # Convert stats - print(f" πŸ“Š Computing episode statistics...") - convert_stats(dataset, num_workers=num_workers) - - # Load reference stats for validation if they exist - stats_path = dataset.root / STATS_PATH - if stats_path.exists(): - print(f" βœ… Validating against reference stats...") - try: - ref_stats = load_stats(dataset.root) - check_aggregate_stats(dataset, ref_stats) - print(f" βœ… Stats validation passed!") - except AssertionError as e: - print(f" ⚠️ Stats validation failed with minor differences: {e}") - print(f" ⚠️ This is likely due to floating-point precision, continuing anyway...") - # Check if the error is just a small numerical difference - if "Max absolute difference:" in str(e) and "Max relative difference:" in str(e): - print(f" βœ… Treating as acceptable numerical precision difference") - else: - raise e - - # Remove old stats.json file - print(f" πŸ—‘οΈ Removing old stats.json") - stats_path.unlink() - else: - print(f" ⚠️ No reference stats found, skipping validation") - - # Update codebase version - dataset.meta.info["codebase_version"] = CODEBASE_VERSION - write_info(dataset.meta.info, dataset.root) - - print(f" βœ… Successfully converted to v2.1") - return True - - except Exception as e: - print(f" ❌ Failed to convert: {e}") - logger.exception("Conversion failed") - return False - -def convert_multiple_datasets( - base_dirs: list[str], - max_datasets: int = None, - num_workers: int = 4 -): - """Convert multiple datasets from base directories.""" - - datasets_to_convert = [] - - # Scan for datasets needing conversion - for base_dir in base_dirs: - base_path = Path(base_dir) - if not base_path.exists(): - print(f"⚠️ Directory not found: {base_dir}") - continue - - print(f"πŸ” Scanning: {base_dir}") - - # Walk through author/dataset structure - for author_dir in sorted(base_path.iterdir()): - if not author_dir.is_dir(): - continue - - for dataset_dir in sorted(author_dir.iterdir()): - if not dataset_dir.is_dir(): - continue - - # Check if needs conversion - episodes_stats_path = dataset_dir / "meta" / "episodes_stats.jsonl" - info_path = dataset_dir / "meta" / "info.json" - - needs_conversion = False - if info_path.exists(): - if not episodes_stats_path.exists(): - needs_conversion = True - print(f" πŸ“ Found (missing): {author_dir.name}/{dataset_dir.name}") - else: - # Check if episodes_stats file is empty - try: - file_size = episodes_stats_path.stat().st_size - if file_size == 0: - needs_conversion = True - print(f" πŸ“ Found (empty): {author_dir.name}/{dataset_dir.name}") - else: - # Check if file has content - with open(episodes_stats_path, 'r') as f: - content = f.read().strip() - if not content: - needs_conversion = True - print(f" πŸ“ Found (no content): {author_dir.name}/{dataset_dir.name}") - except Exception as e: - # If we can't read the file, consider it needs conversion - needs_conversion = True - print(f" πŸ“ Found (read error): {author_dir.name}/{dataset_dir.name}") - - if needs_conversion: - datasets_to_convert.append(dataset_dir) - - if not datasets_to_convert: - print("πŸŽ‰ No datasets need conversion!") - return - - if max_datasets: - datasets_to_convert = datasets_to_convert[:max_datasets] - - print(f"\nπŸš€ Converting {len(datasets_to_convert)} datasets...") - - successful = 0 - failed = 0 - - for i, dataset_path in enumerate(datasets_to_convert, 1): - print(f"\n[{i}/{len(datasets_to_convert)}] {dataset_path.parent.name}/{dataset_path.name}") - - success = convert_local_dataset(dataset_path, num_workers=num_workers) - if success: - successful += 1 - else: - failed += 1 - - print(f"\nπŸ“Š Conversion Summary:") - print(f" βœ… Successful: {successful}") - print(f" ❌ Failed: {failed}") - print(f" πŸ“ˆ Success rate: {successful}/{len(datasets_to_convert)} ({100*successful/len(datasets_to_convert):.1f}%)") - - -def main(): - parser = argparse.ArgumentParser(description="Convert local LeRobot datasets to v2.1 format") - parser.add_argument("--dataset", type=str, help="Single dataset path to convert") - parser.add_argument("--base-dirs", nargs="+", - default=["/fsx/dana_aubakirova/vla/community_dataset_v1"], - help="Base directories to scan for datasets") - parser.add_argument("--max-datasets", type=int, help="Maximum number of datasets to convert") - parser.add_argument("--num-workers", type=int, default=4, help="Number of workers for stats computation") - parser.add_argument("--all", action="store_true", help="Convert all datasets in base directories") - - args = parser.parse_args() - - if args.dataset: - # Convert single dataset - success = convert_local_dataset(args.dataset, num_workers=args.num_workers) - if success: - print(f"\nπŸŽ‰ Successfully converted: {args.dataset}") - else: - print(f"\nπŸ’₯ Failed to convert: {args.dataset}") - sys.exit(1) - - elif args.all: - # Convert all datasets - convert_multiple_datasets( - args.base_dirs, - max_datasets=args.max_datasets, - num_workers=args.num_workers - ) - - else: - parser.print_help() - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/examples/convert_libero.py b/examples/convert_libero.py deleted file mode 100644 index 7bfc50eae..000000000 --- a/examples/convert_libero.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -import pyarrow.parquet as pq -import tempfile -import shutil - -# Root directory of converted data -root_dir = "/raid/jade/libero_converted" - -# No renaming -rename_map = { - -} - -# Hugging Face features metadata (constant across all files) -HF_METADATA = { - b"huggingface": b'{"info": {"features": {"observation.images.image": {"_type": "Image"}, "observation.images.image2": {"_type": "Image"}, "state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, "actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, "timestamp": {"dtype": "float32", "_type": "Value"}, "frame_index": {"dtype": "int64", "_type": "Value"}, "episode_index": {"dtype": "int64", "_type": "Value"}, "index": {"dtype": "int64", "_type": "Value"}, "task_index": {"dtype": "int64", "_type": "Value"}}}}' -} - -def patch_parquet(parquet_path, hf_metadata): - try: - table = pq.read_table(parquet_path) - - # Merge metadata - new_meta = dict(table.schema.metadata or {}) - new_meta.update(hf_metadata) - - # Apply metadata to table - table = table.replace_schema_metadata(new_meta) - - # Write safely via temp file - tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") - os.close(tmp_fd) - pq.write_table(table, tmp_path) - shutil.move(tmp_path, parquet_path) - - print(f"βœ… Patched: {parquet_path}") - return True - except Exception as e: - print(f"❌ Failed on {parquet_path}: {e}") - return False - -# Walk through all chunk dirs and patch parquet files -for dirpath, _, filenames in os.walk(root_dir): - for fname in filenames: - if fname.endswith(".parquet"): - fpath = os.path.join(dirpath, fname) - patch_parquet(fpath, HF_METADATA)#!/usr/bin/env python3 - -#!/usr/bin/env python3 -import os -import pyarrow.parquet as pq -import tempfile -import shutil - -# Explicit list of files to patch -FILES_TO_PATCH = [ - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", -] - -# Optional renaming map (fill in as needed) -rename_map = { - # "old_column_name": "new_column_name", - "image": "observation.images.image", - "image2": "observation.images.image2", - "actions": "action", -} - -# Hugging Face features metadata (constant across all files) -HF_METADATA = { - b"huggingface": b'{"info": {"features": {' - b'"observation.images.image": {"_type": "Image"}, ' - b'"observation.images.image2": {"_type": "Image"}, ' - b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' - b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' - b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' - b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' - b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' - b'"index": {"dtype": "int64", "_type": "Value"}, ' - b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' -} - -def patch_parquet(parquet_path, hf_metadata, rename_map): - try: - # Load parquet table - table = pq.read_table(parquet_path) - - # If renaming is needed - if rename_map: - schema = table.schema - new_names = [ - rename_map.get(name, name) for name in schema.names - ] - table = table.rename_columns(new_names) - - # Merge schema metadata - new_meta = dict(table.schema.metadata or {}) - new_meta.update(hf_metadata) - - # Replace metadata in table - table = table.replace_schema_metadata(new_meta) - - # Write safely via temp file - tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") - os.close(tmp_fd) - pq.write_table(table, tmp_path) - - # Replace original file - shutil.move(tmp_path, parquet_path) - - print(f"βœ… Patched: {parquet_path}") - return True - except Exception as e: - print(f"❌ Failed on {parquet_path}: {e}") - return False - - -if __name__ == "__main__": - for fpath in FILES_TO_PATCH: - if os.path.exists(fpath): - patch_parquet(fpath, HF_METADATA, rename_map) - else: - print(f"⚠️ File not found: {fpath}") diff --git a/examples/evaluate_libero.py b/examples/evaluate_libero.py deleted file mode 100644 index bf99994b6..000000000 --- a/examples/evaluate_libero.py +++ /dev/null @@ -1,255 +0,0 @@ -""" -This script demonstrates how to evaluate a pretrained smolVLA policy on the LIBERO benchmark. -""" - -import collections -import dataclasses -import logging -import math -import pathlib - -import cv2 -import draccus -import imageio -import numpy as np -import torch -from libero.libero import benchmark, get_libero_path -from libero.libero.envs import OffScreenRenderEnv -from tqdm import tqdm - -from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy -from lerobot.policies.pi0.modeling_pi0 import PI0Policy - -LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0] -LIBERO_ENV_RESOLUTION = 256 # resolution used to render training data - -@dataclasses.dataclass -class Args: - """ - Evaluation arguments for smolVLA on LIBERO. - """ - - # --- Hugging Face arguments --- - policy_path: str = "lerobot/smolvla_base" - """Path to the pretrained policy on the Hugging Face Hub or local directory.""" - - # --- LIBERO environment-specific parameters --- - task_suite_name: str = "libero_spatial" - """Task suite. Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90""" - num_steps_wait: int = 10 - """Number of steps to wait for objects to stabilize in sim.""" - num_trials_per_task: int = 50 - """Number of rollouts per task.""" - - # --- Evaluation arguments --- - video_out_path: str = "data/libero/videos" - """Path to save videos.""" - device: str = "cuda" - """Device to use for evaluation.""" - - seed: int = 7 - """Random Seed (for reproducibility)""" - - -@draccus.wrap() -def eval_libero(args: Args) -> None: - # Set random seed - torch.manual_seed(args.seed) - np.random.seed(args.seed) - - # --- Load Policy --- - policy = SmolVLAPolicy.from_pretrained(args.policy_path) - policy.to(args.device) - policy.eval() - - # --- Initialize LIBERO task suite --- - benchmark_dict = benchmark.get_benchmark_dict() - try: - task_suite = benchmark_dict[args.task_suite_name]() - except KeyError: - raise ValueError( - f"Unknown task suite: {args.task_suite_name}. " - f"Available options are: {list(benchmark_dict.keys())}" - ) - num_tasks_in_suite = task_suite.n_tasks - logging.info(f"Task suite: {args.task_suite_name}") - - pathlib.Path(args.video_out_path).mkdir(parents=True, exist_ok=True) - - if args.task_suite_name == "libero_spatial": - max_steps = 220 # longest training demo has 193 steps - elif args.task_suite_name == "libero_object": - max_steps = 280 # longest training demo has 254 steps - elif args.task_suite_name == "libero_goal": - max_steps = 300 # longest training demo has 270 steps - elif args.task_suite_name == "libero_10": - max_steps = 520 # longest training demo has 505 steps - elif args.task_suite_name == "libero_90": - max_steps = 400 # longest training demo has 373 steps - else: - # Fallback for custom task suites - max_steps = 520 - - # --- Evaluation Loop --- - total_episodes, total_successes = 0, 0 - for task_id in tqdm(range(num_tasks_in_suite), desc="Tasks"): - # Get task - task = task_suite.get_task(task_id) - - # Get default LIBERO initial states - initial_states = task_suite.get_task_init_states(task_id) - - # Initialize LIBERO environment and task description - env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args.seed) - - # Start episodes - task_episodes, task_successes = 0, 0 - for episode_idx in tqdm( - range(min(args.num_trials_per_task, len(initial_states))), - desc=f"Task {task_id}: {task.language}", - leave=False, - ): - logging.info(f"\nTask: {task_description}") - - # Reset environment and policy - env.reset() - policy.reset() - - # Set initial states - obs = env.set_init_state(initial_states[episode_idx]) - - # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects - # and we need to wait for them to fall - for _ in range(args.num_steps_wait): - obs, _, _, _ = env.step(LIBERO_DUMMY_ACTION) - - # Setup - t = 0 - frames = [] - done = False - - # Add initial frame - agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) - # frames.append(agentview_image) - # import ipdb; ipdb.set_trace() - logging.info(f"Starting episode {task_episodes+1}...") - while t < max_steps: - try: - # Get preprocessed image - # IMPORTANT: rotate 180 degrees to match train preprocessing - wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1]) - agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1]) - frames.append(agentview_image) - - # Prepare observations dict - state = np.concatenate( - ( - obs["robot0_eef_pos"], - _quat2axisangle(obs["robot0_eef_quat"]), - obs["robot0_gripper_qpos"], - ) - ) - observation = { - "observation.images.image": torch.from_numpy(agentview_image / 255.0) - .permute(2, 0, 1) - .to(torch.float32) - .to(args.device).unsqueeze(0), - "observation.images.image2": torch.from_numpy(wrist_img / 255.0) - .permute(2, 0, 1) - .to(torch.float32) - .to(args.device).unsqueeze(0), - "observation.state": torch.from_numpy(state).to(torch.float32).to(args.device).unsqueeze(0), - "task": task_description, - } - - # Query model to get action - with torch.inference_mode(): - action_tensor = policy.select_action(observation) - action = action_tensor.cpu().numpy()[0] - action[-1] = 1 - action[-1] - - # Execute action in environment - obs, _, done, _ = env.step(action) - if done: - task_successes += 1 - total_successes += 1 - break - t += 1 - - except Exception as e: - logging.error(f"Caught exception: {e}") - break - - task_episodes += 1 - total_episodes += 1 - - # Save a replay video of the episode - suffix = "success" if done else "failure" - task_segment = task_description.replace(" ", "_").replace("/", "_") - video_path = ( - pathlib.Path(args.video_out_path) / f"rollout_task_{task_id}_episode_{episode_idx}_{task_segment}_{suffix}.mp4" - ) - fps = 30 - writer = imageio.get_writer(video_path, fps=fps) - - for image in frames: - writer.append_data(image) - writer.close() - logging.info(f"Saved video to {video_path}") - - # Log current results - logging.info(f"Success: {done}") - if total_episodes > 0: - logging.info(f"# episodes completed so far: {total_episodes}") - logging.info(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)") - - # Log final results for the task - if task_episodes > 0: - logging.info(f"Task {task_id} success rate: {float(task_successes) / float(task_episodes):.2f}") - if total_episodes > 0: - logging.info(f"Cumulative success rate: {float(total_successes) / float(total_episodes):.2f}") - - logging.info("--- Evaluation finished ---") - if total_episodes > 0: - logging.info(f"Total success rate: {float(total_successes) / float(total_episodes):.2f}") - logging.info(f"Total episodes: {total_episodes}") - logging.info(f"Total successes: {total_successes}") - cv2.destroyAllWindows() - - -def _get_libero_env(task, resolution, seed): - """Initializes and returns the LIBERO environment, along with the task description.""" - task_description = task.language - task_bddl_file = pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file - env_args = { - "bddl_file_name": str(task_bddl_file), - "camera_heights": resolution, - "camera_widths": resolution, - } - env = OffScreenRenderEnv(**env_args) - env.seed(seed) # IMPORTANT: seed seems to affect object positions even when using fixed initial state - return env, task_description - - -def _quat2axisangle(quat): - """ - Copied from robosuite: - https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 - """ - # clip quaternion - if quat[3] > 1.0: - quat[3] = 1.0 - elif quat[3] < -1.0: - quat[3] = -1.0 - - den = np.sqrt(1.0 - quat[3] * quat[3]) - if math.isclose(den, 0.0): - # This is (close to) a zero degree rotation, immediately return - return np.zeros(3) - - return (quat[:3] * 2.0 * math.acos(quat[3])) / den - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - eval_libero() \ No newline at end of file diff --git a/examples/requirements.in b/examples/requirements.in deleted file mode 100644 index 25664608a..000000000 --- a/examples/requirements.in +++ /dev/null @@ -1,8 +0,0 @@ -imageio[ffmpeg] -numpy==1.22.4 -tqdm -tyro -PyYaml -opencv-python==4.6.0.66 -robosuite==1.4.1 -matplotlib==3.5.3 \ No newline at end of file diff --git a/examples/script2.py b/examples/script2.py deleted file mode 100644 index cbd4da913..000000000 --- a/examples/script2.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -import os -import pyarrow.parquet as pq -import tempfile -import shutil - -FILES_TO_PATCH = [ - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet", - "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet", -] - -# Column renaming map -rename_map = { - "wrist_image": "observation.images.image2", - "actions": "action", -} - -# Hugging Face metadata -HF_METADATA = { - b"huggingface": b'{"info": {"features": {' - b'"observation.images.image": {"_type": "Image"}, ' - b'"observation.images.image2": {"_type": "Image"}, ' - b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' - b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' - b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' - b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' - b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' - b'"index": {"dtype": "int64", "_type": "Value"}, ' - b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' -} - -def patch_parquet(parquet_path, hf_metadata, rename_map): - try: - table = pq.read_table(parquet_path) - - # Apply column renames if needed - if rename_map: - schema = table.schema - new_names = [rename_map.get(name, name) for name in schema.names] - table = table.rename_columns(new_names) - - # Merge schema metadata - new_meta = dict(table.schema.metadata or {}) - new_meta.update(hf_metadata) - - # Replace metadata - table = table.replace_schema_metadata(new_meta) - - # Write via temp file - tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") - os.close(tmp_fd) - pq.write_table(table, tmp_path) - - shutil.move(tmp_path, parquet_path) - print(f"βœ… Patched: {parquet_path}") - return True - except Exception as e: - print(f"❌ Failed on {parquet_path}: {e}") - return False - - -if __name__ == "__main__": - for fpath in FILES_TO_PATCH: - if os.path.exists(fpath): - patch_parquet(fpath, HF_METADATA, rename_map) - else: - print(f"⚠️ File not found: {fpath}") diff --git a/examples/script3.py b/examples/script3.py deleted file mode 100644 index 7b4d7957a..000000000 --- a/examples/script3.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -import os -import pyarrow.parquet as pq -import tempfile -import shutil - -# Root directory containing all parquet files -ROOT_DIR = "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data" - -# Column renaming map (normalize schema to what training expects) -rename_map = { - "state": "observation.state", -} - -# Hugging Face metadata (aligned with expected feature names) -HF_METADATA = { - b"huggingface": b'{"info": {"features": {' - b'"observation.images.image": {"_type": "Image"}, ' - b'"observation.images.image2": {"_type": "Image"}, ' - b'"observation.state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, ' - b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, ' - b'"timestamp": {"dtype": "float32", "_type": "Value"}, ' - b'"frame_index": {"dtype": "int64", "_type": "Value"}, ' - b'"episode_index": {"dtype": "int64", "_type": "Value"}, ' - b'"index": {"dtype": "int64", "_type": "Value"}, ' - b'"task_index": {"dtype": "int64", "_type": "Value"}}}}' -} - -def patch_parquet(parquet_path, hf_metadata, rename_map): - try: - # Read the parquet table - table = pq.read_table(parquet_path) - - # Apply renames if necessary - if rename_map: - new_names = [rename_map.get(name, name) for name in table.schema.names] - if new_names != table.schema.names: - table = table.rename_columns(new_names) - - # Update metadata - new_meta = dict(table.schema.metadata or {}) - new_meta.update(hf_metadata) - table = table.replace_schema_metadata(new_meta) - - # Write to temp file then atomically move back - tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet") - os.close(tmp_fd) - pq.write_table(table, tmp_path) - shutil.move(tmp_path, parquet_path) - - # Debug print - print(f"βœ… Patched: {parquet_path}") - print(" Columns:", table.schema.names) - return True - except Exception as e: - print(f"❌ Failed on {parquet_path}: {e}") - return False - -if __name__ == "__main__": - for dirpath, _, filenames in os.walk(ROOT_DIR): - for fname in filenames: - if fname.endswith(".parquet"): - fpath = os.path.join(dirpath, fname) - patch_parquet(fpath, HF_METADATA, rename_map) diff --git a/examples/script4.py b/examples/script4.py deleted file mode 100644 index 2eed60a94..000000000 --- a/examples/script4.py +++ /dev/null @@ -1,3 +0,0 @@ -from huggingface_hub import HfApi -hub_api = HfApi() -hub_api.create_tag("HuggingFaceVLA/libero", tag="v2.1", repo_type="dataset") diff --git a/log_text.txt b/log_text.txt deleted file mode 100644 index 6676df0eb..000000000 --- a/log_text.txt +++ /dev/null @@ -1,1765 +0,0 @@ - self.vlm_with_expert = SmolVLMWithExpertModel( - File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/smolvlm_with_expert.py", line 88, in __init__ - self.processor = AutoProcessor.from_pretrained(model_id) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/models/auto/processing -_auto.py", line 300, in from_pretrained - config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/processing_utils.py", -line 944, in get_processor_dict - resolved_raw_chat_template_file = cached_file( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 32 -1, in cached_file - file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 47 -8, in cached_files - hf_hub_download( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p -y", line 114, in _inner_fn - return fn(*args, **kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", -line 1010, in hf_hub_download - return _hf_hub_download_to_cache_dir( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", -line 1073, in _hf_hub_download_to_cache_dir - (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_err -or( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", -line 1546, in _get_metadata_or_catch_error - metadata = get_hf_file_metadata( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p -y", line 114, in _inner_fn - return fn(*args, **kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", -line 1463, in get_hf_file_metadata - r = _request_wrapper( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", -line 286, in _request_wrapper - response = _request_wrapper( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py", -line 309, in _request_wrapper - response = http_backoff(method=method, url=url, **params, retry_on_exceptions=(), retry_on_status_codes=(429,)) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li -ne 310, in http_backoff - response = session.request(method=method, url=url, **kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 589, in - request - resp = self.send(prep, **send_kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 703, in - send - r = adapter.send(request, **kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li -ne 96, in send - return super().send(request, *args, **kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/adapters.py", line 644, in - send - resp = conn.urlopen( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 78 -7, in urlopen - response = self._make_request( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 53 -4, in _make_request - response = conn.getresponse() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connection.py", line 565, i -n getresponse - httplib_response = super().getresponse() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 1375, in getresponse - response.begin() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 318, in begin - version, status, reason = self._read_status() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 279, in _read_status - line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/socket.py", line 717, in readinto - return self._sock.recv_into(b) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1307, in recv_into - return self.read(nbytes, buffer) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1163, in read - return self._sslobj.read(len, buffer) -KeyboardInterrupt -clea -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 15:50:52 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 15:50:52 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 15:50:52 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 15:50:52 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 15:50:52 ts/train.py:153 Creating dataset -WARNING 2025-09-09 15:50:52 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 15:50:52 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 67057.8 -5it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5343.9 -4it/s] -INFO 2025-09-09 15:50:57 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 47393.2 -7it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3797.4 -7it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 44384.1 -7it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6533.1 -8it/s] -Reducing the number of VLM layers to 16 ... -INFO 2025-09-09 15:51:30 ts/train.py:168 Creating optimizer and scheduler -INFO 2025-09-09 15:51:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ -smolvla_lr1e-4bs32steps100000 -INFO 2025-09-09 15:51:30 ts/train.py:182 cfg.env.task='libero_spatial' -INFO 2025-09-09 15:51:30 ts/train.py:183 cfg.steps=100000 (100K) -INFO 2025-09-09 15:51:30 ts/train.py:184 dataset.num_frames=273465 (273K) -INFO 2025-09-09 15:51:30 ts/train.py:185 dataset.num_episodes=1693 -INFO 2025-09-09 15:51:30 ts/train.py:186 num_learnable_params=49103712 (49M) -INFO 2025-09-09 15:51:30 ts/train.py:187 num_total_params=399268924 (399M) -INFO 2025-09-09 15:51:30 ts/train.py:225 Start offline training on a fixed dataset -> /home/jade_choghari/lerobot/src/lerobot/scripts/train.py(230)train() --> train_tracker.dataloading_s = time.perf_counter() - start_time -(Pdb) batch.keys() -dict_keys(['image', 'wrist_image', 'state', 'actions', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_i -ndex', 'task']) -(Pdb) policy.config.input_features -{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type -=, shape=(3, 256, 256))} -(Pdb) quit() -Traceback (most recent call last): - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in - main() - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main - train() - File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner - response = fn(cfg, *args, **kwargs) - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train - train_tracker.dataloading_s = time.perf_counter() - start_time - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train - train_tracker.dataloading_s = time.perf_counter() - start_time - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch - return self.dispatch_line(frame) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line - if self.quitting: raise BdbQuit -bdb.BdbQuit -clear -^[[A(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 15:53:49 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 15:53:49 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 15:53:49 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 15:53:49 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 15:53:49 ts/train.py:153 Creating dataset -WARNING 2025-09-09 15:53:49 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 15:53:49 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 34701.4 -4it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5495.3 -7it/s] -INFO 2025-09-09 15:53:55 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 41943.0 -4it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5500.7 -3it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 2361.6 -6it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5041.2 -3it/s] -Reducing the number of VLM layers to 16 ... -> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() --> assert isinstance(policy, nn.Module) -(Pdb) features -{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type -=, shape=(3, 256, 256)), 'actions': PolicyFeature(type=, - shape=(7,))} -(Pdb) ds_meta.features -{'image': {'dtype': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'wrist_image': {'dtyp -e': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'state': {'dtype': 'float32', 'shape' -: (8,), 'names': ['state']}, 'actions': {'dtype': 'float32', 'shape': (7,), 'names': ['actions']}, 'timestamp': {'dt -ype': 'float32', 'shape': (1,), 'names': None}, 'frame_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'ep -isode_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'index': {'dtype': 'int64', 'shape': (1,), 'names': -None}, 'task_index': {'dtype': 'int64', 'shape': (1,), 'names': None}} -(Pdb) quit() - -Traceback (most recent call last): - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in - main() - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main - train() - File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner - response = fn(cfg, *args, **kwargs) - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train - policy = make_policy( - File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy - assert isinstance(policy, nn.Module) - File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy - assert isinstance(policy, nn.Module) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch - return self.dispatch_line(frame) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line - if self.quitting: raise BdbQuit -bdb.BdbQuit -clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 15:56:35 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 15:56:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 15:56:35 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 15:56:35 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 15:56:35 ts/train.py:153 Creating dataset -WARNING 2025-09-09 15:56:35 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 15:56:35 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 78132.9 -5it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 4716.0 -3it/s] -INFO 2025-09-09 15:56:40 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5259.3 -2it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3477.8 -6it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 45343.8 -3it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 5551.6 -9it/s] -Reducing the number of VLM layers to 16 ... -> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy() --> assert isinstance(policy, nn.Module) -(Pdb) features -{'image': PolicyFeature(type=, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type -=, shape=(3, 256, 256)), 'state': PolicyFeature(type=, sha -pe=(8,)), 'actions': PolicyFeature(type=, shape=(7,))} -(Pdb) quit() -Traceback (most recent call last): - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main - - File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner - response = fn(cfg, *args, **kwargs) - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train - policy = make_policy( - File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy - # policy = torch.compile(policy, mode="reduce-overhead") - File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy - # policy = torch.compile(policy, mode="reduce-overhead") - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch - return self.dispatch_line(frame) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line - if self.quitting: raise BdbQuit -bdb.BdbQuit -clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 15:58:35 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 15:58:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 15:58:35 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 15:58:35 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 15:58:35 ts/train.py:153 Creating dataset -WARNING 2025-09-09 15:58:35 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 15:58:35 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 27666.4 -6it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5305.7 -0it/s] -INFO 2025-09-09 15:58:41 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 44384.1 -7it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3192.0 -1it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 44620.2 -6it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 42799.0 -2it/s] -Reducing the number of VLM layers to 16 ... -INFO 2025-09-09 15:59:13 ts/train.py:168 Creating optimizer and scheduler -INFO 2025-09-09 15:59:13 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ -smolvla_lr1e-4bs32steps100000 -INFO 2025-09-09 15:59:13 ts/train.py:182 cfg.env.task='libero_spatial' -INFO 2025-09-09 15:59:13 ts/train.py:183 cfg.steps=100000 (100K) -INFO 2025-09-09 15:59:13 ts/train.py:184 dataset.num_frames=273465 (273K) -INFO 2025-09-09 15:59:13 ts/train.py:185 dataset.num_episodes=1693 -INFO 2025-09-09 15:59:13 ts/train.py:186 num_learnable_params=49103712 (49M) -INFO 2025-09-09 15:59:13 ts/train.py:187 num_total_params=399268940 (399M) -INFO 2025-09-09 15:59:13 ts/train.py:225 Start offline training on a fixed dataset -Traceback (most recent call last): - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in - main() - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main - train() - File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner - response = fn(cfg, *args, **kwargs) - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train - train_tracker, output_dict = update_policy( - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy - loss, output_dict = policy.forward(batch) - File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 458, in forward - actions = self.prepare_action(batch) - File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 580, in prepare_action - actions = pad_vector(batch[ACTION], self.config.max_action_dim) -KeyError: 'action' -Exception in thread Thread-3 (_pin_memory_loop): -Traceback (most recent call last): - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 1016, in _bootstrap_inner - self.run() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 953, in run - self._target(*self._args, **self._kwargs) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. -py", line 61, in _pin_memory_loop - do_one_step() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory. -py", line 37, in do_one_step - r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/queues.py", line 122, in get - return _ForkingPickler.loads(res) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/multiprocessing/reductions.py -", line 541, in rebuild_storage_fd - fd = df.detach() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in -detach - with _resource_sharer.get_connection(self._id) as conn: - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in -get_connection - c = Client(address, authkey=process.current_process().authkey) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 508, in Clie -nt - answer_challenge(c, authkey) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 752, in answ -er_challenge - message = connection.recv_bytes(256) # reject large message - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 216, in recv -_bytes - buf = self._recv_bytes(maxlength) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 414, in _rec -v_bytes - buf = self._recv(4) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 379, in _rec -v - chunk = read(handle, remaining) -ConnectionResetError: [Errno 104] Connection reset by peer -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 15:59:53 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 15:59:53 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 15:59:53 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 15:59:53 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 15:59:53 ts/train.py:153 Creating dataset -WARNING 2025-09-09 15:59:53 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 15:59:53 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 72147.3 -3it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 5076.7 -1it/s] -INFO 2025-09-09 15:59:58 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6096.3 -7it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 4348.6 -8it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 46091.2 -5it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3225.1 -5it/s] -Reducing the number of VLM layers to 16 ... -INFO 2025-09-09 16:00:31 ts/train.py:168 Creating optimizer and scheduler -INFO 2025-09-09 16:00:31 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ -smolvla_lr1e-4bs32steps100000 -INFO 2025-09-09 16:00:31 ts/train.py:182 cfg.env.task='libero_spatial' -INFO 2025-09-09 16:00:31 ts/train.py:183 cfg.steps=100000 (100K) -INFO 2025-09-09 16:00:31 ts/train.py:184 dataset.num_frames=273465 (273K) -INFO 2025-09-09 16:00:31 ts/train.py:185 dataset.num_episodes=1693 -INFO 2025-09-09 16:00:31 ts/train.py:186 num_learnable_params=49103712 (49M) -INFO 2025-09-09 16:00:31 ts/train.py:187 num_total_params=399268940 (399M) -INFO 2025-09-09 16:00:31 ts/train.py:225 Start offline training on a fixed dataset -Traceback (most recent call last): - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in - main() - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main - train() - File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner - response = fn(cfg, *args, **kwargs) - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train - train_tracker, output_dict = update_policy( - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy - loss, output_dict = policy.forward(batch) - File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 461, in forward - losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) - File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 850, in forward - att_2d_masks = make_att_2d_masks(pad_masks, att_masks) - File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 226, in make_att_2d_mask -s - att_2d_masks = att_2d_masks & pad_2d_masks -RuntimeError: The size of tensor a (199) must match the size of tensor b (181) at non-singleton dimension 2 -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 16:10:03 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 16:10:03 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 16:10:03 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 16:10:03 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 16:10:03 ts/train.py:153 Creating dataset -WARNING 2025-09-09 16:10:03 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 16:10:03 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 54574.89it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 7567.63it/s] -INFO 2025-09-09 16:10:09 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 40721.40it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 7516.67it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 3158.36it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6775.94it/s] -Reducing the number of VLM layers to 16 ... -INFO 2025-09-09 16:10:41 ts/train.py:168 Creating optimizer and scheduler -INFO 2025-09-09 16:10:41 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ -smolvla_lr1e-4bs32steps100000 -INFO 2025-09-09 16:10:41 ts/train.py:182 cfg.env.task='libero_spatial' -INFO 2025-09-09 16:10:41 ts/train.py:183 cfg.steps=100000 (100K) -INFO 2025-09-09 16:10:41 ts/train.py:184 dataset.num_frames=273465 (273K) -INFO 2025-09-09 16:10:41 ts/train.py:185 dataset.num_episodes=1693 -INFO 2025-09-09 16:10:41 ts/train.py:186 num_learnable_params=49103712 (49M) -INFO 2025-09-09 16:10:41 ts/train.py:187 num_total_params=399268940 (399M) -INFO 2025-09-09 16:10:41 ts/train.py:225 Start offline training on a fixed dataset -Traceback (most recent call last): - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in - main() - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main - train() - File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner - response = fn(cfg, *args, **kwargs) - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train - train_tracker, output_dict = update_policy( - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy - grad_scaler.unscale_(optimizer) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 -, in unscale_ - optimizer_state["found_inf_per_device"] = self._unscale_grads_( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 -, in _unscale_grads_ - torch._amp_foreach_non_finite_check_and_unscale_( -RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 16:12:28 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 16:12:28 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 16:12:28 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 16:12:28 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 16:12:28 ts/train.py:153 Creating dataset -WARNING 2025-09-09 16:12:28 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 16:12:28 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 87666.13it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 4223.20it/s] -INFO 2025-09-09 16:12:34 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 43690.67it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 4871.43it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6512.89it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 43018.50it/s] -Reducing the number of VLM layers to 16 ... -INFO 2025-09-09 16:13:06 ts/train.py:168 Creating optimizer and scheduler -INFO 2025-09-09 16:13:06 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ -smolvla_lr1e-4bs32steps100000 -INFO 2025-09-09 16:13:06 ts/train.py:182 cfg.env.task='libero_spatial' -INFO 2025-09-09 16:13:06 ts/train.py:183 cfg.steps=100000 (100K) -INFO 2025-09-09 16:13:06 ts/train.py:184 dataset.num_frames=273465 (273K) -INFO 2025-09-09 16:13:06 ts/train.py:185 dataset.num_episodes=1693 -INFO 2025-09-09 16:13:06 ts/train.py:186 num_learnable_params=49103712 (49M) -INFO 2025-09-09 16:13:06 ts/train.py:187 num_total_params=399268940 (399M) -INFO 2025-09-09 16:13:06 ts/train.py:225 Start offline training on a fixed dataset -Traceback (most recent call last): - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in - main() - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main - train() - File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner - response = fn(cfg, *args, **kwargs) - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train - train_tracker, output_dict = update_policy( - File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy - grad_scaler.unscale_(optimizer) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342 -, in unscale_ - optimizer_state["found_inf_per_device"] = self._unscale_grads_( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283 -, in _unscale_grads_ - torch._amp_foreach_non_finite_check_and_unscale_( -RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16' -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -INFO 2025-09-09 16:13:51 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-09 16:13:51 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-09 16:13:51 ts/train.py:137 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'physical-intelligence/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.5, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': 0, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': False, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-09 16:13:51 ts/train.py:143 Logs will be saved locally. -INFO 2025-09-09 16:13:51 ts/train.py:153 Creating dataset -WARNING 2025-09-09 16:13:51 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -WARNING 2025-09-09 16:13:51 ts/utils.py:302 -The dataset you requested (physical-intelligence/libero) is in 2.0 format. -While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global -stats instead of per-episode stats. Update your dataset stats to the new format using this command: -``` -python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero -``` - -If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb) -or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose). - -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 82981.28it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 70/70 [00:00<00:00, 4687.94it/s] -INFO 2025-09-09 16:13:57 ts/train.py:163 Creating policy -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 21345.06it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 4226.00it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 2966.27it/s] -Fetching 2 files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 6497.76it/s] -Reducing the number of VLM layers to 16 ... -INFO 2025-09-09 16:14:30 ts/train.py:168 Creating optimizer and scheduler -INFO 2025-09-09 16:14:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_ -smolvla_lr1e-4bs32steps100000 -INFO 2025-09-09 16:14:30 ts/train.py:182 cfg.env.task='libero_spatial' -INFO 2025-09-09 16:14:30 ts/train.py:183 cfg.steps=100000 (100K) -INFO 2025-09-09 16:14:30 ts/train.py:184 dataset.num_frames=273465 (273K) -INFO 2025-09-09 16:14:30 ts/train.py:185 dataset.num_episodes=1693 -INFO 2025-09-09 16:14:30 ts/train.py:186 num_learnable_params=49103712 (49M) -INFO 2025-09-09 16:14:30 ts/train.py:187 num_total_params=399268940 (399M) -INFO 2025-09-09 16:14:30 ts/train.py:225 Start offline training on a fixed dataset -INFO 2025-09-09 16:16:20 ts/train.py:255 step:200 smpl:6K ep:40 epch:0.02 loss:1.244 grdn:2.492 lr:1.0e-05 updt_s:0. -536 data_s:0.007 -INFO 2025-09-09 16:17:56 ts/train.py:255 step:400 smpl:13K ep:79 epch:0.05 loss:0.685 grdn:4.262 lr:3.0e-05 updt_s:0 -.481 data_s:0.000 -INFO 2025-09-09 16:19:33 ts/train.py:255 step:600 smpl:19K ep:119 epch:0.07 loss:0.364 grdn:4.849 lr:5.0e-05 updt_s: -0.482 data_s:0.000 -INFO 2025-09-09 16:21:10 ts/train.py:255 step:800 smpl:26K ep:158 epch:0.09 loss:0.239 grdn:4.024 lr:7.0e-05 updt_s: -0.481 data_s:0.000 -INFO 2025-09-09 16:22:46 ts/train.py:255 step:1K smpl:32K ep:198 epch:0.12 loss:0.197 grdn:3.267 lr:9.0e-05 updt_s:0 -.478 data_s:0.000 -INFO 2025-09-09 16:24:22 ts/train.py:255 step:1K smpl:38K ep:238 epch:0.14 loss:0.173 grdn:2.319 lr:1.0e-04 updt_s:0 -.481 data_s:0.000 -INFO 2025-09-09 16:25:59 ts/train.py:255 step:1K smpl:45K ep:277 epch:0.16 loss:0.153 grdn:1.741 lr:1.0e-04 updt_s:0 -.483 data_s:0.000 -INFO 2025-09-09 16:27:36 ts/train.py:255 step:2K smpl:51K ep:317 epch:0.19 loss:0.135 grdn:1.354 lr:9.9e-05 updt_s:0 -.483 data_s:0.000 -INFO 2025-09-09 16:29:14 ts/train.py:255 step:2K smpl:58K ep:357 epch:0.21 loss:0.126 grdn:1.177 lr:9.9e-05 updt_s:0 -.484 data_s:0.000 - diff --git a/src/lerobot/envs/libero copy.py b/src/lerobot/envs/libero copy.py deleted file mode 100644 index 83ccd2fb9..000000000 --- a/src/lerobot/envs/libero copy.py +++ /dev/null @@ -1,326 +0,0 @@ -import math -import os -from collections import defaultdict -from collections.abc import Callable -from itertools import chain -from typing import Any - -import gymnasium as gym -import numpy as np -import torch -from gymnasium import spaces -from libero.libero import benchmark, get_libero_path -from libero.libero.envs import OffScreenRenderEnv - - -def create_libero_envs( - task: str, - n_envs: int, - gym_kwargs: dict[str, Any] = None, - camera_name: str = "agentview_image,robot0_eye_in_hand_image", - init_states: bool = True, - env_cls: Callable = None, - multitask_eval: bool = True, -) -> dict[str, dict[str, Any]]: - """ - Here n_envs is per task and equal to the number of rollouts. - Returns: - dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs. - """ - print("num envs", n_envs) - print("multitask_eval", multitask_eval) - print("gym_kwargs", gym_kwargs) - if gym_kwargs is None: - gym_kwargs = {} - - if not multitask_eval: - benchmark_dict = benchmark.get_benchmark_dict() - task_suite = benchmark_dict[task]() # can also choose libero_spatial, libero_object, libero_10 etc. - tasks_id = list(range(len(task_suite.tasks))) - episode_indices = [0 for i in range(len(tasks_id))] - if len(tasks_id) == 1: - tasks_id = [tasks_id[0] for _ in range(n_envs)] - episode_indices = list(range(n_envs)) - elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0: - n_repeat = n_envs // len(tasks_id) - print("n_repeat", n_repeat) - episode_indices = [] - for _ in range(len(tasks_id)): - episode_indices.extend(list(range(n_repeat))) - tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id])) - elif n_envs < len(tasks_id): - tasks_id = tasks_id[:n_envs] - episode_indices = list(range(n_envs))[:n_envs] - print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}") - print(f"Creating Libero envs with task ids {tasks_id} from suite {task}") - assert n_envs == len(tasks_id), ( - f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}" - ) - return env_cls( - [ - lambda i=i: LiberoEnv( - task_suite=task_suite, - task_id=tasks_id[i], - task_suite_name=task, - camera_name=camera_name, - init_states=init_states, - episode_index=episode_indices[i], - **gym_kwargs, - ) - for i in range(n_envs) - ] - ) - else: - envs = defaultdict(dict) - benchmark_dict = benchmark.get_benchmark_dict() - task = task.split(",") - for _task in task: - task_suite = benchmark_dict[ - _task - ]() # can also choose libero_spatial, libero_object, libero_10 etc. - tasks_ids = list(range(len(task_suite.tasks))) - for tasks_id in tasks_ids: - episode_indices = list(range(n_envs)) - print( - f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}" - ) - envs_list = [ - ( - lambda i=i, - task_suite=task_suite, - tasks_id=tasks_id, - _task=_task, - episode_indices=episode_indices: LiberoEnv( - task_suite=task_suite, - task_id=tasks_id, - task_suite_name=_task, - camera_name=camera_name, - init_states=init_states, - episode_index=episode_indices[i], - **gym_kwargs, - ) - ) - for i in range(n_envs) - ] - envs[_task][tasks_id] = env_cls(envs_list) - return envs - - -def quat2axisangle(quat): - """ - Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 - - Converts quaternion to axis-angle format. - Returns a unit vector direction scaled by its angle in radians. - - Args: - quat (np.array): (x,y,z,w) vec4 float angles - - Returns: - np.array: (ax,ay,az) axis-angle exponential coordinates - """ - # clip quaternion - if quat[3] > 1.0: - quat[3] = 1.0 - elif quat[3] < -1.0: - quat[3] = -1.0 - - den = np.sqrt(1.0 - quat[3] * quat[3]) - if math.isclose(den, 0.0): - # This is (close to) a zero degree rotation, immediately return - return np.zeros(3) - - return (quat[:3] * 2.0 * math.acos(quat[3])) / den - - -def get_task_init_states(task_suite, i): - init_states_path = os.path.join( - get_libero_path("init_states"), - task_suite.tasks[i].problem_folder, - task_suite.tasks[i].init_states_file, - ) - init_states = torch.load(init_states_path, weights_only=False) # nosec B614 - return init_states - - -def get_libero_dummy_action(): - """Get dummy/no-op action, used to roll out the simulation while the robot does nothing.""" - return [0, 0, 0, 0, 0, 0, -1] - - -OBS_STATE_DIM = 8 -ACTION_DIM = 7 - - -class LiberoEnv(gym.Env): - metadata = {"render_modes": ["rgb_array"], "render_fps": 80} - - def __init__( - self, - task_suite, - task_id, - task_suite_name, - camera_name="agentview_image,robot0_eye_in_hand_image", - obs_type="pixels", - render_mode="rgb_array", - observation_width=256, - observation_height=256, - visualization_width=640, - visualization_height=480, - init_states=True, - episode_index=0, - ): - super().__init__() - self.task_id = task_id - self.obs_type = obs_type - self.render_mode = render_mode - self.observation_width = observation_width - self.observation_height = observation_height - self.visualization_width = visualization_width - self.visualization_height = visualization_height - self.init_states = init_states - self.camera_name = camera_name.split( - "," - ) # agentview_image (main) or robot0_eye_in_hand_image (wrist) - - # Map raw camera names to "image1" and "image2". - # The preprocessing step `preprocess_observation` will then prefix these with `.images.*`, - # following the LeRobot convention (e.g., `observation.images.image`, `observation.images.image2`). - # This ensures the policy consistently receives observations in the - # expected format regardless of the original camera naming. - self.camera_name_mapping = { - "agentview_image": "image", - "robot0_eye_in_hand_image": "image2", - } - - self.num_steps_wait = ( - 10 # Do nothing for the first few timesteps to wait for the simulator drops objects - ) - self.episode_index = episode_index - - self._env = self._make_envs_task(task_suite, self.task_id) - if task_suite_name == "libero_spatial": - max_steps = 220 # longest training demo has 193 steps - elif task_suite_name == "libero_object": - max_steps = 280 # longest training demo has 254 steps - elif task_suite_name == "libero_goal": - max_steps = 300 # longest training demo has 270 steps - elif task_suite_name == "libero_10": - max_steps = 520 # longest training demo has 505 steps - elif task_suite_name == "libero_90": - max_steps = 400 # longest training demo has 373 steps - self._max_episode_steps = max_steps - - images = {} - for cam in self.camera_name: - images[self.camera_name_mapping[cam]] = spaces.Box( - low=0, - high=255, - shape=(self.observation_height, self.observation_width, 3), - dtype=np.uint8, - ) - - if self.obs_type == "state": - raise NotImplementedError() - elif self.obs_type == "pixels": - self.observation_space = spaces.Dict( - { - "pixels": spaces.Dict(images), - } - ) - elif self.obs_type == "pixels_agent_pos": - self.observation_space = spaces.Dict( - { - "pixels": spaces.Dict(images), - "agent_pos": spaces.Box( - low=-1000.0, - high=1000.0, - shape=(OBS_STATE_DIM,), - dtype=np.float64, - ), - } - ) - - self.action_space = spaces.Box(low=-1, high=1, shape=(ACTION_DIM,), dtype=np.float32) - - def render(self): - raw_obs = self._env.env._get_observations() - image = self._format_raw_obs(raw_obs)["pixels"]["image"] - return image - - def _make_envs_task(self, task_suite, task_id: int = 0): - task = task_suite.get_task(task_id) - self.task = task.name - self.task_description = task.language - task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file) - - env_args = { - "bddl_file_name": task_bddl_file, - "camera_heights": self.observation_height, - "camera_widths": self.observation_width, - } - env = OffScreenRenderEnv(**env_args) - env.reset() - if self.init_states: - init_states = get_task_init_states( - task_suite, task_id - ) # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()? - init_state_id = self.episode_index # episode index - env.set_init_state(init_states[init_state_id]) - - return env - - def _format_raw_obs(self, raw_obs): - images = {} - for camera_name in self.camera_name: - image = raw_obs[camera_name] - image = image[::-1, ::-1] # rotate 180 degrees - images[self.camera_name_mapping[camera_name]] = image - state = np.concatenate( - ( - raw_obs["robot0_eef_pos"], - quat2axisangle(raw_obs["robot0_eef_quat"]), - raw_obs["robot0_gripper_qpos"], - ) - ) - agent_pos = state - if self.obs_type == "state": - raise NotImplementedError() - elif self.obs_type == "pixels": - obs = {"pixels": images.copy()} - elif self.obs_type == "pixels_agent_pos": - obs = { - "pixels": images.copy(), - "agent_pos": agent_pos, - } - return obs - - def reset(self, seed=None, **kwargs): - super().reset(seed=seed) - - self._env.seed(seed) - raw_obs = self._env.reset() - # Do nothing for the first few timesteps to wait for the simulator drops objects - for _ in range(self.num_steps_wait): - raw_obs, _, _, _ = self._env.step(get_libero_dummy_action()) - observation = self._format_raw_obs(raw_obs) - info = {"is_success": False} - return observation, info - - def step(self, action): - assert action.ndim == 1 - raw_obs, reward, done, info = self._env.step(action) - - is_success = self._env.check_success() - terminated = done or is_success - info["is_success"] = done # is_success - - observation = self._format_raw_obs(raw_obs) - if done: - self.reset() - print(self.task, self.task_id, done, is_success) - truncated = False - return observation, reward, terminated, truncated, info - - def close(self): - self._env.close() diff --git a/src/lerobot/envs/libero2.py b/src/lerobot/envs/libero2.py deleted file mode 100644 index 1e794072c..000000000 --- a/src/lerobot/envs/libero2.py +++ /dev/null @@ -1,308 +0,0 @@ -import math -import os -from collections import defaultdict -from itertools import chain -from typing import Any, Callable - -import gymnasium as gym -import numpy as np -import torch -from gymnasium import spaces -from libero.libero import benchmark, get_libero_path -from libero.libero.envs import OffScreenRenderEnv - - -OBS_IMAGE = "observation.image" -OBS_IMAGE_2 = "observation.image2" -def create_libero_envs( - task: str, - n_envs: int, - gym_kwargs: dict[str, Any] = None, - camera_name: str = "agentview_image,robot0_eye_in_hand_image", - init_states: bool = True, - env_cls: Callable = None, - multitask_eval: bool = True, -) -> dict[str, dict[str, Any]]: - """ - Here n_envs is per task and equal to the number of rollouts. - Returns: - dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs. - """ - if gym_kwargs is None: - gym_kwargs = {} - - if not multitask_eval: - benchmark_dict = benchmark.get_benchmark_dict() - task_suite = benchmark_dict[task]() # can also choose libero_spatial, libero_object, libero_10 etc. - tasks_id = list(range(len(task_suite.tasks))) - episode_indices = [0 for i in range(len(tasks_id))] - if len(tasks_id) == 1: - tasks_id = [tasks_id[0] for _ in range(n_envs)] - episode_indices = list(range(n_envs)) - elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0: - n_repeat = n_envs // len(tasks_id) - episode_indices = [] - for i in range(len(tasks_id)): - episode_indices.extend(list(range(n_repeat))) - tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id])) - elif n_envs < len(tasks_id): - tasks_id = tasks_id[:n_envs] - episode_indices = list(range(n_envs))[:n_envs] - print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}") - print(f"Creating Libero envs with task ids {tasks_id} from suite {task}") - assert n_envs == len( - tasks_id - ), f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}" - return env_cls( - [ - lambda i=i: LiberoEnv( - task_suite=task_suite, - task_id=tasks_id[i], - task_suite_name=task, - camera_name=camera_name, - init_states=init_states, - episode_index=episode_indices[i], - **gym_kwargs, - ) - for i in range(n_envs) - ] - ) - else: - envs = defaultdict(dict) - benchmark_dict = benchmark.get_benchmark_dict() - task = task.split(",") - for _task in task: - task_suite = benchmark_dict[ - _task - ]() # can also choose libero_spatial, libero_object, libero_10 etc. - tasks_ids = list(range(len(task_suite.tasks))) - # tasks_ids = [0] # FIXME(mshukor): debug - for tasks_id in tasks_ids: - episode_indices = list(range(n_envs)) - print( - f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}" - ) - envs_list = [ - lambda i=i: LiberoEnv( - task_suite=task_suite, - task_id=tasks_id, - task_suite_name=_task, - camera_name=camera_name, - init_states=init_states, - episode_index=episode_indices[i], - **gym_kwargs, - ) - for i in range(n_envs) - ] - envs[_task][tasks_id] = env_cls(envs_list) - return envs - - -def quat2axisangle(quat): - """ - Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 - - Converts quaternion to axis-angle format. - Returns a unit vector direction scaled by its angle in radians. - - Args: - quat (np.array): (x,y,z,w) vec4 float angles - - Returns: - np.array: (ax,ay,az) axis-angle exponential coordinates - """ - # clip quaternion - if quat[3] > 1.0: - quat[3] = 1.0 - elif quat[3] < -1.0: - quat[3] = -1.0 - - den = np.sqrt(1.0 - quat[3] * quat[3]) - if math.isclose(den, 0.0): - # This is (close to) a zero degree rotation, immediately return - return np.zeros(3) - - return (quat[:3] * 2.0 * math.acos(quat[3])) / den - - -def get_task_init_states(task_suite, i): - init_states_path = os.path.join( - get_libero_path("init_states"), - task_suite.tasks[i].problem_folder, - task_suite.tasks[i].init_states_file, - ) - init_states = torch.load(init_states_path, weights_only=False) - return init_states - - -def get_libero_dummy_action(): - """Get dummy/no-op action, used to roll out the simulation while the robot does nothing.""" - return [0, 0, 0, 0, 0, 0, -1] - - -class LiberoEnv(gym.Env): - metadata = {"render_modes": ["rgb_array"], "render_fps": 80} - - def __init__( - self, - task_suite, - task_id, - task_suite_name, - camera_name="agentview_image,robot0_eye_in_hand_image", - obs_type="pixels", - render_mode="rgb_array", - observation_width=256, - observation_height=256, - visualization_width=640, - visualization_height=480, - init_states=True, - episode_index=0, - ): - super().__init__() - self.task_id = task_id - self.obs_type = obs_type - self.render_mode = render_mode - self.observation_width = observation_width - self.observation_height = observation_height - self.visualization_width = visualization_width - self.visualization_height = visualization_height - self.init_states = init_states - self.camera_name = camera_name.split( - "," - ) # agentview_image (main) or robot0_eye_in_hand_image (wrist) - self.camera_name_mapping = { - "agentview_image": OBS_IMAGE, - "robot0_eye_in_hand_image": OBS_IMAGE_2, - } - self.num_steps_wait = ( - 10 # Do nothing for the first few timesteps to wait for the simulator drops objects - ) - self.episode_index = episode_index - - self._env = self._make_envs_task(task_suite, self.task_id) - if task_suite_name == "libero_spatial": - max_steps = 220 # longest training demo has 193 steps - elif task_suite_name == "libero_object": - max_steps = 280 # longest training demo has 254 steps - elif task_suite_name == "libero_goal": - max_steps = 300 # longest training demo has 270 steps - elif task_suite_name == "libero_10": - max_steps = 520 # longest training demo has 505 steps - elif task_suite_name == "libero_90": - max_steps = 400 # longest training demo has 373 steps - self._max_episode_steps = max_steps - - images = {} - for cam in self.camera_name: - images[self.camera_name_mapping[cam]] = spaces.Box( - low=0, - high=255, - shape=(self.observation_height, self.observation_width, 3), - dtype=np.uint8, - ) - - if self.obs_type == "state": - raise NotImplementedError() - elif self.obs_type == "pixels": - self.observation_space = spaces.Dict( - { - "pixels": spaces.Dict(images), - } - ) - elif self.obs_type == "pixels_agent_pos": - self.observation_space = spaces.Dict( - { - "pixels": spaces.Dict(images), - "agent_pos": spaces.Box( - low=-1000.0, - high=1000.0, - shape=(8,), - dtype=np.float64, - ), - } - ) - - self.action_space = spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32) - - def render(self): - raw_obs = self._env.env._get_observations() - image = self._format_raw_obs(raw_obs)["pixels"][OBS_IMAGE] - return image - - def _make_envs_task(self, task_suite, task_id: int = 0): - task = task_suite.get_task(task_id) - self.task = task.name - self.task_description = task.language - task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file) - - env_args = { - "bddl_file_name": task_bddl_file, - "camera_heights": self.observation_height, - "camera_widths": self.observation_width, - } - env = OffScreenRenderEnv(**env_args) - env.reset() - if self.init_states: - init_states = get_task_init_states( - task_suite, task_id - ) # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()? - init_state_id = self.episode_index # episode index - env.set_init_state(init_states[init_state_id]) - - return env - - def _format_raw_obs(self, raw_obs): - images = {} - for camera_name in self.camera_name: - image = raw_obs[camera_name] - image = image[::-1, ::-1] # rotate 180 degrees - images[self.camera_name_mapping[camera_name]] = image - # images = image if len(images) == 1 else images - state = np.concatenate( - ( - raw_obs["robot0_eef_pos"], - quat2axisangle(raw_obs["robot0_eef_quat"]), - raw_obs["robot0_gripper_qpos"], - ) - ) - agent_pos = state - if self.obs_type == "state": - raise NotImplementedError() - elif self.obs_type == "pixels": - obs = {"pixels": images.copy()} - elif self.obs_type == "pixels_agent_pos": - obs = { - "pixels": images.copy(), - "agent_pos": agent_pos, - } - return obs - - def reset(self, seed=None, **kwargs): - super().reset(seed=seed) - - self._env.seed(seed) - raw_obs = self._env.reset() - # Do nothing for the first few timesteps to wait for the simulator drops objects - for _ in range(self.num_steps_wait): - raw_obs, _, _, _ = self._env.step(get_libero_dummy_action()) - observation = self._format_raw_obs(raw_obs) - info = {"is_success": False} - return observation, info - - def step(self, action): - assert action.ndim == 1 - raw_obs, reward, done, info = self._env.step(action) - - is_success = self._env.check_success() - terminated = done or is_success - info["is_success"] = done # is_success - - observation = self._format_raw_obs(raw_obs) - if done: - self.reset() - print(self.task, self.task_id, done, is_success) - truncated = False - return observation, reward, terminated, truncated, info - - def close(self): - self._env.close() diff --git a/tmux_log.txt b/tmux_log.txt deleted file mode 100644 index 4936578bc..000000000 --- a/tmux_log.txt +++ /dev/null @@ -1,2008 +0,0 @@ - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-08 13:23:15 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-08 13:23:15 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-08 13:23:15 ccelerate.py:99 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'HuggingFaceVLA/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.75, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': -1, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of - 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher -. -WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of - 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher -. -INFO 2025-09-08 13:23:15 celerate.py:149 Creating dataset -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 35414.48it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 5660.00it/s] -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 43760.67it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 5629.72it/s] -c -INFO 2025-09-08 13:23:22 celerate.py:160 Creating policy -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -c -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: - UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u -ser. - warnings.warn( # warn only once -[rank1]:[W908 13:23:22.785516795 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b -y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You - can pecify device_id in init_process_group() to force use of a particular device. -Reducing the number of VLM layers to 16 ... -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: - UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u -ser. - warnings.warn( # warn only once -[rank0]:[W908 13:23:43.028071493 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b -y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You - can pecify device_id in init_process_group() to force use of a particular device. -INFO 2025-09-08 13:23:43 celerate.py:171 Creating optimizer and scheduler -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -Reducing the number of VLM layers to 16 ... -INFO 2025-09-08 13:24:04 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla -_lr1e-4bs32steps100000 -INFO 2025-09-08 13:24:04 celerate.py:213 cfg.env.task='libero_spatial' -INFO 2025-09-08 13:24:04 celerate.py:214 cfg.steps=100000 (100K) -INFO 2025-09-08 13:24:04 celerate.py:215 dataset.num_frames=273465 (273K) -INFO 2025-09-08 13:24:04 celerate.py:216 dataset.num_episodes=1693 -INFO 2025-09-08 13:24:04 celerate.py:217 num_learnable_params=99880992 (100M) -INFO 2025-09-08 13:24:04 celerate.py:218 num_total_params=450046220 (450M) -INFO 2025-09-08 13:24:04 celerate.py:219 Number of processes: 2 -INFO 2025-09-08 13:24:04 celerate.py:220 Device: cuda:0 -INFO 2025-09-08 13:24:04 celerate.py:221 Mixed precision: bf16 -INFO 2025-09-08 13:24:04 celerate.py:243 Start offline training on a fixed dataset - -bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp -', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima -ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) -> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() --> train_tracker, output_dict = update_policy( -(Pdb) -bach: dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp -', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima -ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task']) -> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train() --> train_tracker, output_dict = update_policy( -(Pdb) batch.keys()[rank0]:[W908 13:24:43.868440913 reducer.cpp:1430] Warning: find_unused_parameters=True was specif -ied in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra tr -aversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never h -as any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false -positive if your model has flow control causing later iterations to have unused parameters. (function operator()) -policy.config.input_features -*** SyntaxError: invalid syntax -(Pdb) policy.config.input_features -*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' -(Pdb) policy -DistributedDataParallel( - (module): SmolVLAPolicy( - (normalize_inputs): Normalize( - (buffer_observation_state): ParameterDict( - (mean): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] - (std): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)] - ) - ) - (normalize_targets): Normalize( - (buffer_action): ParameterDict( - (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] - (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] - ) - ) - (unnormalize_outputs): Unnormalize( - (buffer_action): ParameterDict( - (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] - (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)] - ) - ) - (model): VLAFlowMatching( - (vlm_with_expert): SmolVLMWithExpertModel( - (vlm): SmolVLMForConditionalGeneration( - (model): SmolVLMModel( - (vision_model): SmolVLMVisionTransformer( - (embeddings): SmolVLMVisionEmbeddings( - (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid) - (position_embedding): Embedding(1024, 768) - ) - (encoder): SmolVLMEncoder( - (layers): ModuleList( - (0-11): 12 x SmolVLMEncoderLayer( - (self_attn): SmolVLMVisionAttention( - (k_proj): Linear(in_features=768, out_features=768, bias=True) - (v_proj): Linear(in_features=768, out_features=768, bias=True) - (q_proj): Linear(in_features=768, out_features=768, bias=True) - (out_proj): Linear(in_features=768, out_features=768, bias=True) - ) - (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) - (mlp): SmolVLMVisionMLP( - (activation_fn): PytorchGELUTanh() - (fc1): Linear(in_features=768, out_features=3072, bias=True) - (fc2): Linear(in_features=3072, out_features=768, bias=True) - ) - (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) - ) - ) - ) - (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True) - ) - (connector): SmolVLMConnector( - (modality_projection): SmolVLMSimpleMLP( - (proj): Linear(in_features=12288, out_features=960, bias=False) - ) - ) - (text_model): LlamaModel( - (embed_tokens): Embedding(49280, 960, padding_idx=2) - (layers): ModuleList( - (0-15): 16 x LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=960, out_features=960, bias=False) - (k_proj): Linear(in_features=960, out_features=320, bias=False) - (v_proj): Linear(in_features=960, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=960, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=960, out_features=2560, bias=False) - (up_proj): Linear(in_features=960, out_features=2560, bias=False) - (down_proj): Linear(in_features=2560, out_features=960, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((960,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05) - ) - ) - (norm): LlamaRMSNorm((960,), eps=1e-05) - (rotary_emb): LlamaRotaryEmbedding() - ) - ) - (lm_head): Linear(in_features=960, out_features=49280, bias=False) - ) - (lm_expert): LlamaModel( - (embed_tokens): None - (layers): ModuleList( - (0): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (1): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (2): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (3): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (4): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (5): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (6): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (7): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (8): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (9): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (10): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (11): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (12): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (13): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (14): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=720, out_features=320, bias=False) - (v_proj): Linear(in_features=720, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - (15): LlamaDecoderLayer( - (self_attn): LlamaAttention( - (q_proj): Linear(in_features=720, out_features=960, bias=False) - (k_proj): Linear(in_features=320, out_features=320, bias=False) - (v_proj): Linear(in_features=320, out_features=320, bias=False) - (o_proj): Linear(in_features=960, out_features=720, bias=False) - ) - (mlp): LlamaMLP( - (gate_proj): Linear(in_features=720, out_features=2048, bias=False) - (up_proj): Linear(in_features=720, out_features=2048, bias=False) - (down_proj): Linear(in_features=2048, out_features=720, bias=False) - (act_fn): SiLU() - ) - (input_layernorm): LlamaRMSNorm((720,), eps=1e-05) - (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05) - ) - ) - (norm): LlamaRMSNorm((720,), eps=1e-05) - (rotary_emb): LlamaRotaryEmbedding() - ) - ) - (state_proj): Linear(in_features=32, out_features=960, bias=True) - (action_in_proj): Linear(in_features=32, out_features=720, bias=True) - (action_out_proj): Linear(in_features=720, out_features=32, bias=True) - (action_time_mlp_in): Linear(in_features=1440, out_features=720, bias=True) - (action_time_mlp_out): Linear(in_features=720, out_features=720, bias=True) - ) - ) -) -(Pdb) policy.config -*** AttributeError: 'DistributedDataParallel' object has no attribute 'config' -(Pdb) policy.input_features -*** AttributeError: 'DistributedDataParallel' object has no attribute 'input_features' -(Pdb) quit() -[rank1]: Traceback (most recent call last): -[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 368, in -[rank1]: train() -[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner -[rank1]: response = fn(cfg, *args, **kwargs) -[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train -[rank1]: train_tracker, output_dict = update_policy( -[rank1]: File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train -[rank1]: train_tracker, output_dict = update_policy( -[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch -[rank1]: return self.dispatch_line(frame) -[rank1]: File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line -[rank1]: if self.quitting: raise BdbQuit -[rank1]: bdb.BdbQuit -W0908 13:25:34.274000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 776 -663 closing signal SIGTERM -E0908 13:25:34.589000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1 -) local_rank: 1 (pid: 776664) of binary: /home/jade_choghari/miniconda3/envs/lerobot/bin/python -Traceback (most recent call last): - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main - return _run_code(code, main_globals, None, - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code - exec(code, run_globals) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin -e 1245, in - main() - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin -e 1241, in main - launch_command(args) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin -e 1226, in launch_command - multi_gpu_launcher(args) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin -e 853, in multi_gpu_launcher - distrib_run.run(args) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/run.py", line 883 -, in run - elastic_launch( - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", - line 139, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py", - line 270, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -src/lerobot/scripts/train_accelerate.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2025-09-08_13:25:34 - host : hf-dgx-01 - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 776664) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/7_train_acc.sh -Training dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000 -The following values were not passed to `accelerate launch` and had defaults used instead: - More than one GPU was found, enabling multi-GPU training. - If this was unintended please pass in `--num_processes=1`. - `--dynamo_backend` was set to a value of `'no'` -To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`. -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/utils/launch.py:238: UserWarning -: Port `29522` is already in use. Accelerate will attempt to launch in a standalone-like mode by finding an open por -t automatically for this session. If this current attempt fails, or for more control in future runs, please specify -a different port (e.g., `--main_process_port `) or use `--main_process_port 0` for automatic selec -tion in your launch command or Accelerate config file. - warnings.warn( -INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'HuggingFaceVLA/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.75, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': -1, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda. -WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'. -INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32, - 'dataset': {'episodes': None, - 'image_transforms': {'enable': False, - 'max_num_transforms': 3, - 'random_order': False, - 'tfs': {'brightness': {'kwargs': {'brightness': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'contrast': {'kwargs': {'contrast': [0.8, - 1.2]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'hue': {'kwargs': {'hue': [-0.05, - 0.05]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'saturation': {'kwargs': {'saturation': [0.5, - 1.5]}, - 'type': 'ColorJitter', - 'weight': 1.0}, - 'sharpness': {'kwargs': {'sharpness': [0.5, - 1.5]}, - 'type': 'SharpnessJitter', - 'weight': 1.0}}}, - 'repo_id': 'HuggingFaceVLA/libero', - 'revision': None, - 'root': '/raid/jade/.cache/huggingface/datasets', - 'use_imagenet_stats': True, - 'video_backend': 'torchcodec'}, - 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image', - 'episode_length': 520, - 'features': {'action': {'shape': [7], - 'type': }, - 'agent_pos': {'shape': [8], - 'type': }, - 'pixels/agentview_image': {'shape': [360, 360, 3], - 'type': }, - 'pixels/robot0_eye_in_hand_image': {'shape': [360, - 360, - 3], - 'type': }}, - 'features_map': {'action': 'action', - 'agent_pos': 'observation.state', - 'pixels/agentview_image': 'observation.images.image', - 'pixels/robot0_eye_in_hand_image': 'observation.images.image2'}, - 'fps': 30, - 'init_states': True, - 'max_parallel_tasks': 5, - 'multitask_eval': True, - 'obs_type': 'pixels_agent_pos', - 'render_mode': 'rgb_array', - 'task': 'libero_spatial', - 'type': 'libero'}, - 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False}, - 'eval_freq': 0, - 'job_name': 'libero_smolvla', - 'log_freq': 200, - 'num_workers': 4, - 'optimizer': {'betas': [0.9, 0.95], - 'eps': 1e-08, - 'grad_clip_norm': 10, - 'lr': 0.0001, - 'type': 'adamw', - 'weight_decay': 1e-10}, - 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000', - 'policy': {'adapt_to_pi_aloha': False, - 'add_image_special_tokens': False, - 'attention_mode': 'cross_attn', - 'chunk_size': 50, - 'device': 'cuda', - 'empty_cameras': 0, - 'expert_width_multiplier': 0.75, - 'freeze_vision_encoder': True, - 'gradient_accumulation_steps': 1, - 'input_features': {}, - 'license': None, - 'load_vlm_weights': False, - 'max_action_dim': 32, - 'max_period': 4.0, - 'max_state_dim': 32, - 'min_period': 0.004, - 'n_action_steps': 1, - 'n_obs_steps': 1, - 'normalization_mapping': {'ACTION': , - 'STATE': , - 'VISUAL': }, - 'num_expert_layers': -1, - 'num_steps': 10, - 'num_vlm_layers': 16, - 'optimizer_betas': [0.9, 0.95], - 'optimizer_eps': 1e-08, - 'optimizer_grad_clip_norm': 10, - 'optimizer_lr': 0.0001, - 'optimizer_weight_decay': 1e-10, - 'output_features': {}, - 'pad_language_to': 'longest', - 'prefix_length': -1, - 'private': None, - 'push_to_hub': True, - 'repo_id': 'None', - 'resize_imgs_with_padding': [512, 512], - 'scheduler_decay_lr': 2.5e-06, - 'scheduler_decay_steps': 30000, - 'scheduler_warmup_steps': 1000, - 'self_attn_every_n_layers': 2, - 'tags': None, - 'tokenizer_max_length': 48, - 'train_expert_only': True, - 'train_state_proj': True, - 'type': 'smolvla', - 'use_amp': True, - 'use_cache': True, - 'use_delta_joint_actions_aloha': False, - 'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'}, - 'resume': False, - 'save_checkpoint': True, - 'save_freq': 20000, - 'scheduler': {'decay_lr': 2.5e-06, - 'num_decay_steps': 30000, - 'num_warmup_steps': 1000, - 'peak_lr': 0.0001, - 'type': 'cosine_decay_with_warmup'}, - 'seed': 1000, - 'steps': 100000, - 'use_policy_training_preset': True, - 'wandb': {'disable_artifact': False, - 'enable': False, - 'entity': None, - 'mode': None, - 'notes': None, - 'project': 'lerobot', - 'run_id': None}} -WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of - 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher -. -WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of - 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher -. -INFO 2025-09-08 13:33:47 celerate.py:149 Creating dataset -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 103295.66it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 5229.81it/s] -Resolving data files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1693/1693 [00:00<00:00, 360601.09it/s] -Loading dataset shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 69/69 [00:00<00:00, 4881.54it/s] -c -INFO 2025-09-08 13:33:53 celerate.py:160 Creating policy -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -c -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: - UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u -ser. - warnings.warn( # warn only once -[rank1]:[W908 13:33:54.613597516 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used b -y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You - can pecify device_id in init_process_group() to force use of a particular device. -Reducing the number of VLM layers to 16 ... -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: - UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u -ser. - warnings.warn( # warn only once -[rank0]:[W908 13:34:15.806448425 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used b -y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You - can pecify device_id in init_process_group() to force use of a particular device. -INFO 2025-09-08 13:34:15 celerate.py:171 Creating optimizer and scheduler -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin -g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. - warnings.warn( -Reducing the number of VLM layers to 16 ... -INFO 2025-09-08 13:34:36 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla -_lr1e-4bs32steps100000 -INFO 2025-09-08 13:34:36 celerate.py:213 cfg.env.task='libero_spatial' -INFO 2025-09-08 13:34:36 celerate.py:214 cfg.steps=100000 (100K) -INFO 2025-09-08 13:34:36 celerate.py:215 dataset.num_frames=273465 (273K) -INFO 2025-09-08 13:34:36 celerate.py:216 dataset.num_episodes=1693 -INFO 2025-09-08 13:34:36 celerate.py:217 num_learnable_params=99880992 (100M) -INFO 2025-09-08 13:34:36 celerate.py:218 num_total_params=450046220 (450M) -INFO 2025-09-08 13:34:36 celerate.py:219 Number of processes: 2 -INFO 2025-09-08 13:34:36 celerate.py:220 Device: cuda:0 -INFO 2025-09-08 13:34:36 celerate.py:221 Mixed precision: bf16 -INFO 2025-09-08 13:34:36 celerate.py:243 Start offline training on a fixed dataset -[rank1]:[W908 13:34:39.454560620 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru -ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut -ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para -meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m -odel has flow control causing later iterations to have unused parameters. (function operator()) -[rank0]:[W908 13:34:40.502702504 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru -ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut -ograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused para -meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m -odel has flow control causing later iterations to have unused parameters. (function operator()) -INFO 2025-09-08 13:36:23 celerate.py:281 step:200 smpl:13K ep:79 epch:0.05 loss:0.963 grdn:2.699 lr:2.0e-05 updt_s:0 -.506 data_s:0.027 -INFO 2025-09-08 13:38:09 celerate.py:281 step:400 smpl:26K ep:158 epch:0.09 loss:0.389 grdn:3.127 lr:6.0e-05 updt_s: -0.525 data_s:0.003 -INFO 2025-09-08 13:39:53 celerate.py:281 step:600 smpl:38K ep:238 epch:0.14 loss:0.261 grdn:2.618 lr:9.5e-05 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 13:41:37 celerate.py:281 step:800 smpl:51K ep:317 epch:0.19 loss:0.231 grdn:1.684 lr:9.9e-05 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 13:43:21 celerate.py:281 step:1K smpl:64K ep:396 epch:0.23 loss:0.211 grdn:1.258 lr:9.9e-05 updt_s:0 -.514 data_s:0.003 -INFO 2025-09-08 13:45:05 celerate.py:281 step:1K smpl:77K ep:475 epch:0.28 loss:0.198 grdn:1.032 lr:9.9e-05 updt_s:0 -.517 data_s:0.003 -INFO 2025-09-08 13:46:49 celerate.py:281 step:1K smpl:90K ep:555 epch:0.33 loss:0.182 grdn:0.880 lr:9.8e-05 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 13:48:33 celerate.py:281 step:2K smpl:102K ep:634 epch:0.37 loss:0.167 grdn:0.744 lr:9.8e-05 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 13:50:17 celerate.py:281 step:2K smpl:115K ep:713 epch:0.42 loss:0.157 grdn:0.680 lr:9.7e-05 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 13:52:01 celerate.py:281 step:2K smpl:128K ep:792 epch:0.47 loss:0.147 grdn:0.612 lr:9.6e-05 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 13:53:44 celerate.py:281 step:2K smpl:141K ep:872 epch:0.51 loss:0.142 grdn:0.576 lr:9.5e-05 updt_s: -0.510 data_s:0.003 -INFO 2025-09-08 13:55:27 celerate.py:281 step:2K smpl:154K ep:951 epch:0.56 loss:0.136 grdn:0.523 lr:9.4e-05 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 13:57:11 celerate.py:281 step:3K smpl:166K ep:1K epch:0.61 loss:0.132 grdn:0.509 lr:9.3e-05 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 13:58:57 celerate.py:281 step:3K smpl:179K ep:1K epch:0.66 loss:0.126 grdn:0.492 lr:9.2e-05 updt_s:0 -.525 data_s:0.003 -INFO 2025-09-08 14:00:43 celerate.py:281 step:3K smpl:192K ep:1K epch:0.70 loss:0.124 grdn:0.467 lr:9.1e-05 updt_s:0 -.525 data_s:0.003 -INFO 2025-09-08 14:02:26 celerate.py:281 step:3K smpl:205K ep:1K epch:0.75 loss:0.119 grdn:0.438 lr:9.0e-05 updt_s:0 -.508 data_s:0.003 -INFO 2025-09-08 14:04:27 celerate.py:281 step:3K smpl:218K ep:1K epch:0.80 loss:0.118 grdn:0.426 lr:8.9e-05 updt_s:0 -.564 data_s:0.039 -INFO 2025-09-08 14:06:10 celerate.py:281 step:4K smpl:230K ep:1K epch:0.84 loss:0.116 grdn:0.422 lr:8.7e-05 updt_s:0 -.511 data_s:0.004 -INFO 2025-09-08 14:07:55 celerate.py:281 step:4K smpl:243K ep:2K epch:0.89 loss:0.113 grdn:0.395 lr:8.6e-05 updt_s:0 -.517 data_s:0.003 -INFO 2025-09-08 14:09:38 celerate.py:281 step:4K smpl:256K ep:2K epch:0.94 loss:0.111 grdn:0.401 lr:8.5e-05 updt_s:0 -.511 data_s:0.003 -INFO 2025-09-08 14:11:21 celerate.py:281 step:4K smpl:269K ep:2K epch:0.98 loss:0.110 grdn:0.380 lr:8.3e-05 updt_s:0 -.511 data_s:0.003 -INFO 2025-09-08 14:13:08 celerate.py:281 step:4K smpl:282K ep:2K epch:1.03 loss:0.109 grdn:0.381 lr:8.2e-05 updt_s:0 -.413 data_s:0.119 -INFO 2025-09-08 14:14:52 celerate.py:281 step:5K smpl:294K ep:2K epch:1.08 loss:0.107 grdn:0.387 lr:8.0e-05 updt_s:0 -.373 data_s:0.146 -INFO 2025-09-08 14:16:36 celerate.py:281 step:5K smpl:307K ep:2K epch:1.12 loss:0.107 grdn:0.366 lr:7.8e-05 updt_s:0 -.446 data_s:0.072 -INFO 2025-09-08 14:18:19 celerate.py:281 step:5K smpl:320K ep:2K epch:1.17 loss:0.105 grdn:0.347 lr:7.6e-05 updt_s:0 -.468 data_s:0.045 -INFO 2025-09-08 14:20:01 celerate.py:281 step:5K smpl:333K ep:2K epch:1.22 loss:0.103 grdn:0.350 lr:7.5e-05 updt_s:0 -.510 data_s:0.003 -INFO 2025-09-08 14:21:46 celerate.py:281 step:5K smpl:346K ep:2K epch:1.26 loss:0.101 grdn:0.336 lr:7.3e-05 updt_s:0 -.512 data_s:0.011 -INFO 2025-09-08 14:23:30 celerate.py:281 step:6K smpl:358K ep:2K epch:1.31 loss:0.102 grdn:0.345 lr:7.1e-05 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 14:25:15 celerate.py:281 step:6K smpl:371K ep:2K epch:1.36 loss:0.100 grdn:0.333 lr:6.9e-05 updt_s:0 -.521 data_s:0.003 -INFO 2025-09-08 14:26:59 celerate.py:281 step:6K smpl:384K ep:2K epch:1.40 loss:0.100 grdn:0.328 lr:6.7e-05 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 14:28:43 celerate.py:281 step:6K smpl:397K ep:2K epch:1.45 loss:0.099 grdn:0.319 lr:6.5e-05 updt_s:0 -.512 data_s:0.003 -INFO 2025-09-08 14:30:26 celerate.py:281 step:6K smpl:410K ep:3K epch:1.50 loss:0.098 grdn:0.313 lr:6.3e-05 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 14:32:11 celerate.py:281 step:7K smpl:422K ep:3K epch:1.54 loss:0.097 grdn:0.319 lr:6.1e-05 updt_s:0 -.519 data_s:0.004 -INFO 2025-09-08 14:33:55 celerate.py:281 step:7K smpl:435K ep:3K epch:1.59 loss:0.097 grdn:0.312 lr:5.9e-05 updt_s:0 -.506 data_s:0.010 -INFO 2025-09-08 14:35:39 celerate.py:281 step:7K smpl:448K ep:3K epch:1.64 loss:0.097 grdn:0.307 lr:5.7e-05 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 14:37:23 celerate.py:281 step:7K smpl:461K ep:3K epch:1.69 loss:0.095 grdn:0.294 lr:5.5e-05 updt_s:0 -.518 data_s:0.003 -INFO 2025-09-08 14:39:07 celerate.py:281 step:7K smpl:474K ep:3K epch:1.73 loss:0.095 grdn:0.299 lr:5.3e-05 updt_s:0 -.507 data_s:0.007 -INFO 2025-09-08 14:40:52 celerate.py:281 step:8K smpl:486K ep:3K epch:1.78 loss:0.094 grdn:0.283 lr:5.1e-05 updt_s:0 -.523 data_s:0.003 -INFO 2025-09-08 14:42:36 celerate.py:281 step:8K smpl:499K ep:3K epch:1.83 loss:0.093 grdn:0.284 lr:4.9e-05 updt_s:0 -.517 data_s:0.003 -INFO 2025-09-08 14:44:22 celerate.py:281 step:8K smpl:512K ep:3K epch:1.87 loss:0.092 grdn:0.284 lr:4.7e-05 updt_s:0 -.465 data_s:0.060 -INFO 2025-09-08 14:46:06 celerate.py:281 step:8K smpl:525K ep:3K epch:1.92 loss:0.093 grdn:0.292 lr:4.5e-05 updt_s:0 -.456 data_s:0.066 -INFO 2025-09-08 14:47:49 celerate.py:281 step:8K smpl:538K ep:3K epch:1.97 loss:0.093 grdn:0.290 lr:4.3e-05 updt_s:0 -.510 data_s:0.003 -INFO 2025-09-08 14:49:37 celerate.py:281 step:9K smpl:550K ep:3K epch:2.01 loss:0.092 grdn:0.283 lr:4.1e-05 updt_s:0 -.419 data_s:0.117 -INFO 2025-09-08 14:51:20 celerate.py:281 step:9K smpl:563K ep:3K epch:2.06 loss:0.092 grdn:0.275 lr:3.9e-05 updt_s:0 -.463 data_s:0.053 -INFO 2025-09-08 14:53:05 celerate.py:281 step:9K smpl:576K ep:4K epch:2.11 loss:0.090 grdn:0.272 lr:3.7e-05 updt_s:0 -.517 data_s:0.003 -INFO 2025-09-08 14:54:49 celerate.py:281 step:9K smpl:589K ep:4K epch:2.15 loss:0.090 grdn:0.268 lr:3.5e-05 updt_s:0 -.506 data_s:0.013 -INFO 2025-09-08 14:56:32 celerate.py:281 step:9K smpl:602K ep:4K epch:2.20 loss:0.090 grdn:0.271 lr:3.3e-05 updt_s:0 -.513 data_s:0.003 -INFO 2025-09-08 14:58:17 celerate.py:281 step:10K smpl:614K ep:4K epch:2.25 loss:0.090 grdn:0.268 lr:3.1e-05 updt_s: -0.520 data_s:0.003 -INFO 2025-09-08 15:00:02 celerate.py:281 step:10K smpl:627K ep:4K epch:2.29 loss:0.089 grdn:0.261 lr:3.0e-05 updt_s: -0.519 data_s:0.003 -INFO 2025-09-08 15:01:48 celerate.py:281 step:10K smpl:640K ep:4K epch:2.34 loss:0.090 grdn:0.271 lr:2.8e-05 updt_s: -0.526 data_s:0.003 -INFO 2025-09-08 15:03:33 celerate.py:281 step:10K smpl:653K ep:4K epch:2.39 loss:0.089 grdn:0.262 lr:2.6e-05 updt_s: -0.521 data_s:0.003 -INFO 2025-09-08 15:05:18 celerate.py:281 step:10K smpl:666K ep:4K epch:2.43 loss:0.090 grdn:0.264 lr:2.4e-05 updt_s: -0.519 data_s:0.003 -INFO 2025-09-08 15:07:32 celerate.py:281 step:11K smpl:678K ep:4K epch:2.48 loss:0.089 grdn:0.255 lr:2.3e-05 updt_s: -0.663 data_s:0.004 -INFO 2025-09-08 15:09:21 celerate.py:281 step:11K smpl:691K ep:4K epch:2.53 loss:0.090 grdn:0.263 lr:2.1e-05 updt_s: -0.514 data_s:0.030 -INFO 2025-09-08 15:11:06 celerate.py:281 step:11K smpl:704K ep:4K epch:2.57 loss:0.088 grdn:0.254 lr:1.9e-05 updt_s: -0.517 data_s:0.006 -INFO 2025-09-08 15:12:51 celerate.py:281 step:11K smpl:717K ep:4K epch:2.62 loss:0.088 grdn:0.252 lr:1.8e-05 updt_s: -0.517 data_s:0.005 -INFO 2025-09-08 15:14:38 celerate.py:281 step:11K smpl:730K ep:5K epch:2.67 loss:0.088 grdn:0.251 lr:1.6e-05 updt_s: -0.532 data_s:0.003 -INFO 2025-09-08 15:16:23 celerate.py:281 step:12K smpl:742K ep:5K epch:2.71 loss:0.088 grdn:0.253 lr:1.5e-05 updt_s: -0.520 data_s:0.003 -INFO 2025-09-08 15:18:08 celerate.py:281 step:12K smpl:755K ep:5K epch:2.76 loss:0.087 grdn:0.244 lr:1.4e-05 updt_s: -0.521 data_s:0.003 -INFO 2025-09-08 15:19:54 celerate.py:281 step:12K smpl:768K ep:5K epch:2.81 loss:0.088 grdn:0.247 lr:1.2e-05 updt_s: -0.524 data_s:0.003 -INFO 2025-09-08 15:21:39 celerate.py:281 step:12K smpl:781K ep:5K epch:2.86 loss:0.087 grdn:0.242 lr:1.1e-05 updt_s: -0.520 data_s:0.003 -INFO 2025-09-08 15:23:32 celerate.py:281 step:12K smpl:794K ep:5K epch:2.90 loss:0.088 grdn:0.243 lr:1.0e-05 updt_s: -0.560 data_s:0.003 -INFO 2025-09-08 15:25:48 celerate.py:281 step:13K smpl:806K ep:5K epch:2.95 loss:0.087 grdn:0.240 lr:9.0e-06 updt_s: -0.674 data_s:0.005 -INFO 2025-09-08 15:28:02 celerate.py:281 step:13K smpl:819K ep:5K epch:3.00 loss:0.088 grdn:0.245 lr:8.0e-06 updt_s: -0.662 data_s:0.004 -INFO 2025-09-08 15:31:06 celerate.py:281 step:13K smpl:832K ep:5K epch:3.04 loss:0.086 grdn:0.236 lr:7.1e-06 updt_s: -0.688 data_s:0.231 -INFO 2025-09-08 15:32:52 celerate.py:281 step:13K smpl:845K ep:5K epch:3.09 loss:0.087 grdn:0.231 lr:6.3e-06 updt_s: -0.521 data_s:0.003 -INFO 2025-09-08 15:35:46 celerate.py:281 step:13K smpl:858K ep:5K epch:3.14 loss:0.088 grdn:0.235 lr:5.6e-06 updt_s: -0.637 data_s:0.232 -INFO 2025-09-08 15:37:34 celerate.py:281 step:14K smpl:870K ep:5K epch:3.18 loss:0.087 grdn:0.238 lr:4.9e-06 updt_s: -0.514 data_s:0.025 -INFO 2025-09-08 15:39:18 celerate.py:281 step:14K smpl:883K ep:5K epch:3.23 loss:0.087 grdn:0.226 lr:4.3e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 15:41:02 celerate.py:281 step:14K smpl:896K ep:6K epch:3.28 loss:0.087 grdn:0.230 lr:3.8e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 15:42:45 celerate.py:281 step:14K smpl:909K ep:6K epch:3.32 loss:0.086 grdn:0.229 lr:3.4e-06 updt_s: -0.507 data_s:0.008 -INFO 2025-09-08 15:44:29 celerate.py:281 step:14K smpl:922K ep:6K epch:3.37 loss:0.087 grdn:0.229 lr:3.0e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 15:46:12 celerate.py:281 step:15K smpl:934K ep:6K epch:3.42 loss:0.087 grdn:0.228 lr:2.8e-06 updt_s: -0.502 data_s:0.011 -INFO 2025-09-08 15:47:56 celerate.py:281 step:15K smpl:947K ep:6K epch:3.46 loss:0.086 grdn:0.232 lr:2.6e-06 updt_s: -0.515 data_s:0.004 -INFO 2025-09-08 15:49:39 celerate.py:281 step:15K smpl:960K ep:6K epch:3.51 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: -0.365 data_s:0.147 -INFO 2025-09-08 15:51:22 celerate.py:281 step:15K smpl:973K ep:6K epch:3.56 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: -0.333 data_s:0.179 -INFO 2025-09-08 15:53:07 celerate.py:281 step:15K smpl:986K ep:6K epch:3.60 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s: -0.357 data_s:0.164 -INFO 2025-09-08 15:54:50 celerate.py:281 step:16K smpl:998K ep:6K epch:3.65 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s: -0.365 data_s:0.151 -INFO 2025-09-08 15:56:35 celerate.py:281 step:16K smpl:1M ep:6K epch:3.70 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. -450 data_s:0.071 -INFO 2025-09-08 15:58:19 celerate.py:281 step:16K smpl:1M ep:6K epch:3.74 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0. -495 data_s:0.023 -INFO 2025-09-08 16:00:04 celerate.py:281 step:16K smpl:1M ep:6K epch:3.79 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0. -393 data_s:0.131 -INFO 2025-09-08 16:01:48 celerate.py:281 step:16K smpl:1M ep:6K epch:3.84 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. -515 data_s:0.003 -INFO 2025-09-08 16:03:32 celerate.py:281 step:17K smpl:1M ep:7K epch:3.88 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. -518 data_s:0.003 -INFO 2025-09-08 16:05:17 celerate.py:281 step:17K smpl:1M ep:7K epch:3.93 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. -397 data_s:0.125 -INFO 2025-09-08 16:12:41 celerate.py:281 step:17K smpl:1M ep:7K epch:3.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:1. -496 data_s:0.719 -INFO 2025-09-08 16:14:28 celerate.py:281 step:17K smpl:1M ep:7K epch:4.03 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. -413 data_s:0.124 -INFO 2025-09-08 16:16:13 celerate.py:281 step:17K smpl:1M ep:7K epch:4.07 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. -469 data_s:0.050 -INFO 2025-09-08 16:17:57 celerate.py:281 step:18K smpl:1M ep:7K epch:4.12 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0. -375 data_s:0.145 -INFO 2025-09-08 16:19:42 celerate.py:281 step:18K smpl:1M ep:7K epch:4.17 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. -333 data_s:0.189 -INFO 2025-09-08 16:21:26 celerate.py:281 step:18K smpl:1M ep:7K epch:4.21 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. -334 data_s:0.184 -INFO 2025-09-08 16:23:09 celerate.py:281 step:18K smpl:1M ep:7K epch:4.26 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0. -331 data_s:0.185 -INFO 2025-09-08 16:24:53 celerate.py:281 step:18K smpl:1M ep:7K epch:4.31 loss:0.088 grdn:0.236 lr:2.5e-06 updt_s:0. -333 data_s:0.182 -INFO 2025-09-08 16:26:38 celerate.py:281 step:19K smpl:1M ep:7K epch:4.35 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0. -337 data_s:0.188 -INFO 2025-09-08 16:28:22 celerate.py:281 step:19K smpl:1M ep:7K epch:4.40 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0. -420 data_s:0.099 -INFO 2025-09-08 16:30:06 celerate.py:281 step:19K smpl:1M ep:8K epch:4.45 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. -444 data_s:0.075 -INFO 2025-09-08 16:31:49 celerate.py:281 step:19K smpl:1M ep:8K epch:4.49 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. -475 data_s:0.036 -INFO 2025-09-08 16:33:33 celerate.py:281 step:19K smpl:1M ep:8K epch:4.54 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. -379 data_s:0.139 -INFO 2025-09-08 16:35:17 celerate.py:281 step:20K smpl:1M ep:8K epch:4.59 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0. -348 data_s:0.171 -INFO 2025-09-08 16:37:01 celerate.py:281 step:20K smpl:1M ep:8K epch:4.63 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0. -332 data_s:0.185 -/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631: - UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u -ser. - warnings.warn( # warn only once -INFO 2025-09-08 16:38:46 celerate.py:281 step:20K smpl:1M ep:8K epch:4.68 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0. -486 data_s:0.037 -INFO 2025-09-08 16:38:46 celerate.py:295 Checkpoint policy after step 20000 -INFO 2025-09-08 16:40:30 celerate.py:281 step:20K smpl:1M ep:8K epch:4.73 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. -509 data_s:0.003 -INFO 2025-09-08 16:42:16 celerate.py:281 step:20K smpl:1M ep:8K epch:4.77 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0. -527 data_s:0.003 -INFO 2025-09-08 16:44:01 celerate.py:281 step:21K smpl:1M ep:8K epch:4.82 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0. -519 data_s:0.003 -INFO 2025-09-08 16:45:45 celerate.py:281 step:21K smpl:1M ep:8K epch:4.87 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0. -504 data_s:0.013 -INFO 2025-09-08 16:47:29 celerate.py:281 step:21K smpl:1M ep:8K epch:4.91 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. -509 data_s:0.011 -INFO 2025-09-08 16:49:19 celerate.py:281 step:21K smpl:1M ep:8K epch:4.96 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0. -544 data_s:0.003 -INFO 2025-09-08 16:51:04 celerate.py:281 step:21K smpl:1M ep:8K epch:5.01 loss:0.086 grdn:0.225 lr:2.5e-06 updt_s:0. -488 data_s:0.039 -INFO 2025-09-08 16:52:51 celerate.py:281 step:22K smpl:1M ep:9K epch:5.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. -430 data_s:0.099 -INFO 2025-09-08 16:54:36 celerate.py:281 step:22K smpl:1M ep:9K epch:5.10 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0. -521 data_s:0.003 -INFO 2025-09-08 16:56:23 celerate.py:281 step:22K smpl:1M ep:9K epch:5.15 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. -521 data_s:0.014 -INFO 2025-09-08 16:58:09 celerate.py:281 step:22K smpl:1M ep:9K epch:5.20 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0. -525 data_s:0.003 -INFO 2025-09-08 17:00:04 celerate.py:281 step:22K smpl:1M ep:9K epch:5.24 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. -568 data_s:0.003 -INFO 2025-09-08 17:02:00 celerate.py:281 step:23K smpl:1M ep:9K epch:5.29 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0. -575 data_s:0.003 -INFO 2025-09-08 17:03:49 celerate.py:281 step:23K smpl:1M ep:9K epch:5.34 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0. -513 data_s:0.030 -INFO 2025-09-08 17:05:39 celerate.py:281 step:23K smpl:1M ep:9K epch:5.38 loss:0.085 grdn:0.227 lr:2.5e-06 updt_s:0. -523 data_s:0.027 -INFO 2025-09-08 17:07:26 celerate.py:281 step:23K smpl:1M ep:9K epch:5.43 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0. -529 data_s:0.003 -INFO 2025-09-08 17:09:12 celerate.py:281 step:23K smpl:1M ep:9K epch:5.48 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0. -526 data_s:0.003 -INFO 2025-09-08 17:10:55 celerate.py:281 step:24K smpl:2M ep:9K epch:5.52 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:0. -443 data_s:0.072 -INFO 2025-09-08 17:12:40 celerate.py:281 step:24K smpl:2M ep:9K epch:5.57 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0. -518 data_s:0.004 -INFO 2025-09-08 17:14:25 celerate.py:281 step:24K smpl:2M ep:10K epch:5.62 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 -.521 data_s:0.003 -INFO 2025-09-08 17:16:11 celerate.py:281 step:24K smpl:2M ep:10K epch:5.66 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0 -.523 data_s:0.003 -INFO 2025-09-08 17:17:55 celerate.py:281 step:24K smpl:2M ep:10K epch:5.71 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 -.515 data_s:0.005 -INFO 2025-09-08 17:19:39 celerate.py:281 step:25K smpl:2M ep:10K epch:5.76 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 -.415 data_s:0.106 -INFO 2025-09-08 17:21:24 celerate.py:281 step:25K smpl:2M ep:10K epch:5.80 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.507 data_s:0.016 -INFO 2025-09-08 17:23:08 celerate.py:281 step:25K smpl:2M ep:10K epch:5.85 loss:0.085 grdn:0.229 lr:2.5e-06 updt_s:0 -.514 data_s:0.003 -INFO 2025-09-08 17:24:54 celerate.py:281 step:25K smpl:2M ep:10K epch:5.90 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0 -.518 data_s:0.008 -INFO 2025-09-08 17:26:41 celerate.py:281 step:25K smpl:2M ep:10K epch:5.94 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 -.529 data_s:0.003 -INFO 2025-09-08 17:28:24 celerate.py:281 step:26K smpl:2M ep:10K epch:5.99 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0 -.513 data_s:0.003 -INFO 2025-09-08 17:30:11 celerate.py:281 step:26K smpl:2M ep:10K epch:6.04 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 -.370 data_s:0.164 -INFO 2025-09-08 17:31:55 celerate.py:281 step:26K smpl:2M ep:10K epch:6.08 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 -.385 data_s:0.132 -INFO 2025-09-08 17:33:39 celerate.py:281 step:26K smpl:2M ep:10K epch:6.13 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 -.450 data_s:0.069 -INFO 2025-09-08 17:35:24 celerate.py:281 step:26K smpl:2M ep:10K epch:6.18 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 -.468 data_s:0.052 -INFO 2025-09-08 17:37:07 celerate.py:281 step:27K smpl:2M ep:11K epch:6.23 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 -.514 data_s:0.004 -INFO 2025-09-08 17:38:52 celerate.py:281 step:27K smpl:2M ep:11K epch:6.27 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 -.519 data_s:0.003 -INFO 2025-09-08 17:40:40 celerate.py:281 step:27K smpl:2M ep:11K epch:6.32 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.534 data_s:0.003 -INFO 2025-09-08 17:42:57 celerate.py:281 step:27K smpl:2M ep:11K epch:6.37 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 -.678 data_s:0.007 -INFO 2025-09-08 17:46:13 celerate.py:281 step:27K smpl:2M ep:11K epch:6.41 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.968 data_s:0.009 -INFO 2025-09-08 17:49:20 celerate.py:281 step:28K smpl:2M ep:11K epch:6.46 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 -.895 data_s:0.037 -INFO 2025-09-08 17:51:22 celerate.py:281 step:28K smpl:2M ep:11K epch:6.51 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 -.604 data_s:0.003 -INFO 2025-09-08 17:53:07 celerate.py:281 step:28K smpl:2M ep:11K epch:6.55 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 -.521 data_s:0.003 -INFO 2025-09-08 17:54:51 celerate.py:281 step:28K smpl:2M ep:11K epch:6.60 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 17:56:36 celerate.py:281 step:28K smpl:2M ep:11K epch:6.65 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 -.519 data_s:0.003 -INFO 2025-09-08 17:58:21 celerate.py:281 step:29K smpl:2M ep:11K epch:6.69 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:0 -.521 data_s:0.003 -INFO 2025-09-08 18:00:06 celerate.py:281 step:29K smpl:2M ep:11K epch:6.74 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 -.513 data_s:0.011 -INFO 2025-09-08 18:01:50 celerate.py:281 step:29K smpl:2M ep:11K epch:6.79 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 -.476 data_s:0.041 -INFO 2025-09-08 18:03:34 celerate.py:281 step:29K smpl:2M ep:12K epch:6.83 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 -.506 data_s:0.012 -INFO 2025-09-08 18:05:21 celerate.py:281 step:29K smpl:2M ep:12K epch:6.88 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 -.455 data_s:0.075 -INFO 2025-09-08 18:07:04 celerate.py:281 step:30K smpl:2M ep:12K epch:6.93 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.514 data_s:0.003 -INFO 2025-09-08 18:08:47 celerate.py:281 step:30K smpl:2M ep:12K epch:6.97 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 -.509 data_s:0.003 -INFO 2025-09-08 18:10:33 celerate.py:281 step:30K smpl:2M ep:12K epch:7.02 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 -.422 data_s:0.105 -INFO 2025-09-08 18:12:19 celerate.py:281 step:30K smpl:2M ep:12K epch:7.07 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 -.347 data_s:0.182 -INFO 2025-09-08 18:14:05 celerate.py:281 step:30K smpl:2M ep:12K epch:7.11 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 -.473 data_s:0.053 -INFO 2025-09-08 18:15:52 celerate.py:281 step:31K smpl:2M ep:12K epch:7.16 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 -.531 data_s:0.005 -INFO 2025-09-08 18:17:37 celerate.py:281 step:31K smpl:2M ep:12K epch:7.21 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 -.520 data_s:0.003 -INFO 2025-09-08 18:19:22 celerate.py:281 step:31K smpl:2M ep:12K epch:7.26 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 -.500 data_s:0.020 -INFO 2025-09-08 18:21:06 celerate.py:281 step:31K smpl:2M ep:12K epch:7.30 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 -.511 data_s:0.009 -INFO 2025-09-08 18:22:50 celerate.py:281 step:31K smpl:2M ep:12K epch:7.35 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0 -.518 data_s:0.003 -INFO 2025-09-08 18:24:33 celerate.py:281 step:32K smpl:2M ep:13K epch:7.40 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 -.507 data_s:0.007 -INFO 2025-09-08 18:26:16 celerate.py:281 step:32K smpl:2M ep:13K epch:7.44 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 -.463 data_s:0.047 -INFO 2025-09-08 18:27:59 celerate.py:281 step:32K smpl:2M ep:13K epch:7.49 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:0 -.509 data_s:0.007 -INFO 2025-09-08 18:29:43 celerate.py:281 step:32K smpl:2M ep:13K epch:7.54 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 -.514 data_s:0.003 -INFO 2025-09-08 18:31:26 celerate.py:281 step:32K smpl:2M ep:13K epch:7.58 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.511 data_s:0.003 -INFO 2025-09-08 18:33:11 celerate.py:281 step:33K smpl:2M ep:13K epch:7.63 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.518 data_s:0.003 -INFO 2025-09-08 18:34:57 celerate.py:281 step:33K smpl:2M ep:13K epch:7.68 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 -.512 data_s:0.016 -INFO 2025-09-08 18:36:41 celerate.py:281 step:33K smpl:2M ep:13K epch:7.72 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0 -.517 data_s:0.003 -INFO 2025-09-08 18:38:25 celerate.py:281 step:33K smpl:2M ep:13K epch:7.77 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 18:40:07 celerate.py:281 step:33K smpl:2M ep:13K epch:7.82 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 -.506 data_s:0.003 -INFO 2025-09-08 18:41:51 celerate.py:281 step:34K smpl:2M ep:13K epch:7.86 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 -.509 data_s:0.006 -INFO 2025-09-08 18:43:36 celerate.py:281 step:34K smpl:2M ep:13K epch:7.91 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 -.521 data_s:0.003 -INFO 2025-09-08 18:45:19 celerate.py:281 step:34K smpl:2M ep:13K epch:7.96 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.511 data_s:0.003 -INFO 2025-09-08 18:47:05 celerate.py:281 step:34K smpl:2M ep:14K epch:8.00 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 -.495 data_s:0.035 -INFO 2025-09-08 18:48:51 celerate.py:281 step:34K smpl:2M ep:14K epch:8.05 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0 -.413 data_s:0.112 -INFO 2025-09-08 18:50:34 celerate.py:281 step:35K smpl:2M ep:14K epch:8.10 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 18:52:19 celerate.py:281 step:35K smpl:2M ep:14K epch:8.14 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 -.520 data_s:0.003 -INFO 2025-09-08 18:54:03 celerate.py:281 step:35K smpl:2M ep:14K epch:8.19 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 18:55:48 celerate.py:281 step:35K smpl:2M ep:14K epch:8.24 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 -.420 data_s:0.101 -INFO 2025-09-08 18:57:33 celerate.py:281 step:35K smpl:2M ep:14K epch:8.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 -.506 data_s:0.022 -INFO 2025-09-08 18:59:19 celerate.py:281 step:36K smpl:2M ep:14K epch:8.33 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 -.525 data_s:0.003 -INFO 2025-09-08 19:01:03 celerate.py:281 step:36K smpl:2M ep:14K epch:8.38 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 19:02:48 celerate.py:281 step:36K smpl:2M ep:14K epch:8.43 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 19:04:32 celerate.py:281 step:36K smpl:2M ep:14K epch:8.47 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 -.505 data_s:0.013 -INFO 2025-09-08 19:06:15 celerate.py:281 step:36K smpl:2M ep:14K epch:8.52 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 -.384 data_s:0.130 -INFO 2025-09-08 19:07:58 celerate.py:281 step:37K smpl:2M ep:15K epch:8.57 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0 -.430 data_s:0.084 -INFO 2025-09-08 19:09:41 celerate.py:281 step:37K smpl:2M ep:15K epch:8.61 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 -.351 data_s:0.162 -INFO 2025-09-08 19:11:25 celerate.py:281 step:37K smpl:2M ep:15K epch:8.66 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 -.337 data_s:0.181 -INFO 2025-09-08 19:13:08 celerate.py:281 step:37K smpl:2M ep:15K epch:8.71 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 -.336 data_s:0.177 -INFO 2025-09-08 19:14:52 celerate.py:281 step:37K smpl:2M ep:15K epch:8.75 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 -.344 data_s:0.174 -INFO 2025-09-08 19:16:35 celerate.py:281 step:38K smpl:2M ep:15K epch:8.80 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0 -.332 data_s:0.182 -INFO 2025-09-08 19:18:18 celerate.py:281 step:38K smpl:2M ep:15K epch:8.85 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.332 data_s:0.182 -INFO 2025-09-08 19:20:01 celerate.py:281 step:38K smpl:2M ep:15K epch:8.89 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0 -.337 data_s:0.177 -INFO 2025-09-08 19:21:45 celerate.py:281 step:38K smpl:2M ep:15K epch:8.94 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.495 data_s:0.022 -INFO 2025-09-08 19:23:29 celerate.py:281 step:38K smpl:2M ep:15K epch:8.99 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 -.513 data_s:0.003 -INFO 2025-09-08 19:25:15 celerate.py:281 step:39K smpl:2M ep:15K epch:9.03 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.436 data_s:0.097 -INFO 2025-09-08 19:26:59 celerate.py:281 step:39K smpl:2M ep:15K epch:9.08 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 -.378 data_s:0.138 -INFO 2025-09-08 19:28:43 celerate.py:281 step:39K smpl:2M ep:15K epch:9.13 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:0 -.497 data_s:0.023 -INFO 2025-09-08 19:30:27 celerate.py:281 step:39K smpl:3M ep:16K epch:9.17 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 19:32:14 celerate.py:281 step:39K smpl:3M ep:16K epch:9.22 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.526 data_s:0.003 -INFO 2025-09-08 19:33:57 celerate.py:281 step:40K smpl:3M ep:16K epch:9.27 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0 -.506 data_s:0.011 -INFO 2025-09-08 19:35:42 celerate.py:281 step:40K smpl:3M ep:16K epch:9.31 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0 -.455 data_s:0.066 -INFO 2025-09-08 19:37:26 celerate.py:281 step:40K smpl:3M ep:16K epch:9.36 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0 -.493 data_s:0.026 -INFO 2025-09-08 19:37:26 celerate.py:295 Checkpoint policy after step 40000 -INFO 2025-09-08 19:39:10 celerate.py:281 step:40K smpl:3M ep:16K epch:9.41 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0 -.397 data_s:0.114 -INFO 2025-09-08 19:40:53 celerate.py:281 step:40K smpl:3M ep:16K epch:9.45 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0 -.344 data_s:0.168 -INFO 2025-09-08 19:42:37 celerate.py:281 step:41K smpl:3M ep:16K epch:9.50 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.480 data_s:0.036 -INFO 2025-09-08 19:44:21 celerate.py:281 step:41K smpl:3M ep:16K epch:9.55 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0 -.517 data_s:0.003 -INFO 2025-09-08 19:46:05 celerate.py:281 step:41K smpl:3M ep:16K epch:9.60 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.517 data_s:0.003 -INFO 2025-09-08 19:47:49 celerate.py:281 step:41K smpl:3M ep:16K epch:9.64 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0 -.513 data_s:0.003 -INFO 2025-09-08 19:49:33 celerate.py:281 step:41K smpl:3M ep:16K epch:9.69 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 19:51:17 celerate.py:281 step:42K smpl:3M ep:16K epch:9.74 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0 -.515 data_s:0.003 -INFO 2025-09-08 19:53:00 celerate.py:281 step:42K smpl:3M ep:17K epch:9.78 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0 -.513 data_s:0.003 -INFO 2025-09-08 19:54:44 celerate.py:281 step:42K smpl:3M ep:17K epch:9.83 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.516 data_s:0.003 -INFO 2025-09-08 19:56:28 celerate.py:281 step:42K smpl:3M ep:17K epch:9.88 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0 -.512 data_s:0.003 -INFO 2025-09-08 19:58:11 celerate.py:281 step:42K smpl:3M ep:17K epch:9.92 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0 -.514 data_s:0.003 -INFO 2025-09-08 19:59:55 celerate.py:281 step:43K smpl:3M ep:17K epch:9.97 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0 -.514 data_s:0.003 -INFO 2025-09-08 20:01:42 celerate.py:281 step:43K smpl:3M ep:17K epch:10.02 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s: -0.476 data_s:0.057 -INFO 2025-09-08 20:03:25 celerate.py:281 step:43K smpl:3M ep:17K epch:10.06 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: -0.471 data_s:0.043 -INFO 2025-09-08 20:05:09 celerate.py:281 step:43K smpl:3M ep:17K epch:10.11 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.515 data_s:0.004 -INFO 2025-09-08 20:06:53 celerate.py:281 step:43K smpl:3M ep:17K epch:10.16 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: -0.505 data_s:0.013 -INFO 2025-09-08 20:08:36 celerate.py:281 step:44K smpl:3M ep:17K epch:10.20 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-08 20:10:20 celerate.py:281 step:44K smpl:3M ep:17K epch:10.25 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 20:12:04 celerate.py:281 step:44K smpl:3M ep:17K epch:10.30 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-08 20:13:47 celerate.py:281 step:44K smpl:3M ep:18K epch:10.34 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: -0.503 data_s:0.011 -INFO 2025-09-08 20:15:31 celerate.py:281 step:44K smpl:3M ep:18K epch:10.39 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.416 data_s:0.102 -INFO 2025-09-08 20:17:15 celerate.py:281 step:45K smpl:3M ep:18K epch:10.44 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: -0.502 data_s:0.017 -INFO 2025-09-08 20:18:58 celerate.py:281 step:45K smpl:3M ep:18K epch:10.48 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-08 20:20:41 celerate.py:281 step:45K smpl:3M ep:18K epch:10.53 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.496 data_s:0.017 -INFO 2025-09-08 20:22:24 celerate.py:281 step:45K smpl:3M ep:18K epch:10.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.493 data_s:0.022 -INFO 2025-09-08 20:24:08 celerate.py:281 step:45K smpl:3M ep:18K epch:10.63 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.485 data_s:0.031 -INFO 2025-09-08 20:25:52 celerate.py:281 step:46K smpl:3M ep:18K epch:10.67 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: -0.518 data_s:0.003 -INFO 2025-09-08 20:27:36 celerate.py:281 step:46K smpl:3M ep:18K epch:10.72 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 20:29:19 celerate.py:281 step:46K smpl:3M ep:18K epch:10.77 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 20:31:04 celerate.py:281 step:46K smpl:3M ep:18K epch:10.81 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 20:32:47 celerate.py:281 step:46K smpl:3M ep:18K epch:10.86 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 20:34:32 celerate.py:281 step:47K smpl:3M ep:18K epch:10.91 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.518 data_s:0.003 -INFO 2025-09-08 20:36:15 celerate.py:281 step:47K smpl:3M ep:19K epch:10.95 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 20:38:01 celerate.py:281 step:47K smpl:3M ep:19K epch:11.00 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.416 data_s:0.114 -INFO 2025-09-08 20:39:45 celerate.py:281 step:47K smpl:3M ep:19K epch:11.05 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.429 data_s:0.086 -INFO 2025-09-08 20:41:28 celerate.py:281 step:47K smpl:3M ep:19K epch:11.09 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: -0.409 data_s:0.106 -INFO 2025-09-08 20:43:12 celerate.py:281 step:48K smpl:3M ep:19K epch:11.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.470 data_s:0.050 -INFO 2025-09-08 20:44:56 celerate.py:281 step:48K smpl:3M ep:19K epch:11.19 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 20:46:41 celerate.py:281 step:48K smpl:3M ep:19K epch:11.23 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 20:48:25 celerate.py:281 step:48K smpl:3M ep:19K epch:11.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.518 data_s:0.003 -INFO 2025-09-08 20:50:08 celerate.py:281 step:48K smpl:3M ep:19K epch:11.33 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.505 data_s:0.009 -INFO 2025-09-08 20:51:52 celerate.py:281 step:49K smpl:3M ep:19K epch:11.37 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: -0.449 data_s:0.067 -INFO 2025-09-08 20:53:35 celerate.py:281 step:49K smpl:3M ep:19K epch:11.42 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.420 data_s:0.094 -INFO 2025-09-08 20:55:19 celerate.py:281 step:49K smpl:3M ep:19K epch:11.47 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 20:57:01 celerate.py:281 step:49K smpl:3M ep:19K epch:11.51 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: -0.505 data_s:0.003 -INFO 2025-09-08 20:58:44 celerate.py:281 step:49K smpl:3M ep:20K epch:11.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-08 21:00:28 celerate.py:281 step:50K smpl:3M ep:20K epch:11.61 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 21:02:11 celerate.py:281 step:50K smpl:3M ep:20K epch:11.65 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.402 data_s:0.110 -INFO 2025-09-08 21:03:54 celerate.py:281 step:50K smpl:3M ep:20K epch:11.70 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.332 data_s:0.184 -INFO 2025-09-08 21:05:37 celerate.py:281 step:50K smpl:3M ep:20K epch:11.75 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s: -0.332 data_s:0.182 -INFO 2025-09-08 21:07:21 celerate.py:281 step:50K smpl:3M ep:20K epch:11.80 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: -0.466 data_s:0.049 -INFO 2025-09-08 21:09:05 celerate.py:281 step:51K smpl:3M ep:20K epch:11.84 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 21:10:49 celerate.py:281 step:51K smpl:3M ep:20K epch:11.89 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: -0.512 data_s:0.004 -INFO 2025-09-08 21:12:32 celerate.py:281 step:51K smpl:3M ep:20K epch:11.94 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: -0.484 data_s:0.032 -INFO 2025-09-08 21:14:17 celerate.py:281 step:51K smpl:3M ep:20K epch:11.98 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: -0.517 data_s:0.004 -INFO 2025-09-08 21:16:03 celerate.py:281 step:51K smpl:3M ep:20K epch:12.03 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.424 data_s:0.105 -INFO 2025-09-08 21:17:46 celerate.py:281 step:52K smpl:3M ep:20K epch:12.08 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.442 data_s:0.073 -INFO 2025-09-08 21:19:30 celerate.py:281 step:52K smpl:3M ep:21K epch:12.12 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: -0.511 data_s:0.007 -INFO 2025-09-08 21:21:15 celerate.py:281 step:52K smpl:3M ep:21K epch:12.17 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.520 data_s:0.003 -INFO 2025-09-08 21:22:59 celerate.py:281 step:52K smpl:3M ep:21K epch:12.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 21:24:43 celerate.py:281 step:52K smpl:3M ep:21K epch:12.26 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.518 data_s:0.003 -INFO 2025-09-08 21:26:27 celerate.py:281 step:53K smpl:3M ep:21K epch:12.31 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 21:28:11 celerate.py:281 step:53K smpl:3M ep:21K epch:12.36 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 21:29:55 celerate.py:281 step:53K smpl:3M ep:21K epch:12.40 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 21:31:39 celerate.py:281 step:53K smpl:3M ep:21K epch:12.45 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 21:33:23 celerate.py:281 step:53K smpl:3M ep:21K epch:12.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 21:35:08 celerate.py:281 step:54K smpl:3M ep:21K epch:12.54 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.519 data_s:0.003 -INFO 2025-09-08 21:36:51 celerate.py:281 step:54K smpl:3M ep:21K epch:12.59 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-08 21:38:36 celerate.py:281 step:54K smpl:3M ep:21K epch:12.64 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 21:40:18 celerate.py:281 step:54K smpl:3M ep:21K epch:12.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.473 data_s:0.038 -INFO 2025-09-08 21:42:02 celerate.py:281 step:54K smpl:3M ep:22K epch:12.73 loss:0.085 grdn:0.232 lr:2.5e-06 updt_s: -0.408 data_s:0.109 -INFO 2025-09-08 21:43:45 celerate.py:281 step:55K smpl:3M ep:22K epch:12.78 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.377 data_s:0.136 -INFO 2025-09-08 21:45:29 celerate.py:281 step:55K smpl:4M ep:22K epch:12.83 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s: -0.497 data_s:0.022 -INFO 2025-09-08 21:47:12 celerate.py:281 step:55K smpl:4M ep:22K epch:12.87 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-08 21:48:56 celerate.py:281 step:55K smpl:4M ep:22K epch:12.92 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.429 data_s:0.086 -INFO 2025-09-08 21:50:39 celerate.py:281 step:55K smpl:4M ep:22K epch:12.97 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: -0.454 data_s:0.059 -INFO 2025-09-08 21:52:25 celerate.py:281 step:56K smpl:4M ep:22K epch:13.01 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.459 data_s:0.072 -INFO 2025-09-08 21:54:08 celerate.py:281 step:56K smpl:4M ep:22K epch:13.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: -0.382 data_s:0.132 -INFO 2025-09-08 21:55:51 celerate.py:281 step:56K smpl:4M ep:22K epch:13.11 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.500 data_s:0.016 -INFO 2025-09-08 21:57:36 celerate.py:281 step:56K smpl:4M ep:22K epch:13.15 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 21:59:20 celerate.py:281 step:56K smpl:4M ep:22K epch:13.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: -0.518 data_s:0.003 -INFO 2025-09-08 22:01:03 celerate.py:281 step:57K smpl:4M ep:22K epch:13.25 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-08 22:02:48 celerate.py:281 step:57K smpl:4M ep:23K epch:13.29 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 22:04:32 celerate.py:281 step:57K smpl:4M ep:23K epch:13.34 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 22:06:14 celerate.py:281 step:57K smpl:4M ep:23K epch:13.39 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: -0.505 data_s:0.005 -bINFO 2025-09-08 22:07:59 celerate.py:281 step:57K smpl:4M ep:23K epch:13.43 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s -:0.496 data_s:0.026 -INFO 2025-09-08 22:09:43 celerate.py:281 step:58K smpl:4M ep:23K epch:13.48 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: -0.438 data_s:0.080 -INFO 2025-09-08 22:11:27 celerate.py:281 step:58K smpl:4M ep:23K epch:13.53 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: -0.444 data_s:0.073 -INFO 2025-09-08 22:13:11 celerate.py:281 step:58K smpl:4M ep:23K epch:13.57 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 22:14:55 celerate.py:281 step:58K smpl:4M ep:23K epch:13.62 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.518 data_s:0.003 -INFO 2025-09-08 22:16:39 celerate.py:281 step:58K smpl:4M ep:23K epch:13.67 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 22:18:22 celerate.py:281 step:59K smpl:4M ep:23K epch:13.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 22:20:05 celerate.py:281 step:59K smpl:4M ep:23K epch:13.76 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: -0.508 data_s:0.003 -INFO 2025-09-08 22:21:48 celerate.py:281 step:59K smpl:4M ep:23K epch:13.81 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s: -0.505 data_s:0.008 -INFO 2025-09-08 22:23:31 celerate.py:281 step:59K smpl:4M ep:23K epch:13.85 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.505 data_s:0.008 -INFO 2025-09-08 22:25:15 celerate.py:281 step:59K smpl:4M ep:24K epch:13.90 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 22:26:58 celerate.py:281 step:60K smpl:4M ep:24K epch:13.95 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: -0.493 data_s:0.022 -INFO 2025-09-08 22:28:42 celerate.py:281 step:60K smpl:4M ep:24K epch:14.00 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 22:30:29 celerate.py:281 step:60K smpl:4M ep:24K epch:14.04 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.435 data_s:0.101 -INFO 2025-09-08 22:30:29 celerate.py:295 Checkpoint policy after step 60000 -INFO 2025-09-08 22:32:14 celerate.py:281 step:60K smpl:4M ep:24K epch:14.09 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.508 data_s:0.003 -INFO 2025-09-08 22:33:58 celerate.py:281 step:60K smpl:4M ep:24K epch:14.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 22:35:42 celerate.py:281 step:61K smpl:4M ep:24K epch:14.18 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 22:37:25 celerate.py:281 step:61K smpl:4M ep:24K epch:14.23 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 22:39:09 celerate.py:281 step:61K smpl:4M ep:24K epch:14.28 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 22:40:52 celerate.py:281 step:61K smpl:4M ep:24K epch:14.32 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.509 data_s:0.003 -INFO 2025-09-08 22:42:35 celerate.py:281 step:61K smpl:4M ep:24K epch:14.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 22:44:18 celerate.py:281 step:62K smpl:4M ep:24K epch:14.42 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-08 22:46:02 celerate.py:281 step:62K smpl:4M ep:24K epch:14.46 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 22:47:47 celerate.py:281 step:62K smpl:4M ep:25K epch:14.51 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 22:49:30 celerate.py:281 step:62K smpl:4M ep:25K epch:14.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-08 22:51:14 celerate.py:281 step:62K smpl:4M ep:25K epch:14.60 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 22:52:58 celerate.py:281 step:63K smpl:4M ep:25K epch:14.65 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: -0.484 data_s:0.033 -INFO 2025-09-08 22:54:41 celerate.py:281 step:63K smpl:4M ep:25K epch:14.70 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.501 data_s:0.016 -INFO 2025-09-08 22:56:25 celerate.py:281 step:63K smpl:4M ep:25K epch:14.74 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.436 data_s:0.081 -INFO 2025-09-08 22:58:08 celerate.py:281 step:63K smpl:4M ep:25K epch:14.79 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s: -0.436 data_s:0.080 -INFO 2025-09-08 22:59:51 celerate.py:281 step:63K smpl:4M ep:25K epch:14.84 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.344 data_s:0.168 -INFO 2025-09-08 23:01:34 celerate.py:281 step:64K smpl:4M ep:25K epch:14.88 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.479 data_s:0.035 -INFO 2025-09-08 23:03:18 celerate.py:281 step:64K smpl:4M ep:25K epch:14.93 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-08 23:05:02 celerate.py:281 step:64K smpl:4M ep:25K epch:14.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 23:06:47 celerate.py:281 step:64K smpl:4M ep:25K epch:15.02 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.405 data_s:0.119 -INFO 2025-09-08 23:08:30 celerate.py:281 step:64K smpl:4M ep:26K epch:15.07 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: -0.393 data_s:0.121 -INFO 2025-09-08 23:10:13 celerate.py:281 step:65K smpl:4M ep:26K epch:15.12 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: -0.369 data_s:0.142 -INFO 2025-09-08 23:11:56 celerate.py:281 step:65K smpl:4M ep:26K epch:15.17 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: -0.360 data_s:0.156 -INFO 2025-09-08 23:13:40 celerate.py:281 step:65K smpl:4M ep:26K epch:15.21 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.333 data_s:0.182 -INFO 2025-09-08 23:15:23 celerate.py:281 step:65K smpl:4M ep:26K epch:15.26 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: -0.376 data_s:0.136 -INFO 2025-09-08 23:17:05 celerate.py:281 step:65K smpl:4M ep:26K epch:15.31 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s: -0.439 data_s:0.069 -INFO 2025-09-08 23:18:49 celerate.py:281 step:66K smpl:4M ep:26K epch:15.35 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.512 data_s:0.006 -INFO 2025-09-08 23:20:32 celerate.py:281 step:66K smpl:4M ep:26K epch:15.40 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-08 23:22:15 celerate.py:281 step:66K smpl:4M ep:26K epch:15.45 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.508 data_s:0.004 -INFO 2025-09-08 23:23:59 celerate.py:281 step:66K smpl:4M ep:26K epch:15.49 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.474 data_s:0.046 -INFO 2025-09-08 23:25:42 celerate.py:281 step:66K smpl:4M ep:26K epch:15.54 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-08 23:27:25 celerate.py:281 step:67K smpl:4M ep:26K epch:15.59 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.487 data_s:0.025 -INFO 2025-09-08 23:29:08 celerate.py:281 step:67K smpl:4M ep:26K epch:15.63 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.510 data_s:0.005 -INFO 2025-09-08 23:30:50 celerate.py:281 step:67K smpl:4M ep:27K epch:15.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.506 data_s:0.003 -INFO 2025-09-08 23:32:33 celerate.py:281 step:67K smpl:4M ep:27K epch:15.73 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: -0.509 data_s:0.003 -INFO 2025-09-08 23:34:16 celerate.py:281 step:67K smpl:4M ep:27K epch:15.77 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: -0.503 data_s:0.014 -INFO 2025-09-08 23:35:59 celerate.py:281 step:68K smpl:4M ep:27K epch:15.82 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-08 23:37:42 celerate.py:281 step:68K smpl:4M ep:27K epch:15.87 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.509 data_s:0.003 -INFO 2025-09-08 23:39:26 celerate.py:281 step:68K smpl:4M ep:27K epch:15.91 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-08 23:41:10 celerate.py:281 step:68K smpl:4M ep:27K epch:15.96 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-08 23:42:56 celerate.py:281 step:68K smpl:4M ep:27K epch:16.01 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.469 data_s:0.061 -INFO 2025-09-08 23:44:40 celerate.py:281 step:69K smpl:4M ep:27K epch:16.05 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.339 data_s:0.179 -INFO 2025-09-08 23:46:22 celerate.py:281 step:69K smpl:4M ep:27K epch:16.10 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.386 data_s:0.124 -INFO 2025-09-08 23:48:06 celerate.py:281 step:69K smpl:4M ep:27K epch:16.15 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-08 23:49:50 celerate.py:281 step:69K smpl:4M ep:27K epch:16.20 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 23:51:33 celerate.py:281 step:69K smpl:4M ep:27K epch:16.24 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: -0.501 data_s:0.014 -INFO 2025-09-08 23:53:17 celerate.py:281 step:70K smpl:4M ep:28K epch:16.29 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-08 23:54:59 celerate.py:281 step:70K smpl:4M ep:28K epch:16.34 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: -0.507 data_s:0.005 -INFO 2025-09-08 23:56:43 celerate.py:281 step:70K smpl:4M ep:28K epch:16.38 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-08 23:58:27 celerate.py:281 step:70K smpl:4M ep:28K epch:16.43 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-09 00:00:12 celerate.py:281 step:70K smpl:5M ep:28K epch:16.48 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.518 data_s:0.003 -INFO 2025-09-09 00:01:55 celerate.py:281 step:71K smpl:5M ep:28K epch:16.52 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-09 00:03:37 celerate.py:281 step:71K smpl:5M ep:28K epch:16.57 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: -0.508 data_s:0.003 -INFO 2025-09-09 00:05:20 celerate.py:281 step:71K smpl:5M ep:28K epch:16.62 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.509 data_s:0.003 -INFO 2025-09-09 00:07:04 celerate.py:281 step:71K smpl:5M ep:28K epch:16.66 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.519 data_s:0.003 -INFO 2025-09-09 00:08:47 celerate.py:281 step:71K smpl:5M ep:28K epch:16.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s: -0.509 data_s:0.003 -INFO 2025-09-09 00:10:30 celerate.py:281 step:72K smpl:5M ep:28K epch:16.76 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 00:12:13 celerate.py:281 step:72K smpl:5M ep:28K epch:16.80 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-09 00:13:58 celerate.py:281 step:72K smpl:5M ep:29K epch:16.85 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.522 data_s:0.003 -INFO 2025-09-09 00:15:40 celerate.py:281 step:72K smpl:5M ep:29K epch:16.90 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.506 data_s:0.003 -INFO 2025-09-09 00:17:23 celerate.py:281 step:72K smpl:5M ep:29K epch:16.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.510 data_s:0.004 -INFO 2025-09-09 00:19:05 celerate.py:281 step:73K smpl:5M ep:29K epch:16.99 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.505 data_s:0.003 -INFO 2025-09-09 00:20:51 celerate.py:281 step:73K smpl:5M ep:29K epch:17.04 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.437 data_s:0.092 -INFO 2025-09-09 00:22:34 celerate.py:281 step:73K smpl:5M ep:29K epch:17.08 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: -0.489 data_s:0.025 -INFO 2025-09-09 00:24:17 celerate.py:281 step:73K smpl:5M ep:29K epch:17.13 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 00:26:01 celerate.py:281 step:73K smpl:5M ep:29K epch:17.18 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 00:27:45 celerate.py:281 step:74K smpl:5M ep:29K epch:17.22 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-09 00:29:28 celerate.py:281 step:74K smpl:5M ep:29K epch:17.27 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.509 data_s:0.003 -INFO 2025-09-09 00:31:11 celerate.py:281 step:74K smpl:5M ep:29K epch:17.32 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-09 00:32:54 celerate.py:281 step:74K smpl:5M ep:29K epch:17.37 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-09 00:34:38 celerate.py:281 step:74K smpl:5M ep:29K epch:17.41 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-09 00:36:21 celerate.py:281 step:75K smpl:5M ep:30K epch:17.46 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 00:38:05 celerate.py:281 step:75K smpl:5M ep:30K epch:17.51 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-09 00:39:49 celerate.py:281 step:75K smpl:5M ep:30K epch:17.55 loss:0.086 grdn:0.251 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-09 00:41:32 celerate.py:281 step:75K smpl:5M ep:30K epch:17.60 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.506 data_s:0.005 -INFO 2025-09-09 00:43:14 celerate.py:281 step:75K smpl:5M ep:30K epch:17.65 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.390 data_s:0.122 -INFO 2025-09-09 00:44:58 celerate.py:281 step:76K smpl:5M ep:30K epch:17.69 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: -0.410 data_s:0.107 -INFO 2025-09-09 00:46:41 celerate.py:281 step:76K smpl:5M ep:30K epch:17.74 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: -0.427 data_s:0.085 -INFO 2025-09-09 00:48:24 celerate.py:281 step:76K smpl:5M ep:30K epch:17.79 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: -0.492 data_s:0.025 -INFO 2025-09-09 00:50:08 celerate.py:281 step:76K smpl:5M ep:30K epch:17.83 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-09 00:51:51 celerate.py:281 step:76K smpl:5M ep:30K epch:17.88 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-09 00:53:33 celerate.py:281 step:77K smpl:5M ep:30K epch:17.93 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-09 00:55:18 celerate.py:281 step:77K smpl:5M ep:30K epch:17.97 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.519 data_s:0.003 -INFO 2025-09-09 00:57:05 celerate.py:281 step:77K smpl:5M ep:31K epch:18.02 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.486 data_s:0.046 -INFO 2025-09-09 00:58:47 celerate.py:281 step:77K smpl:5M ep:31K epch:18.07 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.509 data_s:0.003 -INFO 2025-09-09 01:00:31 celerate.py:281 step:77K smpl:5M ep:31K epch:18.11 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-09 01:02:15 celerate.py:281 step:78K smpl:5M ep:31K epch:18.16 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-09 01:03:56 celerate.py:281 step:78K smpl:5M ep:31K epch:18.21 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.498 data_s:0.008 -INFO 2025-09-09 01:05:40 celerate.py:281 step:78K smpl:5M ep:31K epch:18.25 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.508 data_s:0.010 -INFO 2025-09-09 01:07:24 celerate.py:281 step:78K smpl:5M ep:31K epch:18.30 loss:0.085 grdn:0.240 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-09 01:09:07 celerate.py:281 step:78K smpl:5M ep:31K epch:18.35 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 01:10:51 celerate.py:281 step:79K smpl:5M ep:31K epch:18.40 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-09 01:12:35 celerate.py:281 step:79K smpl:5M ep:31K epch:18.44 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-09 01:14:17 celerate.py:281 step:79K smpl:5M ep:31K epch:18.49 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: -0.508 data_s:0.003 -INFO 2025-09-09 01:16:01 celerate.py:281 step:79K smpl:5M ep:31K epch:18.54 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 01:17:43 celerate.py:281 step:79K smpl:5M ep:31K epch:18.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s: -0.507 data_s:0.003 -INFO 2025-09-09 01:19:26 celerate.py:281 step:80K smpl:5M ep:32K epch:18.63 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 01:21:09 celerate.py:281 step:80K smpl:5M ep:32K epch:18.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 01:22:52 celerate.py:281 step:80K smpl:5M ep:32K epch:18.72 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.501 data_s:0.011 -INFO 2025-09-09 01:22:52 celerate.py:295 Checkpoint policy after step 80000 -INFO 2025-09-09 01:24:37 celerate.py:281 step:80K smpl:5M ep:32K epch:18.77 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-09 01:26:20 celerate.py:281 step:80K smpl:5M ep:32K epch:18.82 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-09 01:28:05 celerate.py:281 step:81K smpl:5M ep:32K epch:18.86 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-09 01:29:48 celerate.py:281 step:81K smpl:5M ep:32K epch:18.91 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 01:31:32 celerate.py:281 step:81K smpl:5M ep:32K epch:18.96 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.517 data_s:0.003 -INFO 2025-09-09 01:33:18 celerate.py:281 step:81K smpl:5M ep:32K epch:19.00 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s: -0.487 data_s:0.042 -INFO 2025-09-09 01:35:01 celerate.py:281 step:81K smpl:5M ep:32K epch:19.05 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.461 data_s:0.054 -INFO 2025-09-09 01:36:46 celerate.py:281 step:82K smpl:5M ep:32K epch:19.10 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.450 data_s:0.071 -INFO 2025-09-09 01:38:29 celerate.py:281 step:82K smpl:5M ep:32K epch:19.14 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.451 data_s:0.064 -INFO 2025-09-09 01:40:12 celerate.py:281 step:82K smpl:5M ep:32K epch:19.19 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-09 01:41:54 celerate.py:281 step:82K smpl:5M ep:33K epch:19.24 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: -0.480 data_s:0.030 -INFO 2025-09-09 01:43:37 celerate.py:281 step:82K smpl:5M ep:33K epch:19.28 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.434 data_s:0.080 -INFO 2025-09-09 01:45:20 celerate.py:281 step:83K smpl:5M ep:33K epch:19.33 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 01:47:04 celerate.py:281 step:83K smpl:5M ep:33K epch:19.38 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 01:48:49 celerate.py:281 step:83K smpl:5M ep:33K epch:19.42 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.520 data_s:0.003 -INFO 2025-09-09 01:50:32 celerate.py:281 step:83K smpl:5M ep:33K epch:19.47 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 01:52:15 celerate.py:281 step:83K smpl:5M ep:33K epch:19.52 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 01:53:59 celerate.py:281 step:84K smpl:5M ep:33K epch:19.57 loss:0.085 grdn:0.236 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 01:55:41 celerate.py:281 step:84K smpl:5M ep:33K epch:19.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.508 data_s:0.003 -INFO 2025-09-09 01:57:24 celerate.py:281 step:84K smpl:5M ep:33K epch:19.66 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s: -0.500 data_s:0.013 -INFO 2025-09-09 01:59:07 celerate.py:281 step:84K smpl:5M ep:33K epch:19.71 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-09 02:00:50 celerate.py:281 step:84K smpl:5M ep:33K epch:19.75 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-09 02:02:34 celerate.py:281 step:85K smpl:5M ep:34K epch:19.80 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-09 02:04:17 celerate.py:281 step:85K smpl:5M ep:34K epch:19.85 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.511 data_s:0.003 -INFO 2025-09-09 02:05:59 celerate.py:281 step:85K smpl:5M ep:34K epch:19.89 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.507 data_s:0.004 -INFO 2025-09-09 02:07:43 celerate.py:281 step:85K smpl:5M ep:34K epch:19.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.505 data_s:0.011 -INFO 2025-09-09 02:09:26 celerate.py:281 step:85K smpl:5M ep:34K epch:19.99 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.392 data_s:0.124 -INFO 2025-09-09 02:11:14 celerate.py:281 step:86K smpl:5M ep:34K epch:20.03 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.435 data_s:0.100 -INFO 2025-09-09 02:12:57 celerate.py:281 step:86K smpl:5M ep:34K epch:20.08 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-09 02:14:41 celerate.py:281 step:86K smpl:6M ep:34K epch:20.13 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: -0.516 data_s:0.003 -INFO 2025-09-09 02:16:25 celerate.py:281 step:86K smpl:6M ep:34K epch:20.17 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s: -0.514 data_s:0.003 -INFO 2025-09-09 02:18:09 celerate.py:281 step:86K smpl:6M ep:34K epch:20.22 loss:0.085 grdn:0.238 lr:2.5e-06 updt_s: -0.506 data_s:0.010 -INFO 2025-09-09 02:19:53 celerate.py:281 step:87K smpl:6M ep:34K epch:20.27 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.515 data_s:0.003 -INFO 2025-09-09 02:21:35 celerate.py:281 step:87K smpl:6M ep:34K epch:20.31 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.501 data_s:0.007 -INFO 2025-09-09 02:23:17 celerate.py:281 step:87K smpl:6M ep:34K epch:20.36 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: -0.432 data_s:0.080 -INFO 2025-09-09 02:25:01 celerate.py:281 step:87K smpl:6M ep:35K epch:20.41 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.445 data_s:0.073 -INFO 2025-09-09 02:26:43 celerate.py:281 step:87K smpl:6M ep:35K epch:20.45 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: -0.401 data_s:0.107 -INFO 2025-09-09 02:28:26 celerate.py:281 step:88K smpl:6M ep:35K epch:20.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.502 data_s:0.009 -INFO 2025-09-09 02:30:09 celerate.py:281 step:88K smpl:6M ep:35K epch:20.55 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s: -0.491 data_s:0.021 -INFO 2025-09-09 02:31:52 celerate.py:281 step:88K smpl:6M ep:35K epch:20.59 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.391 data_s:0.124 -INFO 2025-09-09 02:33:35 celerate.py:281 step:88K smpl:6M ep:35K epch:20.64 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.332 data_s:0.180 -INFO 2025-09-09 02:35:18 celerate.py:281 step:88K smpl:6M ep:35K epch:20.69 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s: -0.333 data_s:0.181 -INFO 2025-09-09 02:37:02 celerate.py:281 step:89K smpl:6M ep:35K epch:20.74 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s: -0.332 data_s:0.185 -INFO 2025-09-09 02:38:47 celerate.py:281 step:89K smpl:6M ep:35K epch:20.78 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.332 data_s:0.190 -INFO 2025-09-09 02:40:30 celerate.py:281 step:89K smpl:6M ep:35K epch:20.83 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.330 data_s:0.187 -INFO 2025-09-09 02:42:12 celerate.py:281 step:89K smpl:6M ep:35K epch:20.88 loss:0.085 grdn:0.243 lr:2.5e-06 updt_s: -0.331 data_s:0.177 -INFO 2025-09-09 02:43:56 celerate.py:281 step:89K smpl:6M ep:35K epch:20.92 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.330 data_s:0.185 -INFO 2025-09-09 02:45:39 celerate.py:281 step:90K smpl:6M ep:36K epch:20.97 loss:0.087 grdn:0.252 lr:2.5e-06 updt_s: -0.335 data_s:0.182 -INFO 2025-09-09 02:47:25 celerate.py:281 step:90K smpl:6M ep:36K epch:21.02 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: -0.330 data_s:0.197 -INFO 2025-09-09 02:49:08 celerate.py:281 step:90K smpl:6M ep:36K epch:21.06 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.330 data_s:0.182 -INFO 2025-09-09 02:50:51 celerate.py:281 step:90K smpl:6M ep:36K epch:21.11 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.334 data_s:0.181 -INFO 2025-09-09 02:52:34 celerate.py:281 step:90K smpl:6M ep:36K epch:21.16 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.330 data_s:0.183 -INFO 2025-09-09 02:54:17 celerate.py:281 step:91K smpl:6M ep:36K epch:21.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s: -0.342 data_s:0.173 -INFO 2025-09-09 02:56:01 celerate.py:281 step:91K smpl:6M ep:36K epch:21.25 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: -0.331 data_s:0.189 -INFO 2025-09-09 02:57:44 celerate.py:281 step:91K smpl:6M ep:36K epch:21.30 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.329 data_s:0.182 -INFO 2025-09-09 02:59:27 celerate.py:281 step:91K smpl:6M ep:36K epch:21.34 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s: -0.341 data_s:0.175 -INFO 2025-09-09 03:01:11 celerate.py:281 step:91K smpl:6M ep:36K epch:21.39 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.330 data_s:0.188 -INFO 2025-09-09 03:02:54 celerate.py:281 step:92K smpl:6M ep:36K epch:21.44 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.328 data_s:0.186 -INFO 2025-09-09 03:04:39 celerate.py:281 step:92K smpl:6M ep:36K epch:21.48 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.329 data_s:0.195 -INFO 2025-09-09 03:06:22 celerate.py:281 step:92K smpl:6M ep:36K epch:21.53 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.330 data_s:0.183 -INFO 2025-09-09 03:08:04 celerate.py:281 step:92K smpl:6M ep:37K epch:21.58 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.462 data_s:0.051 -INFO 2025-09-09 03:09:48 celerate.py:281 step:92K smpl:6M ep:37K epch:21.62 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: -0.407 data_s:0.108 -INFO 2025-09-09 03:11:32 celerate.py:281 step:93K smpl:6M ep:37K epch:21.67 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s: -0.333 data_s:0.185 -INFO 2025-09-09 03:13:15 celerate.py:281 step:93K smpl:6M ep:37K epch:21.72 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s: -0.329 data_s:0.187 -INFO 2025-09-09 03:14:58 celerate.py:281 step:93K smpl:6M ep:37K epch:21.77 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.357 data_s:0.156 -INFO 2025-09-09 03:16:41 celerate.py:281 step:93K smpl:6M ep:37K epch:21.81 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.487 data_s:0.027 -INFO 2025-09-09 03:18:25 celerate.py:281 step:93K smpl:6M ep:37K epch:21.86 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-09 03:20:08 celerate.py:281 step:94K smpl:6M ep:37K epch:21.91 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s: -0.512 data_s:0.003 -INFO 2025-09-09 03:21:51 celerate.py:281 step:94K smpl:6M ep:37K epch:21.95 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.508 data_s:0.004 -INFO 2025-09-09 03:23:38 celerate.py:281 step:94K smpl:6M ep:37K epch:22.00 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: -0.429 data_s:0.104 -INFO 2025-09-09 03:25:20 celerate.py:281 step:94K smpl:6M ep:37K epch:22.05 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.328 data_s:0.183 -INFO 2025-09-09 03:27:04 celerate.py:281 step:94K smpl:6M ep:37K epch:22.09 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: -0.329 data_s:0.191 -INFO 2025-09-09 03:28:47 celerate.py:281 step:95K smpl:6M ep:37K epch:22.14 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s: -0.329 data_s:0.186 -INFO 2025-09-09 03:30:30 celerate.py:281 step:95K smpl:6M ep:38K epch:22.19 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.346 data_s:0.166 -INFO 2025-09-09 03:32:13 celerate.py:281 step:95K smpl:6M ep:38K epch:22.23 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: -0.334 data_s:0.181 -INFO 2025-09-09 03:33:56 celerate.py:281 step:95K smpl:6M ep:38K epch:22.28 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.402 data_s:0.110 -INFO 2025-09-09 03:35:39 celerate.py:281 step:95K smpl:6M ep:38K epch:22.33 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s: -0.353 data_s:0.161 -INFO 2025-09-09 03:37:22 celerate.py:281 step:96K smpl:6M ep:38K epch:22.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s: -0.356 data_s:0.158 -INFO 2025-09-09 03:39:04 celerate.py:281 step:96K smpl:6M ep:38K epch:22.42 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.379 data_s:0.131 -INFO 2025-09-09 03:40:49 celerate.py:281 step:96K smpl:6M ep:38K epch:22.47 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s: -0.344 data_s:0.175 -INFO 2025-09-09 03:42:32 celerate.py:281 step:96K smpl:6M ep:38K epch:22.51 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s: -0.331 data_s:0.185 -INFO 2025-09-09 03:44:15 celerate.py:281 step:96K smpl:6M ep:38K epch:22.56 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s: -0.331 data_s:0.183 -INFO 2025-09-09 03:45:58 celerate.py:281 step:97K smpl:6M ep:38K epch:22.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s: -0.330 data_s:0.184 -INFO 2025-09-09 03:47:43 celerate.py:281 step:97K smpl:6M ep:38K epch:22.65 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s: -0.331 data_s:0.188 -INFO 2025-09-09 03:49:27 celerate.py:281 step:97K smpl:6M ep:38K epch:22.70 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.333 data_s:0.185 -INFO 2025-09-09 03:51:10 celerate.py:281 step:97K smpl:6M ep:39K epch:22.75 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s: -0.330 data_s:0.185 -INFO 2025-09-09 03:52:54 celerate.py:281 step:97K smpl:6M ep:39K epch:22.79 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s: -0.330 data_s:0.192 -INFO 2025-09-09 03:54:37 celerate.py:281 step:98K smpl:6M ep:39K epch:22.84 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.329 data_s:0.185 -INFO 2025-09-09 03:56:21 celerate.py:281 step:98K smpl:6M ep:39K epch:22.89 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s: -0.329 data_s:0.187 -INFO 2025-09-09 03:58:04 celerate.py:281 step:98K smpl:6M ep:39K epch:22.94 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.329 data_s:0.185 -INFO 2025-09-09 03:59:46 celerate.py:281 step:98K smpl:6M ep:39K epch:22.98 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s: -0.329 data_s:0.183 -INFO 2025-09-09 04:01:32 celerate.py:281 step:98K smpl:6M ep:39K epch:23.03 loss:0.087 grdn:0.250 lr:2.5e-06 updt_s: -0.376 data_s:0.151 -INFO 2025-09-09 04:03:16 celerate.py:281 step:99K smpl:6M ep:39K epch:23.08 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s: -0.329 data_s:0.187 -INFO 2025-09-09 04:04:59 celerate.py:281 step:99K smpl:6M ep:39K epch:23.12 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s: -0.379 data_s:0.136 -INFO 2025-09-09 04:06:42 celerate.py:281 step:99K smpl:6M ep:39K epch:23.17 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s: -0.513 data_s:0.003 -INFO 2025-09-09 04:08:25 celerate.py:281 step:99K smpl:6M ep:39K epch:23.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s: -0.510 data_s:0.003 -INFO 2025-09-09 04:10:07 celerate.py:281 step:99K smpl:6M ep:39K epch:23.26 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s: -0.470 data_s:0.039 -INFO 2025-09-09 04:11:50 celerate.py:281 step:100K smpl:6M ep:39K epch:23.31 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s -:0.347 data_s:0.169 -INFO 2025-09-09 04:13:34 celerate.py:281 step:100K smpl:6M ep:40K epch:23.36 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s -:0.330 data_s:0.185 -INFO 2025-09-09 04:15:17 celerate.py:281 step:100K smpl:6M ep:40K epch:23.40 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s -:0.356 data_s:0.156 -INFO 2025-09-09 04:15:17 celerate.py:295 Checkpoint policy after step 100000 -INFO 2025-09-09 04:15:18 celerate.py:359 End of training -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear -(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ tmux capture-pane -pS - > tmux_log.txt - - - - - - - - - - From a19d7fb6bfdcd55b3c8475d1a249b7f815cda092 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:51:53 +0000 Subject: [PATCH 06/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/6_evaluate_libero.sh | 2 +- src/lerobot/envs/factory.py | 8 +- src/lerobot/envs/libero.py | 55 ++-- src/lerobot/envs/utils.py | 7 +- src/lerobot/policies/factory.py | 2 +- src/lerobot/policies/normalize.py | 6 +- .../policies/smolpi0/configuration_smolpi0.py | 22 +- .../policies/smolpi0/flex_attention.py | 10 +- .../policies/smolpi0/modeling_smolpi0.py | 211 +++++++++----- .../policies/smolpi0/smolvlm_with_expert.py | 262 ++++++++++++------ .../policies/smolvla/modeling_smolvla.py | 3 +- src/lerobot/policies/smolvla/saver.txt | 2 +- .../policies/smolvla/smolvlm_with_expert.py | 2 +- src/lerobot/scripts/eval.py | 95 ++++--- src/lerobot/scripts/train.py | 17 +- src/lerobot/scripts/train_2.py | 10 +- src/lerobot/scripts/train_accelerate.py | 9 +- 17 files changed, 469 insertions(+), 254 deletions(-) diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh index 46355dfa1..a7e235fcd 100644 --- a/examples/6_evaluate_libero.sh +++ b/examples/6_evaluate_libero.sh @@ -55,4 +55,4 @@ python src/lerobot/scripts/eval.py \ # --num_trials_per_task 10 \ # --video_out_path "data/libero/videos" \ # --device "cuda" \ -# --seed 7 \ No newline at end of file +# --seed 7 diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py index d38b2eed3..b74af7276 100644 --- a/src/lerobot/envs/factory.py +++ b/src/lerobot/envs/factory.py @@ -61,9 +61,9 @@ def make_env( env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv - if "libero" in cfg.type: from lerobot.envs.libero import create_libero_envs + return create_libero_envs( task=cfg.task, n_envs=n_envs, @@ -74,17 +74,16 @@ def make_env( multitask_eval=cfg.multitask_eval, ) - package_name = f"gym_{cfg.type}" try: importlib.import_module(package_name) except ModuleNotFoundError as e: raise ModuleNotFoundError( - f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\"" + f'{package_name} is not installed. Install with: pip install "lerobot[{cfg.type}]"' ) from e gym_handle = f"{package_name}/{cfg.task}" - + def _make_one(): return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {})) @@ -93,4 +92,3 @@ def make_env( # normalize to {suite: {task_id: vec_env}} for consistency suite_name = cfg.type # e.g., "pusht", "aloha" return {suite_name: {0: vec}} - diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index ff1574416..0672f583f 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -1,11 +1,13 @@ from __future__ import annotations +import logging import math import os from collections import defaultdict from collections.abc import Callable from itertools import chain -from typing import Any +from typing import Any, Dict, List +from collections.abc import Callable, Iterable, Mapping, Sequence import gymnasium as gym import numpy as np @@ -14,16 +16,12 @@ from gymnasium import spaces from libero.libero import benchmark, get_libero_path from libero.libero.envs import OffScreenRenderEnv -import logging -from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Mapping, Sequence - - logger = logging.getLogger(__name__) # ---- Helpers ----------------------------------------------------------------- -def _parse_camera_names(camera_name: str | Sequence[str]) -> List[str]: + +def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]: """Normalize camera_name into a non-empty list of strings.""" if isinstance(camera_name, str): cams = [c.strip() for c in camera_name.split(",") if c.strip()] @@ -47,14 +45,14 @@ def _get_suite(name: str): return suite -def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> List[int]: +def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[int]: """Validate/normalize task ids. If None β†’ all tasks.""" if task_ids is None: return list(range(total_tasks)) - ids = sorted(set(int(t) for t in task_ids)) + ids = sorted({int(t) for t in task_ids}) for t in ids: if t < 0 or t >= total_tasks: - raise ValueError(f"task_id {t} out of range [0, {total_tasks-1}].") + raise ValueError(f"task_id {t} out of range [0, {total_tasks - 1}].") return ids @@ -64,16 +62,25 @@ def _make_env_fns( suite_name: str, task_id: int, n_envs: int, - camera_names: List[str], + camera_names: list[str], init_states: bool, gym_kwargs: Mapping[str, Any], LiberoEnv: type, # injected to avoid forward ref issues if needed -) -> List[Callable[[], "LiberoEnv"]]: +) -> list[Callable[[], LiberoEnv]]: """Build n_envs factory callables for a single (suite, task_id).""" joined_cams = ",".join(camera_names) # keep backward-compat: downstream expects a string - fns: List[Callable[[], "LiberoEnv"]] = [] + fns: list[Callable[[], LiberoEnv]] = [] for i in range(n_envs): - def _mk(i=i, suite=suite, task_id=task_id, suite_name=suite_name, joined_cams=joined_cams, init_states=init_states, gym_kwargs=dict(gym_kwargs)): + + def _mk( + i=i, + suite=suite, + task_id=task_id, + suite_name=suite_name, + joined_cams=joined_cams, + init_states=init_states, + gym_kwargs=dict(gym_kwargs), + ): return LiberoEnv( task_suite=suite, task_id=task_id, @@ -83,11 +90,14 @@ def _make_env_fns( episode_index=i, **gym_kwargs, ) + fns.append(_mk) return fns + # ---- Main API ---------------------------------------------------------------- + def create_libero_envs( task: str, n_envs: int, @@ -130,12 +140,15 @@ def create_libero_envs( logger.info( "Creating LIBERO envs | suites=%s | n_envs(per task)=%d | init_states=%s | multitask_eval=%s", - suite_names, n_envs, init_states, bool(multitask_eval) + suite_names, + n_envs, + init_states, + bool(multitask_eval), ) if task_ids_filter is not None: logger.info("Restricting to task_ids=%s", task_ids_filter) - out: Dict[str, Dict[int, Any]] = defaultdict(dict) + out: dict[str, dict[int, Any]] = defaultdict(dict) for suite_name in suite_names: suite = _get_suite(suite_name) @@ -161,6 +174,8 @@ def create_libero_envs( # return plain dicts for predictability return {suite: dict(task_map) for suite, task_map in out.items()} + + def quat2axisangle(quat): """ Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55 @@ -256,10 +271,10 @@ class LiberoEnv(gym.Env): self._env = self._make_envs_task(task_suite, self.task_id) TASK_SUITE_MAX_STEPS: dict[str, int] = { "libero_spatial": 220, # longest training demo has 193 steps - "libero_object": 280, # longest training demo has 254 steps - "libero_goal": 300, # longest training demo has 270 steps - "libero_10": 520, # longest training demo has 505 steps - "libero_90": 400, # longest training demo has 373 steps + "libero_object": 280, # longest training demo has 254 steps + "libero_goal": 300, # longest training demo has 270 steps + "libero_10": 520, # longest training demo has 505 steps + "libero_90": 400, # longest training demo has 373 steps } default_steps = 500 self._max_episode_steps = TASK_SUITE_MAX_STEPS.get(task_suite_name, default_steps) diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index 9490f670e..d65f7f29f 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -80,6 +80,7 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten return return_observations + def preprocess_observation1( observations: dict[str, np.ndarray], cfg: dict[str, Any] = None ) -> dict[str, Tensor]: @@ -130,6 +131,8 @@ def preprocess_observation1( if "task" in observations: return_observations["task"] = observations["task"] return return_observations + + def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]: # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is # (need to also refactor preprocess_observation and externalize normalization from policies) @@ -183,6 +186,7 @@ def add_envs_task(env: gym.vector.VectorEnv, observation: dict[str, Any]) -> dic observation["task"] = ["" for _ in range(num_envs)] return observation + def _close_single_env(env: Any) -> None: """Try to close a single env object if it exposes .close().""" try: @@ -193,6 +197,7 @@ def _close_single_env(env: Any) -> None: # Best-effort close: log but don't raise LOG.debug("Exception while closing env %s: %s", env, exc) + def close_envs(env_or_collection: Any) -> None: """ Close a single env or any nested structure of envs. @@ -225,4 +230,4 @@ def close_envs(env_or_collection: Any) -> None: # Fallback: try to close if possible if hasattr(env_or_collection, "close"): - _close_single_env(env_or_collection) \ No newline at end of file + _close_single_env(env_or_collection) diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index 03fa44a2a..cc1b0480d 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -31,10 +31,10 @@ from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.policies.sac.configuration_sac import SACConfig from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig -from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config def get_policy_class(name: str) -> PreTrainedPolicy: diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py index 043265b1b..646c330cb 100644 --- a/src/lerobot/policies/normalize.py +++ b/src/lerobot/policies/normalize.py @@ -309,8 +309,8 @@ class NormalizePerRobotType(nn.Module): getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types ] if norm_mode is NormalizationMode.MEAN_STD: - mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))],dim=0) - std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))],dim=0) + mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0) + std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0) if batch[key].ndim == 3: mean = mean.unsqueeze(1) std = std.unsqueeze(1) @@ -332,6 +332,8 @@ class NormalizePerRobotType(nn.Module): else: raise ValueError(norm_mode) return batch + + # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization # and remove the `Normalize` and `Unnormalize` classes. def _initialize_stats_buffers( diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py index c3605cd82..e39d17f15 100644 --- a/src/lerobot/policies/smolpi0/configuration_smolpi0.py +++ b/src/lerobot/policies/smolpi0/configuration_smolpi0.py @@ -14,12 +14,12 @@ from dataclasses import dataclass, field +from lerobot.configs.policies import PreTrainedConfig +from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature from lerobot.optim.optimizers import AdamWConfig from lerobot.optim.schedulers import ( CosineDecayWithWarmupSchedulerConfig, ) -from lerobot.configs.policies import PreTrainedConfig -from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature @dataclass @@ -52,7 +52,7 @@ class SMOLPI0Config(PreTrainedConfig): max_action_dim: int = 32 # Image preprocessing - resize_imgs_with_padding: tuple[int, int] = (512, 512) #(224, 224) + resize_imgs_with_padding: tuple[int, int] = (512, 512) # (224, 224) # Add empty images. Used by pi0_aloha_sim which adds the empty # left and right wrist cameras in addition to the top camera. @@ -107,14 +107,14 @@ class SMOLPI0Config(PreTrainedConfig): add_image_special_tokens: bool = False add_prompt_template: bool = False - prefix_prompt_template: str = f"<|im_start|>User: What action should the robot take to" - suffix_prompt_template: str = f"?\nAssistant:" + prefix_prompt_template: str = "<|im_start|>User: What action should the robot take to" + suffix_prompt_template: str = "?\nAssistant:" attention_mode: str = "self_attn" - prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length + prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length - past_obs_keys: str = f"image" + past_obs_keys: str = "image" add_local_special_image_tokens: bool = False @@ -122,7 +122,7 @@ class SMOLPI0Config(PreTrainedConfig): state_to_prefix: bool = False - pad_language_to: str = "longest" # "max_length" + pad_language_to: str = "longest" # "max_length" num_expert_layers: int = -1 num_vlm_layers: int = -1 @@ -144,9 +144,9 @@ class SMOLPI0Config(PreTrainedConfig): shuffle_camera_positions: bool = False vlm_img_size: int = -1 - + regression_loss: bool = False - + def __post_init__(self): super().__post_init__() if self.vlm_img_size > 0: @@ -198,7 +198,7 @@ class SMOLPI0Config(PreTrainedConfig): ) @property - def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations + def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, self.n_obs_gap)][::-1] @property diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py index 13950f743..732920af2 100644 --- a/src/lerobot/policies/smolpi0/flex_attention.py +++ b/src/lerobot/policies/smolpi0/flex_attention.py @@ -85,7 +85,7 @@ def flex_attention_forward( b_mask, h_mask, q_len, kv_len = causal_mask.shape # The shape of your mask - block_size = 128 # limitation of flex attention + block_size = 128 # limitation of flex attention q_len_rounded = _round_up_to_multiple(q_len, block_size) kv_len_rounded = _round_up_to_multiple(kv_len, block_size) @@ -95,7 +95,7 @@ def flex_attention_forward( pad_k = kv_len_rounded - kv_len if pad_q > 0 or pad_k > 0: padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0) - else: + else: padded_causal_mask = causal_mask mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask) @@ -107,21 +107,21 @@ def flex_attention_forward( KV_LEN=kv_len_rounded, device=causal_mask.device, ) - + mask_mod_fn_padded = precomputed_mask_factory(mask_4d) # FIXME(mshukor): compile mask torch.compile(create_block_mask) create_block_mask_compiled = torch.compile(create_block_mask) block_mask = create_block_mask_compiled( mask_mod=mask_mod_fn_padded, B=b_mask, - H=None, # + H=None, # Q_LEN=q_len_rounded, KV_LEN=kv_len_rounded, BLOCK_SIZE=block_size, device=causal_mask.device, _compile=False, ) - padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states + padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states # mask is applied inside the kernel, ideally more efficiently than score_mod. diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py index 9a128f7b6..765a5901a 100644 --- a/src/lerobot/policies/smolpi0/modeling_smolpi0.py +++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py @@ -50,9 +50,10 @@ policy = Pi0Policy.from_pretrained("lerobot/pi0") """ import math -from collections import deque import os import re +from collections import deque + import safetensors import torch import torch.nn.functional as F # noqa: N812 @@ -66,12 +67,11 @@ from lerobot.policies.normalize import ( Unnormalize, UnnormalizePerRobotType, ) -from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config -from lerobot.policies.smolpi0.smolvlm_with_expert import ( - SmolVLMWithExpertModel -) from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config +from lerobot.policies.smolpi0.smolvlm_with_expert import SmolVLMWithExpertModel from lerobot.utils.utils import get_safe_dtype + OBS_IMAGE = "observation.image" OBS_IMAGES = "observation.images" ACTION = "action" @@ -86,10 +86,13 @@ IMAGES_ORDER = { OBS_IMAGE_3: 2, OBS_IMAGE_4: 3, } +import random + from lerobot.policies.utils import ( populate_queues, ) -import random + + def create_sinusoidal_pos_embedding( time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" ) -> Tensor: @@ -171,7 +174,10 @@ def resize_with_pad(img, width, height, pad_value=-1): padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) return padded_img + _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") + + def canonicalise(k: str) -> str: """ Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a @@ -179,6 +185,7 @@ def canonicalise(k: str) -> str: """ return _VARIANT_RE.sub(".buffer_", k) + def standardise_state_dict( checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True ) -> tuple[dict[str, torch.Tensor], list[str]]: @@ -209,6 +216,7 @@ def standardise_state_dict( out.update({k: checkpoint[k] for k in unmatched}) return out, unmatched + def load_smolvla( model: torch.nn.Module, filename: str | os.PathLike, @@ -237,6 +245,8 @@ def load_smolvla( ) return model + + def pad_vector(vector, new_dim): """Can be (batch_size x sequence_length x features_dimension) or (batch_size x features_dimension) @@ -286,6 +296,7 @@ def aloha_gripper_to_angular(value): # The values 0.4 and 1.5 were measured on an actual Trossen robot. return normalize(value, min_val=0.4, max_val=1.5) + def rename_checkpoint_keys(checkpoint: dict, rename_str: str): """ Renames keys in a checkpoint dictionary based on the given rename string. @@ -307,6 +318,8 @@ def rename_checkpoint_keys(checkpoint: dict, rename_str: str): k = k.replace(old_key, new_key) new_checkpoint[k] = v return new_checkpoint + + def aloha_gripper_from_angular(value): # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha. # Note that the units are still angular but the range is different. @@ -324,6 +337,7 @@ def aloha_gripper_from_angular_inv(value): value = unnormalize(value, min_val=-0.6213, max_val=1.4910) return normalize(value, min_val=0.4, max_val=1.5) + class SMOLPI0Policy(PreTrainedPolicy): """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" @@ -374,7 +388,9 @@ class SMOLPI0Policy(PreTrainedPolicy): self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer self.model = VLAFlowMatching(config) - self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split(",") + self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split( + "," + ) self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 self.reset() @@ -389,31 +405,20 @@ class SMOLPI0Policy(PreTrainedPolicy): for k in self.config.input_features: if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]): self._queues[k] = deque(maxlen=self.config.n_obs_steps) - + def get_optim_params(self) -> dict: if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr: params = [ + {"params": [p for n, p in self.named_parameters() if ".vlm." not in n and p.requires_grad]}, { - "params": [ - p - for n, p in self.named_parameters() - if not ".vlm." in n and p.requires_grad - ] - }, - { - "params": [ - p - for n, p in self.named_parameters() - if ".vlm." in n and p.requires_grad - ], + "params": [p for n, p in self.named_parameters() if ".vlm." in n and p.requires_grad], "lr": self.config.optimizer_lr_vlm, }, ] return params - + else: return self.parameters() - def merge_peft_model_weights(self) -> None: if "lora" in self.config.peft_method: @@ -438,9 +443,7 @@ class SMOLPI0Policy(PreTrainedPolicy): state = self.prepare_state(batch) lang_tokens, lang_masks = self.prepare_language(batch) - actions = self.model.sample_actions( - images, img_masks, lang_tokens, lang_masks, state, noise=noise - ) + actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise) # Unpad actions original_action_dim = self.config.action_feature.shape[0] actions = actions[:, :, :original_action_dim] @@ -469,6 +472,7 @@ class SMOLPI0Policy(PreTrainedPolicy): device=map_location, checkpoint_keys_mapping="model._orig_mod.//model.", ) + @torch.no_grad def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: """Select a single action given environment observations. @@ -564,8 +568,12 @@ class SMOLPI0Policy(PreTrainedPolicy): present_img_keys = [key for key in self.config.image_features if key in batch] missing_img_keys = [key for key in self.config.image_features if key not in batch] - present_img_keys = sorted(present_img_keys, key=lambda k: IMAGES_ORDER.get(k, float("inf")), reverse=self.config.reverse_images_order) - if self.config.shuffle_camera_positions and ACTION in batch: # only during training + present_img_keys = sorted( + present_img_keys, + key=lambda k: IMAGES_ORDER.get(k, float("inf")), + reverse=self.config.reverse_images_order, + ) + if self.config.shuffle_camera_positions and ACTION in batch: # only during training present_img_keys = random.sample(present_img_keys, len(present_img_keys)) if len(present_img_keys) == 0: raise ValueError( @@ -609,7 +617,10 @@ class SMOLPI0Policy(PreTrainedPolicy): tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] if self.config.add_prompt_template: - tasks = [f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" for task in tasks] + tasks = [ + f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" + for task in tasks + ] else: tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] tokenized_prompt = self.language_tokenizer.__call__( @@ -618,7 +629,7 @@ class SMOLPI0Policy(PreTrainedPolicy): padding_side="right", max_length=self.config.tokenizer_max_length, return_tensors="pt", - truncation=True, # FIXME(mshukor) + truncation=True, # FIXME(mshukor) ) lang_tokens = tokenized_prompt["input_ids"].to(device=device) @@ -655,7 +666,11 @@ class SMOLPI0Policy(PreTrainedPolicy): def prepare_state(self, batch): """Pad state""" - state = batch[OBS_STATE][:, -1, :] if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) else batch[OBS_STATE] # FIXME(mshukor): no state history for now + state = ( + batch[OBS_STATE][:, -1, :] + if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) + else batch[OBS_STATE] + ) # FIXME(mshukor): no state history for now state = pad_vector(state, self.config.max_state_dim) return state @@ -666,7 +681,9 @@ class SMOLPI0Policy(PreTrainedPolicy): if self.config.relative_actions_mode == "first": actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1) elif self.config.relative_actions_mode == "state": - assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], "Relative action mode 'state' requires the action and state to have the same dimension." + assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], ( + "Relative action mode 'state' requires the action and state to have the same dimension." + ) if state.ndim == 2: state = state.unsqueeze(1) actions = actions - state @@ -674,6 +691,7 @@ class SMOLPI0Policy(PreTrainedPolicy): actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1) return actions + def pad_tensor(tensor, max_len, pad_value=0): """ Efficiently pads a tensor along sequence dimension to match max_len. @@ -687,13 +705,16 @@ def pad_tensor(tensor, max_len, pad_value=0): torch.Tensor: Shape (B, max_len, ...) or (B, max_len). """ B, L = tensor.shape[:2] - + # Create a padded tensor of max_len and copy the existing values - padded_tensor = torch.full((B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device) + padded_tensor = torch.full( + (B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device + ) padded_tensor[:, :L] = tensor # Efficient in-place copy return padded_tensor + class VLAFlowMatching(nn.Module): """ Ο€0: A Vision-Language-Action Flow Model for General Robot Control @@ -725,7 +746,8 @@ class VLAFlowMatching(nn.Module): super().__init__() self.config = config - self.vlm_with_expert = SmolVLMWithExpertModel(model_id=self.config.vlm_model_name, + self.vlm_with_expert = SmolVLMWithExpertModel( + model_id=self.config.vlm_model_name, freeze_vision_encoder=self.config.freeze_vision_encoder, train_expert_only=self.config.train_expert_only, attention_implementation=self.config.attention_implementation, @@ -736,50 +758,64 @@ class VLAFlowMatching(nn.Module): self_attn_every_n_layers=self.config.self_attn_every_n_layers, expert_width_multiplier=self.config.expert_width_multiplier, self_attn_only_actions=self.config.self_attn_only_actions, - ) + ) # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert) self.vlm_with_expert.configure_peft(config=self.config) # Projections are float32 self.state_to_prefix = self.config.state_to_prefix if self.state_to_prefix: - self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size) + self.state_proj = nn.Linear( + self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size + ) else: self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size) self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) - self.action_time_mlp_in = nn.Linear(self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size) - self.action_time_mlp_out = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size) + self.action_time_mlp_in = nn.Linear( + self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size + ) + self.action_time_mlp_out = nn.Linear( + self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size + ) self.set_requires_grad() # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ... - if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]): + if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]): if "SmolVLM-Instruct" in self.config.vlm_model_name: - self.fake_image_token = 49152 - self.global_image_token = [44, 13906, 29, 6266, 46] - self.global_image_start_token = torch.tensor([self.fake_image_token] + self.global_image_token, dtype=torch.long) + self.fake_image_token = 49152 + self.global_image_token = [44, 13906, 29, 6266, 46] + self.global_image_start_token = torch.tensor( + [self.fake_image_token] + self.global_image_token, dtype=torch.long + ) else: - self.fake_image_token = 49189 - self.global_image_token = 49152 - self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + self.fake_image_token = 49189 + self.global_image_token = 49152 + self.global_image_start_token = torch.tensor( + [self.fake_image_token, self.global_image_token], dtype=torch.long + ) else: self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id - self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long) + self.global_image_start_token = torch.tensor( + [self.fake_image_token, self.global_image_token], dtype=torch.long + ) self.add_image_special_tokens = self.config.add_image_special_tokens self.add_local_special_image_tokens = self.config.add_local_special_image_tokens - self.local_image_tokens = [torch.tensor([self.fake_image_token, tok], dtype=torch.long) for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]] # assume 3 x 3 grid - + self.local_image_tokens = [ + torch.tensor([self.fake_image_token, tok], dtype=torch.long) + for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167] + ] # assume 3 x 3 grid + self.local_image_start_token = self.global_image_start_token self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) self.prefix_length = self.config.prefix_length - self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") + self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split( + "," + ) self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 self.causal_attention_on_history = self.config.causal_attention_on_history - - - # def configure_peft(self, model): # # return model @@ -845,14 +881,30 @@ class VLAFlowMatching(nn.Module): img, img_mask, ) in enumerate(zip(images, img_masks, strict=False)): - # FIXME(mshukor): add special tokens for the history each history_steps or not + # FIXME(mshukor): add special tokens for the history each history_steps or not if self.add_image_special_tokens: if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1: local_token_idx = img_idx % num_images - image_start_token = self.vlm_with_expert.embed_language_tokens(self.local_image_tokens[local_token_idx].to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) + image_start_token = ( + self.vlm_with_expert.embed_language_tokens( + self.local_image_tokens[local_token_idx].to( + device=self.vlm_with_expert.vlm.device + ) + ) + .unsqueeze(0) + .expand(img.shape[0], -1, -1) + ) else: - image_start_token = self.vlm_with_expert.embed_language_tokens(self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) - image_start_mask = torch.ones_like(image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device) + image_start_token = ( + self.vlm_with_expert.embed_language_tokens( + self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device) + ) + .unsqueeze(0) + .expand(img.shape[0], -1, -1) + ) + image_start_mask = torch.ones_like( + image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device + ) if self.causal_attention_on_history and img_idx % num_images == 0: att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1) else: @@ -861,7 +913,7 @@ class VLAFlowMatching(nn.Module): pad_masks.append(image_start_mask) img_emb = self.vlm_with_expert.embed_image(img) - img_emb = img_emb #.to(dtype=self.vlm_with_expert.type) + img_emb = img_emb # .to(dtype=self.vlm_with_expert.type) # Normalize image embeddings img_emb_dim = img_emb.shape[-1] @@ -880,16 +932,26 @@ class VLAFlowMatching(nn.Module): att_masks += [0] * (num_img_embs) if self.add_image_special_tokens: - if not self.add_local_special_image_tokens or (self.add_local_special_image_tokens and img_idx % num_images == num_images - 1): - image_end_token = self.vlm_with_expert.embed_language_tokens(self.image_end_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1) - image_end_mask = torch.ones_like(image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device) + if not self.add_local_special_image_tokens or ( + self.add_local_special_image_tokens and img_idx % num_images == num_images - 1 + ): + image_end_token = ( + self.vlm_with_expert.embed_language_tokens( + self.image_end_token.to(device=self.vlm_with_expert.vlm.device) + ) + .unsqueeze(0) + .expand(img.shape[0], -1, -1) + ) + image_end_mask = torch.ones_like( + image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device + ) embs.append(image_end_token) pad_masks.append(image_end_mask) att_masks += [0] * (image_end_mask.shape[1]) lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) # Normalize language embeddings lang_emb_dim = lang_emb.shape[-1] - lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm? + lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm? embs.append(lang_emb) pad_masks.append(lang_masks) @@ -900,7 +962,9 @@ class VLAFlowMatching(nn.Module): if state is not None and self.state_to_prefix: state_emb = self.state_proj(state) - state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + state_emb = ( + state_emb[:, None, :] if state_emb.ndim == 2 else state_emb + ) # .to(dtype=self.vlm_with_expert.type) embs.append(state_emb) bsize = state_emb.shape[0] dtype = state_emb.dtype @@ -912,7 +976,7 @@ class VLAFlowMatching(nn.Module): # Set attention masks so that image and language inputs do not attend to state or actions # att_masks += [1] + [0]*(states_seq_len - 1) - att_masks += [1]*(states_seq_len) + att_masks += [1] * (states_seq_len) embs = torch.cat(embs, dim=1) pad_masks = torch.cat(pad_masks, dim=1) att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) @@ -937,7 +1001,9 @@ class VLAFlowMatching(nn.Module): # Embed state if not self.state_to_prefix: state_emb = self.state_proj(state) - state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type) + state_emb = ( + state_emb[:, None, :] if state_emb.ndim == 2 else state_emb + ) # .to(dtype=self.vlm_with_expert.type) embs.append(state_emb) bsize = state_emb.shape[0] dtype = state_emb.dtype @@ -948,8 +1014,7 @@ class VLAFlowMatching(nn.Module): pad_masks.append(state_mask) # Set attention masks so that image and language inputs do not attend to state or actions - att_masks += [1] + [0]*(states_seq_len - 1) - + att_masks += [1] + [0] * (states_seq_len - 1) # Fuse timestep + action information using an MLP action_emb = self.action_in_proj(noisy_actions) @@ -1010,7 +1075,7 @@ class VLAFlowMatching(nn.Module): images, img_masks, lang_tokens, lang_masks, state=state ) suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time) - + pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) @@ -1061,12 +1126,12 @@ class VLAFlowMatching(nn.Module): x_t = torch.zeros_like(noise, dtype=torch.float32, device=device) expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device) x_t = self.denoise_step( - state, - prefix_pad_masks, - past_key_values, - x_t, - expanded_time, - ) + state, + prefix_pad_masks, + past_key_values, + x_t, + expanded_time, + ) else: dt = -1.0 / self.config.num_steps dt = torch.tensor(dt, dtype=torch.float32, device=device) diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py index b910679f1..0ccdcccc8 100644 --- a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py @@ -12,31 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional, Union -from functools import partial import copy +from functools import partial +from typing import List, Optional, Union import torch -import torch.version import torch.nn.functional as F # noqa: N812 +import torch.version from peft import LoraConfig, TaskType, get_peft_model from pytest import Cache from torch import nn from transformers import ( AutoConfig, - GemmaForCausalLM, - AutoModelForImageTextToText, - AutoProcessor, - PretrainedConfig, - PreTrainedModel, - SmolVLMForConditionalGeneration, AutoModel, + AutoModelForImageTextToText, AutoModelForVision2Seq, + AutoProcessor, + SmolVLMForConditionalGeneration, ) -from transformers.models.auto import CONFIG_MAPPING -from transformers import SmolVLMModel, SmolVLMConfig + from lerobot.policies.smolpi0.flex_attention import flex_attention_forward + def _round_up_to_multiple(x, multiple): return (x + multiple - 1) // multiple * multiple @@ -180,20 +177,31 @@ def apply_rope(x, positions, max_wavelength=10_000): # f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'." # ) + def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256): hidden_dim = int(2 * hidden_dim / 3) hidden_dim = int(ffn_dim_multiplier * hidden_dim) hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) return hidden_dim - + class SmolVLMWithExpertModel(nn.Module): # config_class = PaliGemmaWithExpertConfig - def __init__(self, model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", - load_vlm_weights: bool = True, train_expert_only: bool = True, freeze_vision_encoder: bool = False, - attention_implementation: str = "eager", attention_mode: str = "self_attn", num_expert_layers: int = -1, - num_vlm_layers: int = -1, self_attn_every_n_layers: int = -1, expert_width_multiplier: float = 0.5, self_attn_only_actions: bool = False): + def __init__( + self, + model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", + load_vlm_weights: bool = True, + train_expert_only: bool = True, + freeze_vision_encoder: bool = False, + attention_implementation: str = "eager", + attention_mode: str = "self_attn", + num_expert_layers: int = -1, + num_vlm_layers: int = -1, + self_attn_every_n_layers: int = -1, + expert_width_multiplier: float = 0.5, + self_attn_only_actions: bool = False, + ): super().__init__() if load_vlm_weights: print(f"Loading {model_id} weights ...") @@ -227,15 +235,17 @@ class SmolVLMWithExpertModel(nn.Module): # Smaller lm expert lm_expert_config = copy.deepcopy(config.text_config) hidden_size = lm_expert_config.hidden_size - lm_expert_config.hidden_size = int(hidden_size*expert_width_multiplier) #hidden_size // 2 - lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size*expert_width_multiplier)) + lm_expert_config.hidden_size = int(hidden_size * expert_width_multiplier) # hidden_size // 2 + lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size * expert_width_multiplier)) lm_expert_config.num_hidden_layers = self.num_vlm_layers - if num_expert_layers > 0 : - assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}" + if num_expert_layers > 0: + assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, ( + f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}" + ) lm_expert_config.num_hidden_layers = num_expert_layers # lm_expert_config.head_dim = lm_expert_config.head_dim * 2 self.lm_expert = AutoModel.from_config(lm_expert_config) - + self.num_expert_layers = len(self.lm_expert.layers) self.self_attn_every_n_layers = self_attn_every_n_layers self.self_attn_only_actions = self_attn_only_actions @@ -245,10 +255,14 @@ class SmolVLMWithExpertModel(nn.Module): if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0: continue self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear( - config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + config.text_config.num_key_value_heads * config.text_config.head_dim, + lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, + bias=lm_expert_config.attention_bias, ) self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear( - config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias + config.text_config.num_key_value_heads * config.text_config.head_dim, + lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, + bias=lm_expert_config.attention_bias, ) # Remove unused embed_tokens self.lm_expert.embed_tokens = None @@ -308,8 +322,10 @@ class SmolVLMWithExpertModel(nn.Module): else: self.vlm = self.vlm.merge_and_unload() - def get_vlm_model(self,): - if hasattr(self.vlm.model, "model"): # When using peft + def get_vlm_model( + self, + ): + if hasattr(self.vlm.model, "model"): # When using peft return self.vlm.model.model else: return self.vlm.model @@ -326,22 +342,20 @@ class SmolVLMWithExpertModel(nn.Module): else: # To avoid unused params issue with distributed training last_layers = [self.num_vlm_layers - 1] - if self.num_vlm_layers != self.num_expert_layers and self.num_vlm_layers % self.num_expert_layers == 0: + if ( + self.num_vlm_layers != self.num_expert_layers + and self.num_vlm_layers % self.num_expert_layers == 0 + ): last_layers.append(self.num_vlm_layers - 2) frozen_layers = [ - "lm_head", - "text_model.model.norm.weight", - ] + "lm_head", + "text_model.model.norm.weight", + ] for layer in last_layers: frozen_layers.append(f"text_model.model.layers.{layer}.") for name, params in self.vlm.named_parameters(): - if any( - [ - k in name - for k in frozen_layers - ] - ): + if any([k in name for k in frozen_layers]): params.requires_grad = False # To avoid unused params issue with distributed training for name, params in self.lm_expert.named_parameters(): @@ -410,19 +424,34 @@ class SmolVLMWithExpertModel(nn.Module): # FIXME(mshukor): add special image tokens specific to smolvlm # Get sequence from the vision encoder - image_hidden_states = self.get_vlm_model().vision_model( - pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype), - patch_attention_mask=patch_attention_mask, - ).last_hidden_state + image_hidden_states = ( + self.get_vlm_model() + .vision_model( + pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype), + patch_attention_mask=patch_attention_mask, + ) + .last_hidden_state + ) # Modality projection & resampling image_hidden_states = self.get_vlm_model().connector(image_hidden_states) return image_hidden_states - + def embed_language_tokens(self, tokens: torch.Tensor): return self.get_vlm_model().text_model.get_input_embeddings()(tokens) - def forward_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values=None) -> list[torch.Tensor]: - + def forward_attn_layer( + self, + model_layers, + inputs_embeds, + layer_idx, + position_ids, + attention_mask, + batch_size, + head_dim, + use_cache: bool = True, + fill_kv_cache: bool = True, + past_key_values=None, + ) -> list[torch.Tensor]: query_states = [] key_states = [] value_states = [] @@ -430,7 +459,7 @@ class SmolVLMWithExpertModel(nn.Module): layer = model_layers[i][layer_idx] if hidden_states is None or layer is None: continue - + # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) # hidden_states = hidden_states * normalizer hidden_states = layer.input_layernorm(hidden_states) @@ -468,12 +497,16 @@ class SmolVLMWithExpertModel(nn.Module): if inputs_embeds[1] is not None: suffix_len = inputs_embeds[1].shape[1] attention_mask_[:, -suffix_len:, :-suffix_len] = False - position_ids_[:, -suffix_len:] = _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None] + position_ids_[:, -suffix_len:] = ( + _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None] + ) else: attention_mask_ = _attention_mask position_ids_ = _position_ids - query_states = apply_rope(query_states, position_ids_) # FIXME(mshukor): this assumes we have always the vlm features? + query_states = apply_rope( + query_states, position_ids_ + ) # FIXME(mshukor): this assumes we have always the vlm features? key_states = apply_rope(key_states, position_ids_) if use_cache and past_key_values is None: @@ -491,9 +524,7 @@ class SmolVLMWithExpertModel(nn.Module): # the max len, then we (for instance) double the cache size. This implementation already exists # in `transformers`. (molbap) key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) - value_states = torch.cat( - [past_key_values[layer_idx]["value_states"], value_states], dim=1 - ) + value_states = torch.cat([past_key_values[layer_idx]["value_states"], value_states], dim=1) attention_interface = self.get_attention_interface() @@ -504,20 +535,32 @@ class SmolVLMWithExpertModel(nn.Module): return [att_output], past_key_values - - def forward_cross_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values = None) -> list[torch.Tensor]: - + def forward_cross_attn_layer( + self, + model_layers, + inputs_embeds, + layer_idx, + position_ids, + attention_mask, + batch_size, + head_dim, + use_cache: bool = True, + fill_kv_cache: bool = True, + past_key_values=None, + ) -> list[torch.Tensor]: attention_interface = self.get_attention_interface() att_outputs = [] - assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}" + assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), ( + f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}" + ) if len(inputs_embeds) == 2 and not past_key_values: # Prefix attention seq_len = inputs_embeds[0].shape[1] - position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:] + position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:] prefix_attention_mask = attention_mask[:, :seq_len, :seq_len] - + layer = model_layers[0][layer_idx] hidden_states = layer.input_layernorm(inputs_embeds[0]) @@ -558,7 +601,6 @@ class SmolVLMWithExpertModel(nn.Module): key_states = past_key_values[layer_idx]["key_states"] value_states = past_key_values[layer_idx]["value_states"] - # Expert expert_layer = model_layers[1][layer_idx] if expert_layer is not None: @@ -570,21 +612,37 @@ class SmolVLMWithExpertModel(nn.Module): expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype) expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape) + _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view( + *key_states.shape[:2], -1 + ) + expert_key_states = expert_layer.self_attn.k_proj(_key_states).view( + *_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim + ) # k_proj should have same dim as kv - _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(*key_states.shape[:2], -1) - expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(*_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim) # k_proj should have same dim as kv + _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view( + *value_states.shape[:2], -1 + ) + expert_value_states = expert_layer.self_attn.v_proj(_value_states).view( + *_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim + ) - _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(*value_states.shape[:2], -1) - expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(*_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim) - - expert_position_id = expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values # start from 0 - expert_attention_mask = attention_mask[:, -inputs_embeds[1].shape[1]:, :expert_key_states.shape[1]:] # take into account kv + expert_position_id = ( + expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values + ) # start from 0 + expert_attention_mask = attention_mask[ + :, -inputs_embeds[1].shape[1] :, : expert_key_states.shape[1] : + ] # take into account kv expert_query_states = apply_rope(expert_query_state, expert_position_id) # expert_key_states = apply_rope(expert_key_state, expert_position_id) - + att_output = attention_interface( - expert_attention_mask, batch_size, head_dim, expert_query_states, expert_key_states, expert_value_states + expert_attention_mask, + batch_size, + head_dim, + expert_query_states, + expert_key_states, + expert_value_states, ) att_outputs.append(att_output) else: @@ -592,8 +650,8 @@ class SmolVLMWithExpertModel(nn.Module): # att_output = att_output.to(dtype=models[i].dtype) return att_outputs, past_key_values - - def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient? + + def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient? vlm_layers = [] expert_layers = [] multiple_of = self.num_vlm_layers // self.num_expert_layers @@ -606,15 +664,16 @@ class SmolVLMWithExpertModel(nn.Module): vlm_layers.append(models[0].layers[i]) expert_layers.append(expert_layer) return [vlm_layers, expert_layers] + # TODO: break down this huge forward into modules or functions def forward( self, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, - inputs_embeds: List[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - fill_kv_cache: Optional[bool] = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | Cache | None = None, + inputs_embeds: list[torch.FloatTensor] = None, + use_cache: bool | None = None, + fill_kv_cache: bool | None = None, ): models = [self.get_vlm_model().text_model, self.lm_expert] model_layers = self.get_model_layers(models) @@ -626,26 +685,58 @@ class SmolVLMWithExpertModel(nn.Module): continue batch_size = hidden_states.shape[0] - # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train + # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train if self.attention_implementation == "flex": - if inputs_embeds[0] is not None and inputs_embeds[1] is not None and attention_mask.shape[-1] == attention_mask.shape[-2] and past_key_values is None: # Now only during training + if ( + inputs_embeds[0] is not None + and inputs_embeds[1] is not None + and attention_mask.shape[-1] == attention_mask.shape[-2] + and past_key_values is None + ): # Now only during training seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1] - padded_seq_len = _round_up_to_multiple(seq_len, 128) # FIXME(mshukor): more efficient to have a fixed seq len? + padded_seq_len = _round_up_to_multiple( + seq_len, 128 + ) # FIXME(mshukor): more efficient to have a fixed seq len? b_mask, q_len, kv_len = attention_mask.shape # The shape of your mask pad = padded_seq_len - q_len attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True) inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0) position_ids = F.pad(position_ids, (0, pad), value=0) - # RMSNorm num_layers = self.num_vlm_layers head_dim = self.vlm.config.text_config.head_dim for layer_idx in range(num_layers): - if fill_kv_cache or "cross" not in self.attention_mode or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0): - att_outputs, past_key_values = self.forward_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + if ( + fill_kv_cache + or "cross" not in self.attention_mode + or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0) + ): + att_outputs, past_key_values = self.forward_attn_layer( + model_layers, + inputs_embeds, + layer_idx, + position_ids, + attention_mask, + batch_size, + head_dim, + use_cache=use_cache, + fill_kv_cache=fill_kv_cache, + past_key_values=past_key_values, + ) else: - att_outputs, past_key_values = self.forward_cross_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values) + att_outputs, past_key_values = self.forward_cross_attn_layer( + model_layers, + inputs_embeds, + layer_idx, + position_ids, + attention_mask, + batch_size, + head_dim, + use_cache=use_cache, + fill_kv_cache=fill_kv_cache, + past_key_values=past_key_values, + ) # query_states = [] # key_states = [] # value_states = [] @@ -703,7 +794,6 @@ class SmolVLMWithExpertModel(nn.Module): # attention_mask, batch_size, head_dim, query_states, key_states, value_states # ) - # att_output = att_output.to(dtype=models[i].dtype) # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len]) @@ -712,7 +802,9 @@ class SmolVLMWithExpertModel(nn.Module): for i, hidden_states in enumerate(inputs_embeds): # layer = models[i].layers[layer_idx] layer = model_layers[i][layer_idx] - att_output = att_outputs[i] if i < len(att_outputs) else att_outputs[0] # in case of self_attn + att_output = ( + att_outputs[i] if i < len(att_outputs) else att_outputs[0] + ) # in case of self_attn if hidden_states is not None: if layer is None: outputs_embeds.append(hidden_states) @@ -759,7 +851,11 @@ class SmolVLMWithExpertModel(nn.Module): if self.attention_implementation == "fa2": attention_interface = self.flash_attention_forward elif self.attention_implementation == "flex": - attention_interface = partial(flex_attention_forward, num_att_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads) + attention_interface = partial( + flex_attention_forward, + num_att_heads=self.num_attention_heads, + num_key_value_heads=self.num_key_value_heads, + ) else: attention_interface = self.eager_attention_forward return attention_interface @@ -807,7 +903,7 @@ class SmolVLMWithExpertModel(nn.Module): att_weights *= head_dim**-0.5 att_weights = att_weights.to(dtype=torch.float32) - big_neg = torch.finfo(att_weights.dtype).min #-2.3819763e38 # See gemma/modules.py + big_neg = torch.finfo(att_weights.dtype).min # -2.3819763e38 # See gemma/modules.py masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg) probs = nn.functional.softmax(masked_att_weights, dim=-1) probs = probs.to(dtype=value_states.dtype) diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 1c07d98e5..9bb22d7f7 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -1027,6 +1027,7 @@ from lerobot.policies.utils import ( populate_queues, ) from lerobot.utils.utils import get_safe_dtype + # OBS_STATE = 'state' # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") @@ -1891,4 +1892,4 @@ class VLAFlowMatching(nn.Module): suffix_out = suffix_out[:, -self.config.chunk_size :] suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) - return v_t \ No newline at end of file + return v_t diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt index 3410062ba..f2ad6c76f 100644 --- a/src/lerobot/policies/smolvla/saver.txt +++ b/src/lerobot/policies/smolvla/saver.txt @@ -1 +1 @@ -c \ No newline at end of file +c diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py index e4cd7acac..3b78c99e6 100644 --- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py @@ -547,4 +547,4 @@ class SmolVLMWithExpertModel(nn.Module): # we use -1 because sequence length can change att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) - return att_output \ No newline at end of file + return att_output diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 795ed2b3c..b96fbb8a3 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -45,17 +45,21 @@ Note that in both examples, the repo/folder should contain at least `config.json You can learn about the CLI options for this script in the `EvalPipelineConfig` in lerobot/configs/eval.py """ -import concurrent + +import concurrent.futures as cf import json import logging import threading import time +from collections import defaultdict from collections.abc import Callable from contextlib import nullcontext from copy import deepcopy from dataclasses import asdict from pathlib import Path from pprint import pformat +from typing import Dict, List, Tuple, TypedDict +from collections.abc import Iterator import einops import gymnasium as gym @@ -68,7 +72,11 @@ from tqdm import trange from lerobot.configs import parser from lerobot.configs.eval import EvalPipelineConfig from lerobot.envs.factory import make_env -from lerobot.envs.utils import add_envs_task, check_env_attributes_and_types, preprocess_observation, preprocess_observation1 +from lerobot.envs.utils import ( + add_envs_task, + check_env_attributes_and_types, + preprocess_observation, +) from lerobot.policies.factory import make_policy from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.policies.utils import get_device_from_parameters @@ -79,9 +87,6 @@ from lerobot.utils.utils import ( init_logging, inside_slurm, ) -from typing import TypedDict, Dict, List, Tuple, Iterator -from collections import defaultdict -import concurrent.futures as cf def rollout( @@ -485,8 +490,12 @@ def _compile_episode_data( data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1) return data_dict -from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy + + from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata +from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy + + def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): """Recreate normalization layers with proper stats from the dataset.""" from lerobot.policies.normalize import Normalize, Unnormalize @@ -518,7 +527,8 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData def load_smolvla(cfg, dataset_repo: str, policy): from lerobot.datasets.lerobot_dataset import LeRobotDataset - dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/') + + dataset = LeRobotDataset(dataset_repo, root="/raid/jade/.cache/huggingface/datasets/") _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta) # only needed if stats are missing return policy.to("cuda"), dataset @@ -529,8 +539,8 @@ def eval_main(cfg: EvalPipelineConfig): # Check device is available device = get_safe_torch_device(cfg.policy.device, log=True) - #login to hf - from huggingface_hub import login + # login to hf + # login() torch.backends.cudnn.benchmark = True torch.backends.cuda.matmul.allow_tf32 = True @@ -549,7 +559,7 @@ def eval_main(cfg: EvalPipelineConfig): breakpoint() # policy, _ = load_smolvla(cfg.policy, "physical-intelligence/libero", policy) # rename "image" -> "observation.image" - + policy.eval() with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): info = eval_policy_all( @@ -584,10 +594,11 @@ def eval_main(cfg: EvalPipelineConfig): # ---- typed payload returned by one task eval ---- class TaskMetrics(TypedDict): - sum_rewards: List[float] - max_rewards: List[float] - successes: List[bool] - video_paths: List[str] + sum_rewards: list[float] + max_rewards: list[float] + successes: list[bool] + video_paths: list[str] + ACC_KEYS = ("sum_rewards", "max_rewards", "successes", "video_paths") @@ -610,7 +621,7 @@ def eval_policy_all( """ global_start = time.time() - # inner: evaluate a single (suite, task) + # inner: evaluate a single (suite, task) def eval_one( task_group: str, task_id: int, @@ -650,27 +661,36 @@ def eval_policy_all( video_paths=task_result.get("video_paths", []), ) - # result producer: sequential or threaded, same consumer - def iter_task_results() -> Iterator[Tuple[str, int, TaskMetrics]]: + # result producer: sequential or threaded, same consumer + def iter_task_results() -> Iterator[tuple[str, int, TaskMetrics]]: if max_parallel_tasks == 1: for task_group, tasks in envs.items(): for task_id, vec in tasks.items(): - yield task_group, task_id, eval_one( - task_group, task_id, vec, - policy=policy, - n_episodes=n_episodes, - max_episodes_rendered=max_episodes_rendered, - videos_dir=videos_dir, - return_episode_data=return_episode_data, - start_seed=start_seed, + yield ( + task_group, + task_id, + eval_one( + task_group, + task_id, + vec, + policy=policy, + n_episodes=n_episodes, + max_episodes_rendered=max_episodes_rendered, + videos_dir=videos_dir, + return_episode_data=return_episode_data, + start_seed=start_seed, + ), ) else: with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor: - fut2key: Dict[cf.Future, Tuple[str, int]] = {} + fut2key: dict[cf.Future, tuple[str, int]] = {} for task_group, tasks in envs.items(): for task_id, vec in tasks.items(): fut = executor.submit( - eval_one, task_group, task_id, vec, + eval_one, + task_group, + task_id, + vec, policy=policy, n_episodes=n_episodes, max_episodes_rendered=max_episodes_rendered, @@ -683,9 +703,9 @@ def eval_policy_all( task_group, task_id = fut2key[fut] yield task_group, task_id, fut.result() - # single accumulator path on the main thread - group_acc: Dict[str, Dict[str, List]] = defaultdict(lambda: {k: [] for k in ACC_KEYS}) - overall: Dict[str, List] = {k: [] for k in ACC_KEYS} + # single accumulator path on the main thread + group_acc: dict[str, dict[str, list]] = defaultdict(lambda: {k: [] for k in ACC_KEYS}) + overall: dict[str, list] = {k: [] for k in ACC_KEYS} for task_group, task_id, metrics in iter_task_results(): acc = group_acc[task_group] @@ -694,7 +714,7 @@ def eval_policy_all( overall[k].extend(metrics[k]) # build outputs - results: Dict[str, dict] = {} + results: dict[str, dict] = {} for task_group, data in group_acc.items(): suite_rewards = data["sum_rewards"] suite_max = data["max_rewards"] @@ -720,9 +740,15 @@ def eval_policy_all( global_eval_ep_s = global_eval_s / max(1, len(overall["sum_rewards"])) results["overall"] = { "aggregated": { - "avg_sum_reward": float(np.nanmean(overall["sum_rewards"])) if overall["sum_rewards"] else float("nan"), - "avg_max_reward": float(np.nanmean(overall["max_rewards"])) if overall["max_rewards"] else float("nan"), - "pc_success": float(np.nanmean(overall["successes"]) * 100) if overall["successes"] else float("nan"), + "avg_sum_reward": float(np.nanmean(overall["sum_rewards"])) + if overall["sum_rewards"] + else float("nan"), + "avg_max_reward": float(np.nanmean(overall["max_rewards"])) + if overall["max_rewards"] + else float("nan"), + "pc_success": float(np.nanmean(overall["successes"]) * 100) + if overall["successes"] + else float("nan"), "eval_s": global_eval_s, "eval_ep_s": global_eval_ep_s, }, @@ -732,7 +758,6 @@ def eval_policy_all( return results - if __name__ == "__main__": init_logging() eval_main() diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index 3feeb0512..fc8be4ebc 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -105,6 +105,7 @@ def update_policy( train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict + # def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): # """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" # from lerobot.policies.normalize import Normalize, Unnormalize @@ -132,6 +133,7 @@ def update_policy( # print("βœ… Normalization layers injected with dataset stats.") + @parser.wrap() def train(cfg: TrainPipelineConfig): cfg.validate() @@ -271,9 +273,12 @@ def train(cfg: TrainPipelineConfig): if cfg.env and is_eval_step: step_id = get_step_identifier(step, cfg.steps) logging.info(f"Eval policy at step {step}") - with torch.no_grad(), (torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext()): + with ( + torch.no_grad(), + torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(), + ): eval_info = eval_policy_all( - eval_env, # dict[suite][task_id] -> vec_env + eval_env, # dict[suite][task_id] -> vec_env policy, cfg.eval.n_episodes, videos_dir=videos_dir, @@ -295,15 +300,15 @@ def train(cfg: TrainPipelineConfig): # meters/tracker eval_metrics = { "avg_sum_reward": AverageMeter("βˆ‘rwrd", ":.3f"), - "pc_success": AverageMeter("success", ":.1f"), - "eval_s": AverageMeter("eval_s", ":.3f"), + "pc_success": AverageMeter("success", ":.1f"), + "eval_s": AverageMeter("eval_s", ":.3f"), } eval_tracker = MetricsTracker( cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step ) - eval_tracker.eval_s = aggregated.get("eval_s", 0.0) + eval_tracker.eval_s = aggregated.get("eval_s", 0.0) eval_tracker.avg_sum_reward = aggregated.get("avg_sum_reward", float("nan")) - eval_tracker.pc_success = aggregated.get("pc_success", float("nan")) + eval_tracker.pc_success = aggregated.get("pc_success", float("nan")) if wandb_logger: wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} wandb_logger.log_dict(wandb_log_dict, step, mode="eval") diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py index 26a9e7aea..5b82ef044 100644 --- a/src/lerobot/scripts/train_2.py +++ b/src/lerobot/scripts/train_2.py @@ -104,6 +104,7 @@ def update_policy( train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict + def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" from lerobot.policies.normalize import Normalize, Unnormalize @@ -115,15 +116,15 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData stats = {} for key, stat_dict in dataset_meta.stats.items(): stats[key] = { - stat_type: torch.as_tensor(stat_array) - if isinstance(stat_array, np.ndarray) - else stat_array + stat_type: torch.as_tensor(stat_array) if isinstance(stat_array, np.ndarray) else stat_array for stat_type, stat_array in stat_dict.items() } normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) - unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats) + unnormalize_outputs = Unnormalize( + policy.config.output_features, policy.config.normalization_mapping, stats + ) policy.normalize_inputs = normalize_inputs policy.normalize_targets = normalize_targets @@ -131,6 +132,7 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData print("βœ… Normalization layers injected with dataset stats.") + @parser.wrap() def train(cfg: TrainPipelineConfig): cfg.validate() diff --git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py index e205f138f..1e8a59a64 100644 --- a/src/lerobot/scripts/train_accelerate.py +++ b/src/lerobot/scripts/train_accelerate.py @@ -24,12 +24,15 @@ from accelerate.utils import set_seed as accelerate_set_seed from termcolor import colored from torch.optim import Optimizer +from lerobot.configs import parser +from lerobot.configs.train import TrainPipelineConfig from lerobot.datasets.factory import make_dataset from lerobot.datasets.sampler import EpisodeAwareSampler from lerobot.envs.factory import make_env from lerobot.optim.factory import make_optimizer_and_scheduler from lerobot.policies.factory import make_policy from lerobot.policies.pretrained import PreTrainedPolicy +from lerobot.scripts.eval import eval_policy from lerobot.utils.logging_utils import AverageMeter, MetricsTracker from lerobot.utils.train_utils import ( get_step_checkpoint_dir, @@ -43,9 +46,6 @@ from lerobot.utils.utils import ( has_method, init_logging, ) -from lerobot.configs import parser -from lerobot.configs.train import TrainPipelineConfig -from lerobot.scripts.eval import eval_policy def update_policy( @@ -100,6 +100,7 @@ def train(cfg: TrainPipelineConfig): # Initialize accelerator from accelerate.utils import DistributedDataParallelKwargs + # added by jade 2 lines ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False) accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs]) @@ -357,7 +358,7 @@ def train(cfg: TrainPipelineConfig): if accelerator.is_main_process: logging.info("End of training") - accelerator.end_training() # added by jade + accelerator.end_training() # added by jade if __name__ == "__main__": From 4c2add41d764ae3e0e05a6702f75a84bae9a2d86 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Thu, 11 Sep 2025 14:18:09 +0200 Subject: [PATCH 07/12] remove files --- src/lerobot/configs/policies.py | 1 - src/lerobot/datasets/lerobot_dataset.py | 1 - src/lerobot/datasets/utils.py | 3 +- src/lerobot/policies/factory.py | 7 - src/lerobot/policies/normalize.py | 163 --- .../policies/smolpi0/configuration_smolpi0.py | 210 --- .../policies/smolpi0/flex_attention.py | 145 -- .../policies/smolpi0/modeling_smolpi0.py | 1190 ----------------- .../policies/smolpi0/smolvlm_with_expert.py | 920 ------------- .../policies/smolvla/modeling_smolvla.py | 956 +------------ .../policies/smolvla/modeling_smolvla_v2.py | 0 src/lerobot/policies/smolvla/saver.txt | 1 - src/lerobot/scripts/eval.py | 29 +- src/lerobot/scripts/train_2.py | 345 ----- src/lerobot/scripts/train_accelerate.py | 366 ----- 15 files changed, 3 insertions(+), 4334 deletions(-) delete mode 100644 src/lerobot/policies/smolpi0/configuration_smolpi0.py delete mode 100644 src/lerobot/policies/smolpi0/flex_attention.py delete mode 100644 src/lerobot/policies/smolpi0/modeling_smolpi0.py delete mode 100644 src/lerobot/policies/smolpi0/smolvlm_with_expert.py delete mode 100644 src/lerobot/policies/smolvla/modeling_smolvla_v2.py delete mode 100644 src/lerobot/policies/smolvla/saver.txt delete mode 100644 src/lerobot/scripts/train_2.py delete mode 100644 src/lerobot/scripts/train_accelerate.py diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py index 75863d3fc..f5fa727cf 100644 --- a/src/lerobot/configs/policies.py +++ b/src/lerobot/configs/policies.py @@ -62,7 +62,6 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC): # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP, # automatic gradient scaling is used. use_amp: bool = False - gradient_accumulation_steps: int = 1 push_to_hub: bool = True repo_id: str | None = None diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 6509993bb..875608c59 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -597,7 +597,6 @@ class LeRobotDataset(torch.utils.data.Dataset): """hf_dataset contains all the observations, states, actions, rewards, etc.""" if self.episodes is None: path = str(self.root / "data") - # added by jade hf_dataset = load_dataset("parquet", data_dir=path, split="train") else: files = [str(self.root / self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes] diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py index daa1de163..078c5351d 100644 --- a/src/lerobot/datasets/utils.py +++ b/src/lerobot/datasets/utils.py @@ -455,8 +455,7 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea shape = (shape[2], shape[0], shape[1]) elif key == "observation.environment_state": type = FeatureType.ENV - # changed by jade - elif key.startswith("observation") or key.startswith("state"): + elif key.startswith("observation"): type = FeatureType.STATE elif key.startswith("action"): type = FeatureType.ACTION diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py index cc1b0480d..c3ae9cd54 100644 --- a/src/lerobot/policies/factory.py +++ b/src/lerobot/policies/factory.py @@ -31,7 +31,6 @@ from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig from lerobot.policies.pretrained import PreTrainedPolicy from lerobot.policies.sac.configuration_sac import SACConfig from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig -from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig @@ -75,10 +74,6 @@ def get_policy_class(name: str) -> PreTrainedPolicy: from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy return SmolVLAPolicy - elif name == "smolpi0": - from lerobot.policies.smolpi0.modeling_smolpi0 import SMOLPI0Policy - - return SMOLPI0Policy else: raise NotImplementedError(f"Policy with name {name} is not implemented.") @@ -102,8 +97,6 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig: return SmolVLAConfig(**kwargs) elif policy_type == "reward_classifier": return RewardClassifierConfig(**kwargs) - elif policy_type == "smolpi0": - return SMOLPI0Config(**kwargs) else: raise ValueError(f"Policy type '{policy_type}' is not available.") diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py index 646c330cb..119055873 100644 --- a/src/lerobot/policies/normalize.py +++ b/src/lerobot/policies/normalize.py @@ -255,85 +255,6 @@ class Unnormalize(nn.Module): return batch -class NormalizePerRobotType(nn.Module): - """Normalizes data (e.g. "observation.image") for more stable and faster convergence during training.""" - - def __init__( - self, - features: dict[str, PolicyFeature], - norm_map: dict[str, NormalizationMode], - stats: dict[str, dict[str, Tensor]] | None = None, - ): - """ - Args: - shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values - are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing - mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape - is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. - modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values - are their normalization modes among: - - "mean_std": subtract the mean and divide by standard deviation. - - "min_max": map to [-1, 1] range. - stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") - and values are dictionaries of statistic types and their values (e.g. - `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for - training the model for the first time, these statistics will overwrite the default buffers. If - not provided, as expected for finetuning or evaluation, the default buffers should to be - overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the - dataset is not needed to get the stats, since they are already in the policy state_dict. - """ - super().__init__() - self.features = features - self.norm_map = norm_map - for robot_type in stats.keys(): - stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) - for key, buffer in stats_buffers.items(): - setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) - - # TODO(rcadene): should we remove torch.no_grad? - @torch.no_grad - def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: - batch = dict(batch) # shallow copy avoids mutating the input batch - assert "robot_type" in batch, "robot_type is not in the batch" - robot_types = batch["robot_type"] - - for key, ft in self.features.items(): - if key not in batch: - continue - - norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) - if norm_mode is NormalizationMode.IDENTITY: - continue - # FIXME(mshukor): make it more efficient - buffers = [ - getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types - ] - if norm_mode is NormalizationMode.MEAN_STD: - mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0) - std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0) - if batch[key].ndim == 3: - mean = mean.unsqueeze(1) - std = std.unsqueeze(1) - assert not torch.isinf(mean).any(), _no_stats_error_str("mean") - assert not torch.isinf(std).any(), _no_stats_error_str("std") - batch[key] = (batch[key] - mean) / (std + 1e-8) - elif norm_mode is NormalizationMode.MIN_MAX: - min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) - max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) - assert not torch.isinf(min).any(), _no_stats_error_str("min") - assert not torch.isinf(max).any(), _no_stats_error_str("max") - if batch[key].ndim == 3: - min = min.unsqueeze(1) - max = max.unsqueeze(1) - # normalize to [0,1] - batch[key] = (batch[key] - min) / (max - min + 1e-8) - # normalize to [-1, 1] - batch[key] = batch[key] * 2 - 1 - else: - raise ValueError(norm_mode) - return batch - - # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization # and remove the `Normalize` and `Unnormalize` classes. def _initialize_stats_buffers( @@ -497,87 +418,3 @@ class UnnormalizeBuffer(nn.Module): raise ValueError(norm_mode) return batch - - -class UnnormalizePerRobotType(nn.Module): - """ - Similar to `Normalize` but unnormalizes output data (e.g. `{"action": torch.randn(b,c)}`) in their - original range used by the environment. - """ - - def __init__( - self, - features: dict[str, PolicyFeature], - norm_map: dict[str, NormalizationMode], - stats: dict[str, dict[str, Tensor]] | None = None, - ): - """ - Args: - shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values - are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing - mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape - is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format. - modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values - are their normalization modes among: - - "mean_std": subtract the mean and divide by standard deviation. - - "min_max": map to [-1, 1] range. - stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image") - and values are dictionaries of statistic types and their values (e.g. - `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for - training the model for the first time, these statistics will overwrite the default buffers. If - not provided, as expected for finetuning or evaluation, the default buffers should to be - overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the - dataset is not needed to get the stats, since they are already in the policy state_dict. - """ - super().__init__() - self.features = features - self.norm_map = norm_map - self.stats = stats - # `self.buffer_observation_state["mean"]` contains `torch.tensor(state_dim)` - for robot_type in stats.keys(): - stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type]) - for key, buffer in stats_buffers.items(): - setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer) - - # TODO(rcadene): should we remove torch.no_grad? - @torch.no_grad - def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: - batch = dict(batch) # shallow copy avoids mutating the input batch - assert "robot_type" in batch, "robot_type is not in the batch" - robot_types = batch["robot_type"] - - for key, ft in self.features.items(): - if key not in batch: - continue - - norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY) - if norm_mode is NormalizationMode.IDENTITY: - continue - - # buffer = getattr(self, "buffer_" + key.replace(".", "_")) - buffers = [ - getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types - ] - - if norm_mode is NormalizationMode.MEAN_STD: - mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0) - std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0) - assert not torch.isinf(mean).any(), _no_stats_error_str("mean") - assert not torch.isinf(std).any(), _no_stats_error_str("std") - if batch[key].ndim == 3: - mean = mean.unsqueeze(1) - std = std.unsqueeze(1) - batch[key] = batch[key] * std + mean - elif norm_mode is NormalizationMode.MIN_MAX: - min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0) - max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0) - assert not torch.isinf(min).any(), _no_stats_error_str("min") - assert not torch.isinf(max).any(), _no_stats_error_str("max") - if batch[key].ndim == 3: - min = min.unsqueeze(1) - max = max.unsqueeze(1) - batch[key] = (batch[key] + 1) / 2 - batch[key] = batch[key] * (max - min) + min - else: - raise ValueError(norm_mode) - return batch diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py deleted file mode 100644 index e39d17f15..000000000 --- a/src/lerobot/policies/smolpi0/configuration_smolpi0.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass, field - -from lerobot.configs.policies import PreTrainedConfig -from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature -from lerobot.optim.optimizers import AdamWConfig -from lerobot.optim.schedulers import ( - CosineDecayWithWarmupSchedulerConfig, -) - - -@dataclass -class PEFTConfig: - r: int = 4 - lora_alpha: int = 16 - lora_dropout: float = 0.1 - target_modules: str = "q_proj,v_proj" - - -@PreTrainedConfig.register_subclass("smolpi0") -@dataclass -class SMOLPI0Config(PreTrainedConfig): - # Input / output structure. - n_obs_steps: int = 1 - chunk_size: int = 50 - n_action_steps: int = 50 - n_obs_gap: int = 1 - - normalization_mapping: dict[str, NormalizationMode] = field( - default_factory=lambda: { - "VISUAL": NormalizationMode.IDENTITY, - "STATE": NormalizationMode.MEAN_STD, - "ACTION": NormalizationMode.MEAN_STD, - } - ) - - # Shorter state and action vectors will be padded - max_state_dim: int = 32 - max_action_dim: int = 32 - - # Image preprocessing - resize_imgs_with_padding: tuple[int, int] = (512, 512) # (224, 224) - - # Add empty images. Used by pi0_aloha_sim which adds the empty - # left and right wrist cameras in addition to the top camera. - empty_cameras: int = 0 - - # Converts the joint and gripper values from the standard Aloha space to - # the space used by the pi internal runtime which was used to train the base model. - adapt_to_pi_aloha: bool = False - - # Converts joint dimensions to deltas with respect to the current state before passing to the model. - # Gripper dimensions will remain in absolute values. - use_delta_joint_actions_aloha: bool = False - - # Tokenizer - tokenizer_max_length: int = 48 - - # Projector - proj_width: int = 480 - - # Decoding - num_steps: int = 10 - - # Attention utils - use_cache: bool = True - attention_implementation: str = "eager" # or fa2, flex - - # Finetuning settings - freeze_vision_encoder: bool = True - train_expert_only: bool = False - train_state_proj: bool = True - - # Training presets - optimizer_lr: float = 2.5e-5 - optimizer_betas: tuple[float, float] = (0.9, 0.95) - optimizer_eps: float = 1e-8 - optimizer_weight_decay: float = 1e-10 - optimizer_grad_clip_norm: float = 10 - optimizer_lr_vlm: float = 0 - - scheduler_warmup_steps: int = 1_000 - scheduler_decay_steps: int = 30_000 - scheduler_decay_lr: float = 2.5e-6 - - # TODO: Add EMA - vlm_model_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" - checkpoint_path: str = None - load_vlm_weights: bool = False - - peft_method: str = "" - peft_config: PEFTConfig = PEFTConfig() - peft_target_model: str = "" - - add_image_special_tokens: bool = False - add_prompt_template: bool = False - prefix_prompt_template: str = "<|im_start|>User: What action should the robot take to" - suffix_prompt_template: str = "?\nAssistant:" - - attention_mode: str = "self_attn" - - prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length - - past_obs_keys: str = "image" - - add_local_special_image_tokens: bool = False - - reverse_images_order: bool = False - - state_to_prefix: bool = False - - pad_language_to: str = "longest" # "max_length" - - num_expert_layers: int = -1 - num_vlm_layers: int = -1 - - causal_action_attention_mask: bool = False - - self_attn_every_n_layers: int = -1 - - expert_width_multiplier: float = 0.5 - - robot_type: str = "" - - self_attn_only_actions: bool = False - - causal_attention_on_history: bool = False - - predict_relative_actions: bool = False - relative_actions_mode: str = "first" - - shuffle_camera_positions: bool = False - vlm_img_size: int = -1 - - regression_loss: bool = False - - def __post_init__(self): - super().__post_init__() - if self.vlm_img_size > 0: - self.resize_imgs_with_padding = (self.vlm_img_size, self.vlm_img_size) - """Input validation (not exhaustive).""" - if self.n_action_steps > self.chunk_size: - raise ValueError( - f"The chunk size is the upper bound for the number of action steps per model invocation. Got " - f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`." - ) - # if self.n_obs_steps != 1: - # raise ValueError( - # f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`" - # ) - - if self.use_delta_joint_actions_aloha: - raise NotImplementedError( - "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot." - ) - - def validate_features(self) -> None: - # TODO: implement value error - # if not self.image_features and not self.env_state_feature: - # raise ValueError("You must provide at least one image or the environment state among the inputs.") - - for i in range(self.empty_cameras): - key = f"observation.images.empty_camera_{i}" - empty_camera = PolicyFeature( - type=FeatureType.VISUAL, - shape=(3, 480, 640), - ) - self.input_features[key] = empty_camera - - def get_optimizer_preset(self) -> AdamWConfig: - return AdamWConfig( - lr=self.optimizer_lr, - betas=self.optimizer_betas, - eps=self.optimizer_eps, - weight_decay=self.optimizer_weight_decay, - grad_clip_norm=self.optimizer_grad_clip_norm, - ) - - def get_scheduler_preset(self): - return CosineDecayWithWarmupSchedulerConfig( - peak_lr=self.optimizer_lr, - decay_lr=self.scheduler_decay_lr, - num_warmup_steps=self.scheduler_warmup_steps, - num_decay_steps=self.scheduler_decay_steps, - ) - - @property - def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations - return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, self.n_obs_gap)][::-1] - - @property - def action_delta_indices(self) -> list: - return list(range(self.chunk_size)) - - @property - def reward_delta_indices(self) -> None: - return None diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py deleted file mode 100644 index 732920af2..000000000 --- a/src/lerobot/policies/smolpi0/flex_attention.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn.functional as F # noqa: N812 -from packaging.version import Version - -if Version(torch.__version__) > Version("2.5.0"): - # Ffex attention is only available from torch 2.5 onwards - from torch.nn.attention.flex_attention import ( - _mask_mod_signature, - _round_up_to_multiple, - create_block_mask, - create_mask, - flex_attention, - ) - - -@torch.compile(dynamic=False) -def flex_attention_forward( - attention_mask: torch.Tensor, - batch_size: int, - head_dim: int, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - scaling=None, - num_att_heads: int = 8, - num_key_value_heads: int = 1, -): - """ - This is defined out of classes to make compile happy. - """ - - original_dtype = query_states.dtype - num_key_value_groups = num_att_heads // num_key_value_heads - key_states = key_states[:, :, :, None, :] - key_states = key_states.expand( - batch_size, key_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim - ) - key_states = key_states.reshape( - batch_size, key_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim - ) - - value_states = value_states[:, :, :, None, :] - value_states = value_states.expand( - batch_size, value_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim - ) - value_states = value_states.reshape( - batch_size, value_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # query_states = query_states.to(torch.float32) - # key_states = key_states.to(torch.float32) - # value_states = value_states.to(torch.float32) - - causal_mask = attention_mask - if causal_mask is not None: - causal_mask = causal_mask[:, None, :, : key_states.shape[2]] - - if causal_mask.shape[1] == 1 and query_states.shape[1] > 1: - causal_mask = causal_mask.expand(-1, query_states.shape[1], -1, -1) - - def precomputed_mask_factory(precomputed_mask: torch.Tensor) -> _mask_mod_signature: - def mask_mod(b, h, q_idx, kv_idx): - # Danger zone: if b,h,q_idx,kv_idx exceed the shape, device-side assert occurs. - return precomputed_mask[b][h][q_idx][kv_idx] - - return mask_mod - - b_mask, h_mask, q_len, kv_len = causal_mask.shape # The shape of your mask - - block_size = 128 # limitation of flex attention - q_len_rounded = _round_up_to_multiple(q_len, block_size) - kv_len_rounded = _round_up_to_multiple(kv_len, block_size) - - # *CRITICAL* we do need to expand here, else we get a CUDA index error - - pad_q = q_len_rounded - q_len - pad_k = kv_len_rounded - kv_len - if pad_q > 0 or pad_k > 0: - padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0) - else: - padded_causal_mask = causal_mask - mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask) - - mask_4d = create_mask( - mod_fn=mask_mod_fn_orig, - B=b_mask, - H=h_mask, - Q_LEN=q_len_rounded, - KV_LEN=kv_len_rounded, - device=causal_mask.device, - ) - - mask_mod_fn_padded = precomputed_mask_factory(mask_4d) - # FIXME(mshukor): compile mask torch.compile(create_block_mask) - create_block_mask_compiled = torch.compile(create_block_mask) - block_mask = create_block_mask_compiled( - mask_mod=mask_mod_fn_padded, - B=b_mask, - H=None, # - Q_LEN=q_len_rounded, - KV_LEN=kv_len_rounded, - BLOCK_SIZE=block_size, - device=causal_mask.device, - _compile=False, - ) - padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states - padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states - padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states - # mask is applied inside the kernel, ideally more efficiently than score_mod. - attn_output, attention_weights = flex_attention( - padded_query_states, - padded_key_states, - padded_value_states, - block_mask=block_mask, - enable_gqa=True, # because we shaped query/key states for GQA - scale=head_dim**-0.5 if scaling is None else scaling, - return_lse=True, - ) - - attn_output = attn_output.to(dtype=original_dtype) - attn_output = attn_output.transpose(1, 2).contiguous() # [B, Q_LEN, H, head_dim] - attn_output = attn_output.reshape( - batch_size, - -1, - attn_output.shape[2] * attn_output.shape[3], # merges [H, head_dim] - ) - return attn_output[:, :-pad_k, :] if pad_k > 0 else attn_output diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py deleted file mode 100644 index 765a5901a..000000000 --- a/src/lerobot/policies/smolpi0/modeling_smolpi0.py +++ /dev/null @@ -1,1190 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Ο€0: A Vision-Language-Action Flow Model for General Robot Control - -[Paper](https://www.physicalintelligence.company/download/pi0.pdf) -[Jax code](https://github.com/Physical-Intelligence/openpi) - -Designed by Physical Intelligence. Ported from Jax by Hugging Face. - -Install pi0 extra dependencies: -```bash -pip install -e ".[pi0]" -``` - -Example of finetuning the pi0 pretrained model (`pi0_base` in `openpi`): -```bash -python lerobot/scripts/train.py \ ---policy.path=lerobot/pi0 \ ---dataset.repo_id=danaaubakirova/koch_test -``` - -Example of finetuning the pi0 neural network with PaliGemma and expert Gemma -pretrained with VLM default parameters before pi0 finetuning: -```bash -python lerobot/scripts/train.py \ ---policy.type=pi0 \ ---dataset.repo_id=danaaubakirova/koch_test -``` - -Example of using the pi0 pretrained model outside LeRobot training framework: -```python -policy = Pi0Policy.from_pretrained("lerobot/pi0") -``` - -""" - -import math -import os -import re -from collections import deque - -import safetensors -import torch -import torch.nn.functional as F # noqa: N812 -from torch import Tensor, nn -from transformers import AutoProcessor - -from lerobot.constants import ACTION, OBS_STATE -from lerobot.policies.normalize import ( - Normalize, - NormalizePerRobotType, - Unnormalize, - UnnormalizePerRobotType, -) -from lerobot.policies.pretrained import PreTrainedPolicy -from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config -from lerobot.policies.smolpi0.smolvlm_with_expert import SmolVLMWithExpertModel -from lerobot.utils.utils import get_safe_dtype - -OBS_IMAGE = "observation.image" -OBS_IMAGES = "observation.images" -ACTION = "action" -OBS_IMAGE_2 = "observation.image2" -OBS_IMAGE_3 = "observation.image3" -OBS_IMAGE_4 = "observation.image4" -TASK = "task" -ROBOT = "robot_type" -IMAGES_ORDER = { - OBS_IMAGE: 0, - OBS_IMAGE_2: 1, - OBS_IMAGE_3: 2, - OBS_IMAGE_4: 3, -} -import random - -from lerobot.policies.utils import ( - populate_queues, -) - - -def create_sinusoidal_pos_embedding( - time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" -) -> Tensor: - """Computes sine-cosine positional embedding vectors for scalar positions.""" - if dimension % 2 != 0: - raise ValueError(f"dimension ({dimension}) must be divisible by 2") - - if time.ndim != 1: - raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") - - dtype = get_safe_dtype(torch.float64, device.type) - fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) - period = min_period * (max_period / min_period) ** fraction - - # Compute the outer product - scaling_factor = 1.0 / period * 2 * math.pi - sin_input = scaling_factor[None, :] * time[:, None] - pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) - return pos_emb - - -def sample_beta(alpha, beta, bsize, device): - gamma1 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / alpha) - gamma2 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / beta) - return gamma1 / (gamma1 + gamma2) - - -def make_att_2d_masks(pad_masks, att_masks): - """Copied from big_vision. - - Tokens can attend to valid inputs tokens which have a cumulative mask_ar - smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to - setup several types of attention, for example: - - [[1 1 1 1 1 1]]: pure causal attention. - - [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between - themselves and the last 3 tokens have a causal attention. The first - entry could also be a 1 without changing behaviour. - - [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a - block can attend all previous blocks and all tokens on the same block. - - Args: - input_mask: bool[B, N] true if its part of the input, false if padding. - mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on - it and 0 where it shares the same attention mask as the previous token. - """ - if att_masks.ndim != 2: - raise ValueError(att_masks.ndim) - if pad_masks.ndim != 2: - raise ValueError(pad_masks.ndim) - - cumsum = torch.cumsum(att_masks, dim=1) - att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] - pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] - att_2d_masks = att_2d_masks & pad_2d_masks - return att_2d_masks - - -def resize_with_pad(img, width, height, pad_value=-1): - # assume no-op when width height fits already - if img.ndim != 4: - raise ValueError(f"(b,c,h,w) expected, but {img.shape}") - - cur_height, cur_width = img.shape[2:] - - ratio = max(cur_width / width, cur_height / height) - resized_height = int(cur_height / ratio) - resized_width = int(cur_width / ratio) - resized_img = F.interpolate( - img, size=(resized_height, resized_width), mode="bilinear", align_corners=False - ) - - pad_height = max(0, int(height - resized_height)) - pad_width = max(0, int(width - resized_width)) - - # pad on left and top of image - padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) - return padded_img - - -_VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") - - -def canonicalise(k: str) -> str: - """ - Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a - normalisation-buffer key. - """ - return _VARIANT_RE.sub(".buffer_", k) - - -def standardise_state_dict( - checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True -) -> tuple[dict[str, torch.Tensor], list[str]]: - """ - β€’ Re-keys `checkpoint ` so that every entry matches the *reference* key set. - β€’ If several variant keys collapse to the same canonical name we keep the - first one and log the collision. - β€’ Returns the new dict + a list of entries that could not be matched. - """ - out, collisions, unmatched = {}, {}, [] - - for k, v in checkpoint.items(): - canon = canonicalise(k) - if canon in ref_keys: - if canon in out: # duplicate after collapsing - collisions.setdefault(canon, []).append(k) - else: - out[canon] = v - else: - unmatched.append(k) - - if verbose: - for canon, variants in collisions.items(): - print(f"[standardise_state_dict] '{canon}' ← {variants}") - if unmatched: - print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys") - - out.update({k: checkpoint[k] for k in unmatched}) - return out, unmatched - - -def load_smolvla( - model: torch.nn.Module, - filename: str | os.PathLike, - *, - device: str = "cpu", - checkpoint_keys_mapping: str = "", -) -> torch.nn.Module: - state_dict = safetensors.torch.load_file(filename, device=device) - - # Optional user-supplied renames (e.g. "model._orig_mod.//model.") - if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping: - state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping) - - state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys())) - - # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset - norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs") - state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)} - - missing, unexpected = model.load_state_dict(state_dict, strict=False) - if not all(key.startswith(norm_keys) for key in missing) or unexpected: - raise RuntimeError( - "SmolVLA %d missing / %d unexpected keys", - len(missing), - len(unexpected), - ) - - return model - - -def pad_vector(vector, new_dim): - """Can be (batch_size x sequence_length x features_dimension) - or (batch_size x features_dimension) - """ - if vector.shape[-1] == new_dim: - return vector - shape = list(vector.shape) - current_dim = shape[-1] - shape[-1] = new_dim - new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) - new_vector[..., :current_dim] = vector - return new_vector - - -def normalize(x, min_val, max_val): - return (x - min_val) / (max_val - min_val) - - -def unnormalize(x, min_val, max_val): - return x * (max_val - min_val) + min_val - - -def safe_arcsin(value): - # This ensures that the input stays within - # [βˆ’1,1] to avoid invalid values for arcsin - return torch.arcsin(torch.clamp(value, -1.0, 1.0)) - - -def aloha_gripper_to_angular(value): - # Aloha transforms the gripper positions into a linear space. The following code - # reverses this transformation to be consistent with pi0 which is pretrained in - # angular space. - # - # These values are coming from the Aloha code: - # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED - value = unnormalize(value, min_val=0.01844, max_val=0.05800) - - # This is the inverse of the angular to linear transformation inside the Interbotix code. - def linear_to_radian(linear_position, arm_length, horn_radius): - value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) - return safe_arcsin(value) - - # The constants are taken from the Interbotix code. - value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) - - # Normalize to [0, 1]. - # The values 0.4 and 1.5 were measured on an actual Trossen robot. - return normalize(value, min_val=0.4, max_val=1.5) - - -def rename_checkpoint_keys(checkpoint: dict, rename_str: str): - """ - Renames keys in a checkpoint dictionary based on the given rename string. - - Args: - checkpoint (dict): The checkpoint dictionary. - rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2". - - Returns: - dict: The modified checkpoint with renamed keys. - """ - - rename_dict = dict(pair.split("//") for pair in rename_str.split(",")) - - new_checkpoint = {} - for k, v in checkpoint.items(): - for old_key, new_key in rename_dict.items(): - if old_key in k: - k = k.replace(old_key, new_key) - new_checkpoint[k] = v - return new_checkpoint - - -def aloha_gripper_from_angular(value): - # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha. - # Note that the units are still angular but the range is different. - - # The values 0.4 and 1.5 were measured on an actual Trossen robot. - value = unnormalize(value, min_val=0.4, max_val=1.5) - - # These values are coming from the Aloha code: - # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE - return normalize(value, min_val=-0.6213, max_val=1.4910) - - -def aloha_gripper_from_angular_inv(value): - # Directly inverts the gripper_from_angular function. - value = unnormalize(value, min_val=-0.6213, max_val=1.4910) - return normalize(value, min_val=0.4, max_val=1.5) - - -class SMOLPI0Policy(PreTrainedPolicy): - """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" - - config_class = SMOLPI0Config - name = "smolpi0" - - def __init__( - self, - config: SMOLPI0Config, - dataset_stats: dict[str, dict[str, Tensor]] | None = None, - ): - """ - Args: - config: Policy configuration class instance or None, in which case the default instantiation of - the configuration class is used. - dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected - that they will be passed with a call to `load_state_dict` before the policy is used. - """ - - super().__init__(config) - config.validate_features() - self.config = config - self.normalize_per_robot_type = getattr( - config, "normalize_per_robot_type", False - ) # FIXME(mshukor): assert in case of single dataset - if self.normalize_per_robot_type: - if not dataset_stats: - dataset_stats[config.robot_type] = {} - self.normalize_inputs = NormalizePerRobotType( - config.input_features, config.normalization_mapping, dataset_stats - ) - self.normalize_targets = NormalizePerRobotType( - config.output_features, config.normalization_mapping, dataset_stats - ) - self.unnormalize_outputs = UnnormalizePerRobotType( - config.output_features, config.normalization_mapping, dataset_stats - ) - else: - self.normalize_inputs = Normalize( - config.input_features, config.normalization_mapping, dataset_stats - ) - self.normalize_targets = Normalize( - config.output_features, config.normalization_mapping, dataset_stats - ) - self.unnormalize_outputs = Unnormalize( - config.output_features, config.normalization_mapping, dataset_stats - ) - - self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer - self.model = VLAFlowMatching(config) - self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split( - "," - ) - self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",") - self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 - self.reset() - - def reset(self): - """This should be called whenever the environment is reset.""" - # self._action_queue = deque([], maxlen=self.config.n_action_steps) - self._queues = { - ACTION: deque(maxlen=self.config.n_action_steps), - } - if self.config.n_obs_steps > 1: - for k in self.config.input_features: - if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]): - self._queues[k] = deque(maxlen=self.config.n_obs_steps) - - def get_optim_params(self) -> dict: - if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr: - params = [ - {"params": [p for n, p in self.named_parameters() if ".vlm." not in n and p.requires_grad]}, - { - "params": [p for n, p in self.named_parameters() if ".vlm." in n and p.requires_grad], - "lr": self.config.optimizer_lr_vlm, - }, - ] - return params - - else: - return self.parameters() - - def merge_peft_model_weights(self) -> None: - if "lora" in self.config.peft_method: - self.model.vlm_with_expert.merge_lora_weights() - - @torch.no_grad - def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: - """Select a single action given environment observations. - - This method wraps `select_actions` in order to return one action at a time for execution in the - environment. It works by managing the actions in a queue and only calling `select_actions` when the - queue is empty. - """ - self.eval() - - if self.config.adapt_to_pi_aloha: - batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) - - batch = self.normalize_inputs(batch) - - images, img_masks = self.prepare_images(batch) - state = self.prepare_state(batch) - lang_tokens, lang_masks = self.prepare_language(batch) - - actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise) - # Unpad actions - original_action_dim = self.config.action_feature.shape[0] - actions = actions[:, :, :original_action_dim] - - actions = self.unnormalize_outputs({"action": actions, "robot_type": batch["robot_type"]})["action"] - - if self.config.adapt_to_pi_aloha: - actions = self._pi_aloha_encode_actions(actions) - - return actions - - # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues - @classmethod - def _load_as_safetensor( - cls, - model: "SmolVLAPolicy", - model_file: str, - map_location: str, - strict: bool, - **kwargs, - ): - safetensors.torch.load_model(model, model_file, strict=strict, device=map_location) - return load_smolvla( - model, - model_file, - device=map_location, - checkpoint_keys_mapping="model._orig_mod.//model.", - ) - - @torch.no_grad - def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: - """Select a single action given environment observations. - - This method wraps `select_actions` in order to return one action at a time for execution in the - environment. It works by managing the actions in a queue and only calling `select_actions` when the - queue is empty. - """ - self.eval() - - if self.config.adapt_to_pi_aloha: - batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) - - batch = self.normalize_inputs(batch) - - self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) - # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by - # querying the policy. - if len(self._queues[ACTION]) == 0: - for k in batch: - if k in self._queues: - batch[k] = torch.stack(list(self._queues[k]), dim=1) - images, img_masks = self.prepare_images(batch) - state = self.prepare_state(batch) - lang_tokens, lang_masks = self.prepare_language(batch) - actions = self.model.sample_actions( - images, img_masks, lang_tokens, lang_masks, state, noise=noise - ) - if self.config.predict_relative_actions and actions.ndim == 3: - # If the model predicts relative actions, we need to unpad the actions - # and then convert them to absolute actions. - if self.config.relative_actions_mode == "first": - actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :1]), dim=1) - elif self.config.relative_actions_mode == "state": - actions = actions + state.unsqueeze(1) - else: - actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :-1]), dim=1) - # Unpad actions - original_action_dim = self.config.action_feature.shape[0] - actions = actions[:, :, :original_action_dim] - - actions = self.unnormalize_outputs({"action": actions})["action"] - - if self.config.adapt_to_pi_aloha: - actions = self._pi_aloha_encode_actions(actions) - - # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue - # effectively has shape (n_action_steps, batch_size, *), hence the transpose. - self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) - return self._queues[ACTION].popleft() - - def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: - """Do a full training forward pass to compute the loss""" - if self.config.adapt_to_pi_aloha: - batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) - batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) - batch = self.normalize_inputs(batch) - batch = self.normalize_targets(batch) - images, img_masks = self.prepare_images( - batch - ) # FIXME(mshukor): adapte it to take into account already padded images in the batch - state = self.prepare_state(batch) - lang_tokens, lang_masks = self.prepare_language(batch) - actions = self.prepare_action(batch, state=state) - actions_is_pad = batch.get("actions_id_pad") - loss_dict = {} - losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) - loss_dict["losses_after_forward"] = losses.mean().clone() - - if actions_is_pad is not None: - in_episode_bound = ~actions_is_pad - losses = losses * in_episode_bound.unsqueeze(-1) - loss_dict["losses_after_in_ep_bound"] = losses.mean().clone() - - # Remove padding - losses = losses[:, :, : self.config.max_action_dim] - loss_dict["losses_after_rm_padding"] = losses.mean().clone() - - # For backward pass - loss = losses.mean() - # For backward pass - loss_dict["loss"] = loss - # # For logging - # loss_dict["l2_loss"] = loss.item() # remove for torch compile - return loss_dict - - def prepare_images(self, batch): - """Apply Pi0 preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and - convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. - """ - images = [] - img_masks = [] - present_img_keys = [key for key in self.config.image_features if key in batch] - missing_img_keys = [key for key in self.config.image_features if key not in batch] - - present_img_keys = sorted( - present_img_keys, - key=lambda k: IMAGES_ORDER.get(k, float("inf")), - reverse=self.config.reverse_images_order, - ) - if self.config.shuffle_camera_positions and ACTION in batch: # only during training - present_img_keys = random.sample(present_img_keys, len(present_img_keys)) - if len(present_img_keys) == 0: - raise ValueError( - f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" - ) - for i in range(self.num_past_images): - # Preprocess image features present in the batch - for key in present_img_keys: - img = batch[key][:, i, :, :, :] if batch[key].ndim == 5 else batch[key] - if self.config.resize_imgs_with_padding is not None: - img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) - - # Normalize from range [0,1] to [-1,1] as expacted by siglip - img = img * 2.0 - 1.0 - - bsize = img.shape[0] - device = img.device - if f"{key}_padding_mask" in batch: - mask = batch[f"{key}_padding_mask"].bool() - else: - mask = torch.ones(bsize, dtype=torch.bool, device=device) - images.append(img) - img_masks.append(mask) - - # Create image features not present in the batch - # as fully 0 padded images. - for num_empty_cameras in range(len(missing_img_keys)): - if num_empty_cameras >= self.config.empty_cameras: - break - img = torch.ones_like(img) * -1 - mask = torch.zeros_like(mask) - images.append(img) - img_masks.append(mask) - return images, img_masks - - def prepare_language(self, batch) -> tuple[Tensor, Tensor]: - """Tokenize the text input""" - device = batch[OBS_STATE].device - tasks = batch["task"] - if len(tasks) == 1: - tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] - - if self.config.add_prompt_template: - tasks = [ - f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" - for task in tasks - ] - else: - tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] - tokenized_prompt = self.language_tokenizer.__call__( - tasks, - padding=self.config.pad_language_to, - padding_side="right", - max_length=self.config.tokenizer_max_length, - return_tensors="pt", - truncation=True, # FIXME(mshukor) - ) - - lang_tokens = tokenized_prompt["input_ids"].to(device=device) - lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) - - return lang_tokens, lang_masks - - def _pi_aloha_decode_state(self, state): - # Flip the joints. - for motor_idx in [1, 2, 8, 9]: - state[:, motor_idx] *= -1 - # Reverse the gripper transformation that is being applied by the Aloha runtime. - for motor_idx in [6, 13]: - state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) - return state - - def _pi_aloha_encode_actions(self, actions): - # Flip the joints. - for motor_idx in [1, 2, 8, 9]: - actions[:, :, motor_idx] *= -1 - # Reverse the gripper transformation that is being applied by the Aloha runtime. - for motor_idx in [6, 13]: - actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) - return actions - - def _pi_aloha_encode_actions_inv(self, actions): - # Flip the joints again. - for motor_idx in [1, 2, 8, 9]: - actions[:, :, motor_idx] *= -1 - # Reverse the gripper transformation that is being applied by the Aloha runtime. - for motor_idx in [6, 13]: - actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) - return actions - - def prepare_state(self, batch): - """Pad state""" - state = ( - batch[OBS_STATE][:, -1, :] - if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) - else batch[OBS_STATE] - ) # FIXME(mshukor): no state history for now - state = pad_vector(state, self.config.max_state_dim) - return state - - def prepare_action(self, batch, state=None): - """Pad action""" - actions = pad_vector(batch[ACTION], self.config.max_action_dim) - if self.config.predict_relative_actions and actions.ndim == 3: - if self.config.relative_actions_mode == "first": - actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1) - elif self.config.relative_actions_mode == "state": - assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], ( - "Relative action mode 'state' requires the action and state to have the same dimension." - ) - if state.ndim == 2: - state = state.unsqueeze(1) - actions = actions - state - else: - actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1) - return actions - - -def pad_tensor(tensor, max_len, pad_value=0): - """ - Efficiently pads a tensor along sequence dimension to match max_len. - - Args: - tensor (torch.Tensor): Shape (B, L, ...) or (B, L). - max_len (int): Fixed sequence length. - pad_value (int/float): Value for padding. - - Returns: - torch.Tensor: Shape (B, max_len, ...) or (B, max_len). - """ - B, L = tensor.shape[:2] - - # Create a padded tensor of max_len and copy the existing values - padded_tensor = torch.full( - (B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device - ) - padded_tensor[:, :L] = tensor # Efficient in-place copy - - return padded_tensor - - -class VLAFlowMatching(nn.Module): - """ - Ο€0: A Vision-Language-Action Flow Model for General Robot Control - - [Paper](https://www.physicalintelligence.company/download/pi0.pdf) - [Jax code](https://github.com/Physical-Intelligence/openpi) - - Designed by Physical Intelligence. Ported from Jax by Hugging Face. - β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ actions β”‚ - β”‚ β–² β”‚ - β”‚ β”Œβ”΄β”€β”€β”€β”€β”€β” β”‚ - β”‚ kv cache β”‚Gemma β”‚ β”‚ - β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Ίβ”‚Expertβ”‚ β”‚ - β”‚ β”‚ β”‚ β”‚ β”‚ - β”‚ β”Œβ”΄β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚x 10 β”‚ β”‚ - β”‚ β”‚ β”‚ β””β–²β”€β”€β–²β”€β”€β”˜ β”‚ - β”‚ β”‚ VLM β”‚ β”‚ β”‚ β”‚ - β”‚ β”‚ β”‚ β”‚ robot state β”‚ - β”‚ β”‚ β”‚ noise β”‚ - β”‚ β””β–²β”€β”€β–²β”€β”€β”€β”€β”€β”˜ β”‚ - β”‚ β”‚ β”‚ β”‚ - β”‚ β”‚ image(s) β”‚ - β”‚ language tokens β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - """ - - def __init__(self, config): - super().__init__() - self.config = config - - self.vlm_with_expert = SmolVLMWithExpertModel( - model_id=self.config.vlm_model_name, - freeze_vision_encoder=self.config.freeze_vision_encoder, - train_expert_only=self.config.train_expert_only, - attention_implementation=self.config.attention_implementation, - load_vlm_weights=self.config.load_vlm_weights, - attention_mode=self.config.attention_mode, - num_expert_layers=self.config.num_expert_layers, - num_vlm_layers=self.config.num_vlm_layers, - self_attn_every_n_layers=self.config.self_attn_every_n_layers, - expert_width_multiplier=self.config.expert_width_multiplier, - self_attn_only_actions=self.config.self_attn_only_actions, - ) - # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert) - self.vlm_with_expert.configure_peft(config=self.config) - # Projections are float32 - self.state_to_prefix = self.config.state_to_prefix - if self.state_to_prefix: - self.state_proj = nn.Linear( - self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size - ) - else: - self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size) - self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) - self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) - - self.action_time_mlp_in = nn.Linear( - self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size - ) - self.action_time_mlp_out = nn.Linear( - self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size - ) - - self.set_requires_grad() - # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ... - if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]): - if "SmolVLM-Instruct" in self.config.vlm_model_name: - self.fake_image_token = 49152 - self.global_image_token = [44, 13906, 29, 6266, 46] - self.global_image_start_token = torch.tensor( - [self.fake_image_token] + self.global_image_token, dtype=torch.long - ) - else: - self.fake_image_token = 49189 - self.global_image_token = 49152 - self.global_image_start_token = torch.tensor( - [self.fake_image_token, self.global_image_token], dtype=torch.long - ) - else: - self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id - self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id - self.global_image_start_token = torch.tensor( - [self.fake_image_token, self.global_image_token], dtype=torch.long - ) - - self.add_image_special_tokens = self.config.add_image_special_tokens - self.add_local_special_image_tokens = self.config.add_local_special_image_tokens - self.local_image_tokens = [ - torch.tensor([self.fake_image_token, tok], dtype=torch.long) - for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167] - ] # assume 3 x 3 grid - - self.local_image_start_token = self.global_image_start_token - self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) - self.prefix_length = self.config.prefix_length - self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split( - "," - ) - self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1 - self.causal_attention_on_history = self.config.causal_attention_on_history - - # def configure_peft(self, model): - # # return model - # self.peft_method = self.config.peft_method - # if "lora" in self.peft_method: - # peft_config = self.config.peft_config - # target_modules = peft_config.target_modules - # if not isinstance(target_modules, list): - # target_modules = target_modules.split(",") - # lora_config = LoraConfig( - # task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) - # r=peft_config.r, # The rank of the low-rank adaptation - # lora_alpha=peft_config.lora_alpha, # Scaling factor - # lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers - # target_modules=target_modules, # The components where LoRA is applied - # exclude_modules=["gemma_expert", "model.gemma_expert.model.layers"], # FIXME(mshukor): this does not work for now - # ) - # # LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=1, lora_dropout=0, target_modules=["q_proj"], exclude_modules=["gemma_expert"]) - # self.lora_config = lora_config - # # Apply LoRA and ensure only LoRA parameters are trainable - - # model = get_peft_model(model, lora_config) - # assert self.config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? - # for name, param in model.named_parameters(): - # if ( - # "lora" in name - # ): # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer - # param.requires_grad = True - # return model - - def set_requires_grad(self): - for params in self.state_proj.parameters(): - params.requires_grad = self.config.train_state_proj - - def sample_noise(self, shape, device): - noise = torch.normal( - mean=0.0, - std=1.0, - size=shape, - dtype=torch.float32, - device=device, - ) - return noise - - def sample_time(self, bsize, device): - time_beta = sample_beta(1.5, 1.0, bsize, device) - time = time_beta * 0.999 + 0.001 - return time.to(dtype=torch.float32, device=device) - - def embed_prefix( - self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Embed images with SigLIP and language tokens with embedding layer to prepare - for SmolVLM transformer processing. - """ - # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty - embs = [] - pad_masks = [] - att_masks = [] - num_images = len(images) // self.num_past_images - # TODO: remove for loop - for img_idx, ( - img, - img_mask, - ) in enumerate(zip(images, img_masks, strict=False)): - # FIXME(mshukor): add special tokens for the history each history_steps or not - if self.add_image_special_tokens: - if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1: - local_token_idx = img_idx % num_images - image_start_token = ( - self.vlm_with_expert.embed_language_tokens( - self.local_image_tokens[local_token_idx].to( - device=self.vlm_with_expert.vlm.device - ) - ) - .unsqueeze(0) - .expand(img.shape[0], -1, -1) - ) - else: - image_start_token = ( - self.vlm_with_expert.embed_language_tokens( - self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device) - ) - .unsqueeze(0) - .expand(img.shape[0], -1, -1) - ) - image_start_mask = torch.ones_like( - image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device - ) - if self.causal_attention_on_history and img_idx % num_images == 0: - att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1) - else: - att_masks += [0] * (image_start_mask.shape[-1]) - embs.append(image_start_token) - pad_masks.append(image_start_mask) - - img_emb = self.vlm_with_expert.embed_image(img) - img_emb = img_emb # .to(dtype=self.vlm_with_expert.type) - - # Normalize image embeddings - img_emb_dim = img_emb.shape[-1] - img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) - - bsize, num_img_embs = img_emb.shape[:2] - img_mask = img_mask[:, None].expand(bsize, num_img_embs) - - # FIXME(mshukor): add special image tokens. Assume no tiling fake global images fake - # template <|im_start|>User: What actions? image tokens \nAssistant: or processor.apply_chat_template? - # processor.fake_image_token - # processor.global_image_token - - embs.append(img_emb) - pad_masks.append(img_mask) - - att_masks += [0] * (num_img_embs) - if self.add_image_special_tokens: - if not self.add_local_special_image_tokens or ( - self.add_local_special_image_tokens and img_idx % num_images == num_images - 1 - ): - image_end_token = ( - self.vlm_with_expert.embed_language_tokens( - self.image_end_token.to(device=self.vlm_with_expert.vlm.device) - ) - .unsqueeze(0) - .expand(img.shape[0], -1, -1) - ) - image_end_mask = torch.ones_like( - image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device - ) - embs.append(image_end_token) - pad_masks.append(image_end_mask) - att_masks += [0] * (image_end_mask.shape[1]) - lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) - # Normalize language embeddings - lang_emb_dim = lang_emb.shape[-1] - lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm? - - embs.append(lang_emb) - pad_masks.append(lang_masks) - - # full attention between image and language inputs - num_lang_embs = lang_emb.shape[1] - att_masks += [0] * num_lang_embs - - if state is not None and self.state_to_prefix: - state_emb = self.state_proj(state) - state_emb = ( - state_emb[:, None, :] if state_emb.ndim == 2 else state_emb - ) # .to(dtype=self.vlm_with_expert.type) - embs.append(state_emb) - bsize = state_emb.shape[0] - dtype = state_emb.dtype - device = state_emb.device - - states_seq_len = state_emb.shape[1] - state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) - pad_masks.append(state_mask) - - # Set attention masks so that image and language inputs do not attend to state or actions - # att_masks += [1] + [0]*(states_seq_len - 1) - att_masks += [1] * (states_seq_len) - embs = torch.cat(embs, dim=1) - pad_masks = torch.cat(pad_masks, dim=1) - att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) - att_masks = att_masks[None, :] - - seq_len = pad_masks.shape[1] - if seq_len < self.prefix_length: - embs = pad_tensor(embs, self.prefix_length, pad_value=0) - pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) - att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) - - att_masks = att_masks.expand(bsize, -1) - - return embs, pad_masks, att_masks - - def embed_suffix(self, state, noisy_actions, timestep): - """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" - embs = [] - pad_masks = [] - att_masks = [] - - # Embed state - if not self.state_to_prefix: - state_emb = self.state_proj(state) - state_emb = ( - state_emb[:, None, :] if state_emb.ndim == 2 else state_emb - ) # .to(dtype=self.vlm_with_expert.type) - embs.append(state_emb) - bsize = state_emb.shape[0] - dtype = state_emb.dtype - device = state_emb.device - - states_seq_len = state_emb.shape[1] - state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) - pad_masks.append(state_mask) - - # Set attention masks so that image and language inputs do not attend to state or actions - att_masks += [1] + [0] * (states_seq_len - 1) - - # Fuse timestep + action information using an MLP - action_emb = self.action_in_proj(noisy_actions) - device = action_emb.device - bsize = action_emb.shape[0] - dtype = action_emb.dtype - # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] - time_emb = create_sinusoidal_pos_embedding( - timestep, self.vlm_with_expert.expert_hidden_size, min_period=4e-3, max_period=4.0, device=device - ) - time_emb = time_emb.type(dtype=dtype) - - time_emb = time_emb[:, None, :].expand_as(action_emb) - action_time_emb = torch.cat([action_emb, time_emb], dim=2) - - action_time_emb = self.action_time_mlp_in(action_time_emb) - action_time_emb = F.silu(action_time_emb) # swish == silu - action_time_emb = self.action_time_mlp_out(action_time_emb) - - # Add to input tokens - embs.append(action_time_emb) - - bsize, action_time_dim = action_time_emb.shape[:2] - action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) - pad_masks.append(action_time_mask) - - # Set attention masks so that image, language and state inputs do not attend to action tokens - if self.config.causal_action_attention_mask: - att_masks += [1] * self.config.chunk_size - else: - att_masks += [1] + ([0] * (self.config.chunk_size - 1)) - embs = torch.cat(embs, dim=1) - pad_masks = torch.cat(pad_masks, dim=1) - att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) - att_masks = att_masks[None, :].expand(bsize, len(att_masks)) - return embs, pad_masks, att_masks - - def forward( - self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None - ) -> Tensor: - """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" - if noise is None: - noise = self.sample_noise(actions.shape, actions.device) - - if time is None: - time = self.sample_time(actions.shape[0], actions.device) - - time_expanded = time[:, None, None] - if self.config.regression_loss: - # Hack to compare regression to flow matching - time = torch.zeros_like(time, dtype=time.dtype, device=time.device) - x_t = torch.zeros_like(actions, dtype=actions.dtype, device=actions.device) - u_t = actions - else: - x_t = time_expanded * noise + (1 - time_expanded) * actions - u_t = noise - actions - prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( - images, img_masks, lang_tokens, lang_masks, state=state - ) - suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time) - - pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) - att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) - - att_2d_masks = make_att_2d_masks(pad_masks, att_masks) - position_ids = torch.cumsum(pad_masks, dim=1) - 1 - (_, suffix_out), _ = self.vlm_with_expert.forward( - attention_mask=att_2d_masks, - position_ids=position_ids, - past_key_values=None, - inputs_embeds=[prefix_embs, suffix_embs], - use_cache=False, - fill_kv_cache=False, - ) - suffix_out = suffix_out[:, -self.config.chunk_size :] - # Original openpi code, upcast attention output - suffix_out = suffix_out.to(dtype=torch.float32) - v_t = self.action_out_proj(suffix_out) - if self.config.regression_loss: - losses = F.l1_loss(u_t, v_t, reduction="none") - else: - losses = F.mse_loss(u_t, v_t, reduction="none") - return losses - - def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: - """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" - bsize = state.shape[0] - device = state.device - - if noise is None: - actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) - noise = self.sample_noise(actions_shape, device) - - prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( - images, img_masks, lang_tokens, lang_masks, state=state - ) - prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) - prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 - # Compute image and language key value cache - _, past_key_values = self.vlm_with_expert.forward( - attention_mask=prefix_att_2d_masks, - position_ids=prefix_position_ids, - past_key_values=None, - inputs_embeds=[prefix_embs, None], - use_cache=self.config.use_cache, - fill_kv_cache=True, - ) - if self.config.regression_loss: - x_t = torch.zeros_like(noise, dtype=torch.float32, device=device) - expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device) - x_t = self.denoise_step( - state, - prefix_pad_masks, - past_key_values, - x_t, - expanded_time, - ) - else: - dt = -1.0 / self.config.num_steps - dt = torch.tensor(dt, dtype=torch.float32, device=device) - - x_t = noise - time = torch.tensor(1.0, dtype=torch.float32, device=device) - while time >= -dt / 2: - expanded_time = time.expand(bsize) - v_t = self.denoise_step( - state, - prefix_pad_masks, - past_key_values, - x_t, - expanded_time, - ) - - # Euler step - x_t += dt * v_t - time += dt - return x_t - - def denoise_step( - self, - state, - prefix_pad_masks, - past_key_values, - x_t, - timestep, - ): - """Apply one denoising step of the noise `x_t` at a given timestep.""" - suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, timestep) - - suffix_len = suffix_pad_masks.shape[1] - batch_size = prefix_pad_masks.shape[0] - prefix_len = prefix_pad_masks.shape[1] - prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) - - suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) - - full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) - prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] - position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 - - outputs_embeds, _ = self.vlm_with_expert.forward( - attention_mask=full_att_2d_masks, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=[None, suffix_embs], - use_cache=self.config.use_cache, - fill_kv_cache=False, - ) - suffix_out = outputs_embeds[1] - suffix_out = suffix_out[:, -self.config.chunk_size :] - suffix_out = suffix_out.to(dtype=torch.float32) - v_t = self.action_out_proj(suffix_out) - return v_t diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py deleted file mode 100644 index 0ccdcccc8..000000000 --- a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py +++ /dev/null @@ -1,920 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -from functools import partial -from typing import List, Optional, Union - -import torch -import torch.nn.functional as F # noqa: N812 -import torch.version -from peft import LoraConfig, TaskType, get_peft_model -from pytest import Cache -from torch import nn -from transformers import ( - AutoConfig, - AutoModel, - AutoModelForImageTextToText, - AutoModelForVision2Seq, - AutoProcessor, - SmolVLMForConditionalGeneration, -) - -from lerobot.policies.smolpi0.flex_attention import flex_attention_forward - - -def _round_up_to_multiple(x, multiple): - return (x + multiple - 1) // multiple * multiple - - -def apply_rope(x, positions, max_wavelength=10_000): - """ - Applies RoPE positions [B, L] to x [B, L, H, D]. - """ - d_half = x.shape[-1] // 2 - device = x.device - dtype = x.dtype - x = x.to(torch.float32) - - freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device) - timescale = max_wavelength**freq_exponents - radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32) - - radians = radians[..., None, :] - - sin = torch.sin(radians) # .to(dtype=dtype) - cos = torch.cos(radians) # .to(dtype=dtype) - - x1, x2 = x.split(d_half, dim=-1) - res = torch.empty_like(x) - res[..., :d_half] = x1 * cos - x2 * sin - res[..., d_half:] = x2 * cos + x1 * sin - - return res.to(dtype) - - -# class SmolVLMWithExpertConfig(PretrainedConfig): -# model_type = "SmolVLMWithExpertModel" -# sub_configs = {"smolvlm_config": AutoConfig, "lm_expert_config": AutoConfig} - -# def __init__( -# self, -# smolvlm_config: dict | None = None, -# lm_expert_config: dict | None = None, -# freeze_vision_encoder: bool = True, -# train_expert_only: bool = True, -# attention_implementation: str = "eager", -# load_vlm_weights: bool = False, -# **kwargs, -# ): -# self.load_vlm_weights = load_vlm_weights -# self.freeze_vision_encoder = freeze_vision_encoder -# self.train_expert_only = train_expert_only -# self.attention_implementation = attention_implementation - -# if smolvlm_config is None: -# # Default config from Pi0 -# self.smolvlm_config = CONFIG_MAPPING["smolvlm"]( -# transformers_version="4.48.1", -# _vocab_size=257152, -# bos_token_id=2, -# eos_token_id=1, -# hidden_size=2048, -# image_token_index=257152, -# model_type="smolvlm", -# pad_token_id=0, -# projection_dim=2048, -# text_config={ -# "hidden_activation": "gelu_pytorch_tanh", -# "hidden_size": 2048, -# "intermediate_size": 16384, -# "model_type": "gemma", -# "num_attention_heads": 8, -# "num_hidden_layers": 18, -# "num_image_tokens": 256, -# "num_key_value_heads": 1, -# "torch_dtype": "float32", -# "vocab_size": 257152, -# }, -# vision_config={ -# "hidden_size": 1152, -# "intermediate_size": 4304, -# "model_type": "siglip_vision_model", -# "num_attention_heads": 16, -# "num_hidden_layers": 27, -# "num_image_tokens": 256, -# "patch_size": 14, -# "projection_dim": 2048, -# "projector_hidden_act": "gelu_fast", -# "torch_dtype": "float32", -# "vision_use_head": False, -# }, -# ) -# elif isinstance(self.paligemma_config, dict): -# # Override Pi0 default config for PaliGemma -# if "model_type" not in gemma_expert_config: -# paligemma_config["model_type"] = "paligemma" - -# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] -# self.paligemma_config = cfg_cls(**paligemma_config) - -# if gemma_expert_config is None: -# # Default config from Pi0 -# self.gemma_expert_config = CONFIG_MAPPING["gemma"]( -# attention_bias=False, -# attention_dropout=0.0, -# bos_token_id=2, -# eos_token_id=1, -# head_dim=256, -# hidden_act="gelu_pytorch_tanh", -# hidden_activation="gelu_pytorch_tanh", -# hidden_size=1024, -# initializer_range=0.02, -# intermediate_size=4096, -# max_position_embeddings=8192, -# model_type="gemma", -# num_attention_heads=8, -# num_hidden_layers=18, -# num_key_value_heads=1, -# pad_token_id=0, -# rms_norm_eps=1e-06, -# rope_theta=10000.0, -# torch_dtype="float32", -# transformers_version="4.48.1", -# use_cache=True, -# vocab_size=257152, -# ) -# elif isinstance(self.gemma_expert_config, dict): -# # Override Pi0 default config for Gemma Expert -# if "model_type" not in gemma_expert_config: -# gemma_expert_config["model_type"] = "gemma" - -# cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]] -# self.gemma_expert_config = cfg_cls(**gemma_expert_config) - -# super().__init__(**kwargs) - -# def __post_init__(self): -# super().__post_init__() -# if self.train_expert_only and not self.freeze_vision_encoder: -# raise ValueError( -# "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible." -# ) - -# if self.attention_implementation not in ["eager", "fa2", "flex"]: -# raise ValueError( -# f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'." -# ) - - -def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256): - hidden_dim = int(2 * hidden_dim / 3) - hidden_dim = int(ffn_dim_multiplier * hidden_dim) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - return hidden_dim - - -class SmolVLMWithExpertModel(nn.Module): - # config_class = PaliGemmaWithExpertConfig - - def __init__( - self, - model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", - load_vlm_weights: bool = True, - train_expert_only: bool = True, - freeze_vision_encoder: bool = False, - attention_implementation: str = "eager", - attention_mode: str = "self_attn", - num_expert_layers: int = -1, - num_vlm_layers: int = -1, - self_attn_every_n_layers: int = -1, - expert_width_multiplier: float = 0.5, - self_attn_only_actions: bool = False, - ): - super().__init__() - if load_vlm_weights: - print(f"Loading {model_id} weights ...") - if "SmolVLM-" in model_id: - self.vlm = AutoModelForVision2Seq.from_pretrained( - model_id, - device_map="cuda", - torch_dtype="bfloat16", - low_cpu_mem_usage=True, - ) - else: - # model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" - self.vlm = AutoModelForImageTextToText.from_pretrained( - model_id, - device_map="cuda", - torch_dtype="bfloat16", - low_cpu_mem_usage=True, - # attn_implementation="eager", - # attn_implementation="flash_attention_2" - ) - config = self.vlm.config - else: - config = AutoConfig.from_pretrained(model_id) - self.vlm = SmolVLMForConditionalGeneration(config=config) - self.processor = AutoProcessor.from_pretrained(model_id) - if num_vlm_layers > 0: - print(f"Reducing the number of VLM layers to {num_vlm_layers} ...") - self.get_vlm_model().text_model.layers = self.get_vlm_model().text_model.layers[:num_vlm_layers] - self.num_vlm_layers = len(self.get_vlm_model().text_model.layers) - self.config = config - # Smaller lm expert - lm_expert_config = copy.deepcopy(config.text_config) - hidden_size = lm_expert_config.hidden_size - lm_expert_config.hidden_size = int(hidden_size * expert_width_multiplier) # hidden_size // 2 - lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size * expert_width_multiplier)) - lm_expert_config.num_hidden_layers = self.num_vlm_layers - if num_expert_layers > 0: - assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, ( - f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}" - ) - lm_expert_config.num_hidden_layers = num_expert_layers - # lm_expert_config.head_dim = lm_expert_config.head_dim * 2 - self.lm_expert = AutoModel.from_config(lm_expert_config) - - self.num_expert_layers = len(self.lm_expert.layers) - self.self_attn_every_n_layers = self_attn_every_n_layers - self.self_attn_only_actions = self_attn_only_actions - if "cross" in attention_mode: - # Reshape qkv projections to have the same input dimension as the vlm - for layer_idx in range(len(self.lm_expert.layers)): - if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0: - continue - self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear( - config.text_config.num_key_value_heads * config.text_config.head_dim, - lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, - bias=lm_expert_config.attention_bias, - ) - self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear( - config.text_config.num_key_value_heads * config.text_config.head_dim, - lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, - bias=lm_expert_config.attention_bias, - ) - # Remove unused embed_tokens - self.lm_expert.embed_tokens = None - - self.num_attention_heads = self.config.text_config.num_attention_heads - self.num_key_value_heads = self.config.text_config.num_key_value_heads - - self.freeze_vision_encoder = freeze_vision_encoder - self.train_expert_only = train_expert_only - self.attention_implementation = attention_implementation - self.attention_mode = attention_mode - self.expert_hidden_size = lm_expert_config.hidden_size - # self.to_bfloat16_like_physical_intelligence() - self.set_requires_grad() - - def configure_peft(self, config): - # return model - self.peft_method = config.peft_method - self.peft_target_model = config.peft_target_model - if "lora" in self.peft_method: - peft_config = config.peft_config - target_modules = peft_config.target_modules - if not isinstance(target_modules, list): - target_modules = target_modules.split(",") - lora_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, # Based on the task type (e.g., language modeling, etc.) - r=peft_config.r, # The rank of the low-rank adaptation - lora_alpha=peft_config.lora_alpha, # Scaling factor - lora_dropout=peft_config.lora_dropout, # Dropout applied to LoRA layers - target_modules=target_modules, # The components where LoRA is applied - exclude_modules=[ - "lm_expert", - "model.lm_expert.model.layers", - ], # FIXME(mshukor): this does not work for now - ) - self.lora_config = lora_config - # Apply LoRA and ensure only LoRA parameters are trainable - if "text" in self.peft_target_model: - self.get_vlm_model().text_model = get_peft_model(self.get_vlm_model().text_model, lora_config) - else: - self.vlm = get_peft_model(self.vlm, lora_config) - # assert config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here? - for name, param in self.vlm.named_parameters(): - if ( - "lora" in name and "text_model.model.layers.17" not in name - ): # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer - param.requires_grad = True - else: - param.requires_grad = False - - def merge_lora_weights(self): - """ - Merge LoRA weights into the base model. - """ - if "text" in self.peft_target_model: - self.get_vlm_model().text_model = self.get_vlm_model().text_model.merge_and_unload() - else: - self.vlm = self.vlm.merge_and_unload() - - def get_vlm_model( - self, - ): - if hasattr(self.vlm.model, "model"): # When using peft - return self.vlm.model.model - else: - return self.vlm.model - - def set_requires_grad(self): - if self.freeze_vision_encoder: - self.get_vlm_model().vision_model.eval() - for params in self.get_vlm_model().vision_model.parameters(): - params.requires_grad = False - if self.train_expert_only: - self.vlm.eval() - for params in self.vlm.parameters(): - params.requires_grad = False - else: - # To avoid unused params issue with distributed training - last_layers = [self.num_vlm_layers - 1] - if ( - self.num_vlm_layers != self.num_expert_layers - and self.num_vlm_layers % self.num_expert_layers == 0 - ): - last_layers.append(self.num_vlm_layers - 2) - frozen_layers = [ - "lm_head", - "text_model.model.norm.weight", - ] - for layer in last_layers: - frozen_layers.append(f"text_model.model.layers.{layer}.") - - for name, params in self.vlm.named_parameters(): - if any([k in name for k in frozen_layers]): - params.requires_grad = False - # To avoid unused params issue with distributed training - for name, params in self.lm_expert.named_parameters(): - if any( - [ - k in name - for k in [ - "lm_head", - ] - ] - ): - params.requires_grad = False - - def train(self, mode: bool = True): - super().train(mode) - - if self.freeze_vision_encoder: - self.get_vlm_model().vision_model.eval() - - if self.train_expert_only: - self.vlm.eval() - - # def to_bfloat16_like_physical_intelligence(self): - # self.vlm = self.vlm.to(dtype=torch.bfloat16) - - # params_to_change_dtype = [ - # "language_model.model.layers", - # "gemma_expert.model.layers", - # "vision_tower", - # "multi_modal", - # ] - # for name, param in self.named_parameters(): - # if any(selector in name for selector in params_to_change_dtype): - # param.data = param.data.to(dtype=torch.bfloat16) - - def embed_image(self, image: torch.Tensor): - patch_attention_mask = None - # # FIXME(mshukor): probably not needed as we don't have padded images here - # pixel_values = image.unsqueeze(1) - # batch_size, num_images, num_channels, height, width = pixel_values.shape - # pixel_values = pixel_values - # pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:]) - - # # Remove padding images - padding images are full 0. - # nb_values_per_image = pixel_values.shape[1:].numel() - # real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image - - # if not any(real_images_inds): - # # no images, leave one empty image. - # real_images_inds[0] = True - - # pixel_values = pixel_values[real_images_inds].contiguous() - - # # Handle the vision attention mask - - # pixel_attention_mask = torch.ones( - # size=[pixel_values.shape[i] for i in (0, 2, 3)], - # dtype=torch.bool, - # device=pixel_values.device, - # ) - - # patch_size = self.vlm.config.vision_config.patch_size - # patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size) - # patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) - # patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - - # FIXME(mshukor): add special image tokens specific to smolvlm - # Get sequence from the vision encoder - image_hidden_states = ( - self.get_vlm_model() - .vision_model( - pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype), - patch_attention_mask=patch_attention_mask, - ) - .last_hidden_state - ) - # Modality projection & resampling - image_hidden_states = self.get_vlm_model().connector(image_hidden_states) - return image_hidden_states - - def embed_language_tokens(self, tokens: torch.Tensor): - return self.get_vlm_model().text_model.get_input_embeddings()(tokens) - - def forward_attn_layer( - self, - model_layers, - inputs_embeds, - layer_idx, - position_ids, - attention_mask, - batch_size, - head_dim, - use_cache: bool = True, - fill_kv_cache: bool = True, - past_key_values=None, - ) -> list[torch.Tensor]: - query_states = [] - key_states = [] - value_states = [] - for i, hidden_states in enumerate(inputs_embeds): - layer = model_layers[i][layer_idx] - if hidden_states is None or layer is None: - continue - - # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) - # hidden_states = hidden_states * normalizer - hidden_states = layer.input_layernorm(hidden_states) - - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) - - hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) - query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) - key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) - value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) - - query_states.append(query_state) - key_states.append(key_state) - value_states.append(value_state) - - # FIXME(mshukor): self attention always when having only the prefix - # B,L,H,D with L sequence length, H number of heads, D head dim - # concatenate on the number of embeddings/tokens - query_states = torch.cat(query_states, dim=1) - key_states = torch.cat(key_states, dim=1) - value_states = torch.cat(value_states, dim=1) - # FIXME(mshukor): seq should be B, H, L, D ? - seq_len = query_states.shape[1] - if seq_len < position_ids.shape[1]: - _position_ids = position_ids[:, :seq_len] - _attention_mask = attention_mask[:, :seq_len, :seq_len] - else: - _position_ids = position_ids - _attention_mask = attention_mask - - if self.self_attn_only_actions: - attention_mask_ = _attention_mask.clone() - position_ids_ = _position_ids.clone() - if inputs_embeds[1] is not None: - suffix_len = inputs_embeds[1].shape[1] - attention_mask_[:, -suffix_len:, :-suffix_len] = False - position_ids_[:, -suffix_len:] = ( - _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None] - ) - else: - attention_mask_ = _attention_mask - position_ids_ = _position_ids - - query_states = apply_rope( - query_states, position_ids_ - ) # FIXME(mshukor): this assumes we have always the vlm features? - key_states = apply_rope(key_states, position_ids_) - - if use_cache and past_key_values is None: - past_key_values = {} - - if use_cache: - if fill_kv_cache: - past_key_values[layer_idx] = { - "key_states": key_states, - "value_states": value_states, - } - else: - # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. - # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach - # the max len, then we (for instance) double the cache size. This implementation already exists - # in `transformers`. (molbap) - key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) - value_states = torch.cat([past_key_values[layer_idx]["value_states"], value_states], dim=1) - - attention_interface = self.get_attention_interface() - - att_output = attention_interface( - attention_mask_, batch_size, head_dim, query_states, key_states, value_states - ) - # att_output = att_output.to(dtype=models[i].dtype) - - return [att_output], past_key_values - - def forward_cross_attn_layer( - self, - model_layers, - inputs_embeds, - layer_idx, - position_ids, - attention_mask, - batch_size, - head_dim, - use_cache: bool = True, - fill_kv_cache: bool = True, - past_key_values=None, - ) -> list[torch.Tensor]: - attention_interface = self.get_attention_interface() - - att_outputs = [] - assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), ( - f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}" - ) - - if len(inputs_embeds) == 2 and not past_key_values: - # Prefix attention - seq_len = inputs_embeds[0].shape[1] - position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:] - prefix_attention_mask = attention_mask[:, :seq_len, :seq_len] - - layer = model_layers[0][layer_idx] - - hidden_states = layer.input_layernorm(inputs_embeds[0]) - - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) - - hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) - query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) - key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) - value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape) - - # B,L,H,D with L sequence length, H number of heads, D head dim - query_states = apply_rope(query_state, position_id) - key_states = apply_rope(key_state, position_id) - - att_output = attention_interface( - prefix_attention_mask, batch_size, head_dim, query_states, key_states, value_states - ) - att_outputs.append(att_output) - else: - expert_position_id = position_ids - - if use_cache and past_key_values is None: - past_key_values = {} - - if use_cache: - if fill_kv_cache: - past_key_values[layer_idx] = { - "key_states": key_states, - "value_states": value_states, - } - else: - # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. - # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach - # the max len, then we (for instance) double the cache size. This implementation already exists - # in `transformers`. (molbap) - key_states = past_key_values[layer_idx]["key_states"] - value_states = past_key_values[layer_idx]["value_states"] - - # Expert - expert_layer = model_layers[1][layer_idx] - if expert_layer is not None: - expert_hidden_states = expert_layer.input_layernorm(inputs_embeds[1]) - - expert_input_shape = expert_hidden_states.shape[:-1] - expert_hidden_shape = (*expert_input_shape, -1, expert_layer.self_attn.head_dim) - - expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype) - expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape) - - _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view( - *key_states.shape[:2], -1 - ) - expert_key_states = expert_layer.self_attn.k_proj(_key_states).view( - *_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim - ) # k_proj should have same dim as kv - - _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view( - *value_states.shape[:2], -1 - ) - expert_value_states = expert_layer.self_attn.v_proj(_value_states).view( - *_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim - ) - - expert_position_id = ( - expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values - ) # start from 0 - expert_attention_mask = attention_mask[ - :, -inputs_embeds[1].shape[1] :, : expert_key_states.shape[1] : - ] # take into account kv - - expert_query_states = apply_rope(expert_query_state, expert_position_id) - # expert_key_states = apply_rope(expert_key_state, expert_position_id) - - att_output = attention_interface( - expert_attention_mask, - batch_size, - head_dim, - expert_query_states, - expert_key_states, - expert_value_states, - ) - att_outputs.append(att_output) - else: - att_outputs.append(None) - - # att_output = att_output.to(dtype=models[i].dtype) - return att_outputs, past_key_values - - def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient? - vlm_layers = [] - expert_layers = [] - multiple_of = self.num_vlm_layers // self.num_expert_layers - for i in range(self.num_vlm_layers): - if multiple_of > 0 and i > 0 and i % multiple_of != 0: - expert_layer = None - else: - expert_layer_index = i // multiple_of if multiple_of > 0 else i - expert_layer = models[1].layers[expert_layer_index] - vlm_layers.append(models[0].layers[i]) - expert_layers.append(expert_layer) - return [vlm_layers, expert_layers] - - # TODO: break down this huge forward into modules or functions - def forward( - self, - attention_mask: torch.Tensor | None = None, - position_ids: torch.LongTensor | None = None, - past_key_values: list[torch.FloatTensor] | Cache | None = None, - inputs_embeds: list[torch.FloatTensor] = None, - use_cache: bool | None = None, - fill_kv_cache: bool | None = None, - ): - models = [self.get_vlm_model().text_model, self.lm_expert] - model_layers = self.get_model_layers(models) - for hidden_states in inputs_embeds: - # TODO this is very inefficient - # dtype is always the same, batch size too (if > 1 len) - # device could be trickier in multi gpu edge cases but that's it - if hidden_states is None: - continue - batch_size = hidden_states.shape[0] - - # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train - if self.attention_implementation == "flex": - if ( - inputs_embeds[0] is not None - and inputs_embeds[1] is not None - and attention_mask.shape[-1] == attention_mask.shape[-2] - and past_key_values is None - ): # Now only during training - seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1] - padded_seq_len = _round_up_to_multiple( - seq_len, 128 - ) # FIXME(mshukor): more efficient to have a fixed seq len? - b_mask, q_len, kv_len = attention_mask.shape # The shape of your mask - pad = padded_seq_len - q_len - attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True) - inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0) - position_ids = F.pad(position_ids, (0, pad), value=0) - - # RMSNorm - num_layers = self.num_vlm_layers - head_dim = self.vlm.config.text_config.head_dim - for layer_idx in range(num_layers): - if ( - fill_kv_cache - or "cross" not in self.attention_mode - or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0) - ): - att_outputs, past_key_values = self.forward_attn_layer( - model_layers, - inputs_embeds, - layer_idx, - position_ids, - attention_mask, - batch_size, - head_dim, - use_cache=use_cache, - fill_kv_cache=fill_kv_cache, - past_key_values=past_key_values, - ) - else: - att_outputs, past_key_values = self.forward_cross_attn_layer( - model_layers, - inputs_embeds, - layer_idx, - position_ids, - attention_mask, - batch_size, - head_dim, - use_cache=use_cache, - fill_kv_cache=fill_kv_cache, - past_key_values=past_key_values, - ) - # query_states = [] - # key_states = [] - # value_states = [] - # for i, hidden_states in enumerate(inputs_embeds): - # if hidden_states is None: - # continue - # layer = models[i].layers[layer_idx] - # # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype) - # # hidden_states = hidden_states * normalizer - # hidden_states = layer.input_layernorm(hidden_states) - - # input_shape = hidden_states.shape[:-1] - # hidden_shape = (*input_shape, -1, layer.self_attn.head_dim) - - # hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype) - # query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape) - # key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape) - # value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape) - - # query_states.append(query_state) - # key_states.append(key_state) - # value_states.append(value_state) - - # # FIXME(mshukor): self attention always when having only the prefix - # # B,L,H,D with L sequence length, H number of heads, D head dim - # # concatenate on the number of embeddings/tokens - # query_states = torch.cat(query_states, dim=1) - # key_states = torch.cat(key_states, dim=1) - # value_states = torch.cat(value_states, dim=1) - # # FIXME(mshukor): seq should be B, H, L, D ? - # query_states = apply_rope(query_states, position_ids) - # key_states = apply_rope(key_states, position_ids) - - # if use_cache and past_key_values is None: - # past_key_values = {} - - # if use_cache: - # if fill_kv_cache: - # past_key_values[layer_idx] = { - # "key_states": key_states, - # "value_states": value_states, - # } - # else: - # # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before. - # # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach - # # the max len, then we (for instance) double the cache size. This implementation already exists - # # in `transformers`. (molbap) - # key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1) - # value_states = torch.cat( - # [past_key_values[layer_idx]["value_states"], value_states], dim=1 - # ) - - # attention_interface = self.get_attention_interface() - # att_output = attention_interface( - # attention_mask, batch_size, head_dim, query_states, key_states, value_states - # ) - - # att_output = att_output.to(dtype=models[i].dtype) - - # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len]) - outputs_embeds = [] - start = 0 - for i, hidden_states in enumerate(inputs_embeds): - # layer = models[i].layers[layer_idx] - layer = model_layers[i][layer_idx] - att_output = ( - att_outputs[i] if i < len(att_outputs) else att_outputs[0] - ) # in case of self_attn - if hidden_states is not None: - if layer is None: - outputs_embeds.append(hidden_states) - continue - end = start + hidden_states.shape[1] - - if att_output.dtype != layer.self_attn.o_proj.weight.dtype: - att_output = att_output.to(layer.self_attn.o_proj.weight.dtype) - att_out = att_output[:, start:end] - out_emb = layer.self_attn.o_proj(att_out) - - # TODO: first dropout (by default 0.0) - # first residual - out_emb += hidden_states - after_first_residual = out_emb.clone() - - out_emb = layer.post_attention_layernorm(out_emb) - out_emb = layer.mlp(out_emb) - - # TODO: second dropout (by default 0.0) - - # second residual - out_emb += after_first_residual - - outputs_embeds.append(out_emb) - - start = end if len(att_outputs) == 1 else 0 - else: - outputs_embeds.append(None) - - inputs_embeds = outputs_embeds - - # final norm - outputs_embeds = [] - for i, hidden_states in enumerate(inputs_embeds): - if hidden_states is not None: - out_emb = models[i].norm(hidden_states) - outputs_embeds.append(out_emb) - else: - outputs_embeds.append(None) - return outputs_embeds, past_key_values - - def get_attention_interface(self): - if self.attention_implementation == "fa2": - attention_interface = self.flash_attention_forward - elif self.attention_implementation == "flex": - attention_interface = partial( - flex_attention_forward, - num_att_heads=self.num_attention_heads, - num_key_value_heads=self.num_key_value_heads, - ) - else: - attention_interface = self.eager_attention_forward - return attention_interface - - def flash_attention_forward( - self, attention_mask, batch_size, head_dim, query_states, key_states, value_states - ): - raise NotImplementedError("FA2 is not implemented (yet)") - - def eager_attention_forward( - self, attention_mask, batch_size, head_dim, query_states, key_states, value_states - ): - num_att_heads = self.num_attention_heads - num_key_value_heads = self.num_key_value_heads - num_key_value_groups = num_att_heads // num_key_value_heads - - # query_states: batch_size, sequence_length, num_att_head, head_dim - # key_states: batch_size, sequence_length, num_key_value_head, head_dim - # value_states: batch_size, sequence_length, num_key_value_head, head_dim - sequence_length = key_states.shape[1] - - key_states = key_states[:, :, :, None, :].expand( - batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim - ) - key_states = key_states.reshape( - batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim - ) - - value_states = value_states[:, :, :, None, :].expand( - batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim - ) - value_states = value_states.reshape( - batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim - ) - - # Attention here is upcasted to float32 to match the original eager implementation. - - query_states = query_states.to(dtype=torch.float32) - key_states = key_states.to(dtype=torch.float32) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - - att_weights = torch.matmul(query_states, key_states.transpose(2, 3)) - att_weights *= head_dim**-0.5 - - att_weights = att_weights.to(dtype=torch.float32) - big_neg = torch.finfo(att_weights.dtype).min # -2.3819763e38 # See gemma/modules.py - masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg) - probs = nn.functional.softmax(masked_att_weights, dim=-1) - probs = probs.to(dtype=value_states.dtype) - - # probs: batch_size, num_key_value_head, num_att_head, sequence_length, sequence_length - # value_states: batch_size, sequence_length, num_att_heads, head_dim - - att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3)) - - att_output = att_output.permute(0, 2, 1, 3) - # we use -1 because sequence length can change - att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim) - - return att_output diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 9bb22d7f7..6bf956aa3 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -1,955 +1,3 @@ -# #!/usr/bin/env python - -# # Copyright 2025 HuggingFace Inc. team. All rights reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# """ -# SmolVLA: - -# [Paper](https://huggingface.co/papers/2506.01844) - -# Designed by Hugging Face. - -# Install smolvla extra dependencies: -# ```bash -# pip install -e ".[smolvla]" -# ``` - -# Example of finetuning the smolvla pretrained model (`smolvla_base`): -# ```bash -# lerobot-train \ -# --policy.path=lerobot/smolvla_base \ -# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ -# --batch_size=64 \ -# --steps=200000 -# ``` - -# Example of finetuning a smolVLA. SmolVLA is composed of a pretrained VLM, -# and an action expert. -# ```bash -# lerobot-train \ -# --policy.type=smolvla \ -# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \ -# --batch_size=64 \ -# --steps=200000 -# ``` - -# Example of using the smolvla pretrained model outside LeRobot training framework: -# ```python -# policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base") -# ``` - -# """ - -# import math -# import os -# import re -# from collections import deque - -# import safetensors -# import torch -# import torch.nn.functional as F # noqa: N812 -# from torch import Tensor, nn -# from transformers import AutoProcessor - -# from lerobot.constants import ACTION -# from lerobot.policies.normalize import ( -# Normalize, -# Unnormalize, -# ) -# from lerobot.policies.pretrained import PreTrainedPolicy -# from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig -# from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel -# from lerobot.policies.utils import ( -# populate_queues, -# ) -# from lerobot.utils.utils import get_safe_dtype -# OBS_STATE = 'state' -# ACTION = 'actions' -# # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker -# _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") - - -# def canonicalise(k: str) -> str: -# """ -# Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a -# normalisation-buffer key. -# """ -# return _VARIANT_RE.sub(".buffer_", k) - - -# def standardise_state_dict( -# checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True -# ) -> tuple[dict[str, torch.Tensor], list[str]]: -# """ -# β€’ Re-keys `checkpoint ` so that every entry matches the *reference* key set. -# β€’ If several variant keys collapse to the same canonical name we keep the -# first one and log the collision. -# β€’ Returns the new dict + a list of entries that could not be matched. -# """ -# out, collisions, unmatched = {}, {}, [] - -# for k, v in checkpoint.items(): -# canon = canonicalise(k) -# if canon in ref_keys: -# if canon in out: # duplicate after collapsing -# collisions.setdefault(canon, []).append(k) -# else: -# out[canon] = v -# else: -# unmatched.append(k) - -# if verbose: -# for canon, variants in collisions.items(): -# print(f"[standardise_state_dict] '{canon}' ← {variants}") -# if unmatched: -# print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys") - -# out.update({k: checkpoint[k] for k in unmatched}) -# return out, unmatched - - -# def rename_checkpoint_keys(checkpoint: dict, rename_str: str): -# """ -# Renames keys in a checkpoint dictionary based on the given rename string. - -# Args: -# checkpoint (dict): The checkpoint dictionary. -# rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2". - -# Returns: -# dict: The modified checkpoint with renamed keys. -# """ - -# rename_dict = dict(pair.split("//") for pair in rename_str.split(",")) - -# new_checkpoint = {} -# for k, v in checkpoint.items(): -# for old_key, new_key in rename_dict.items(): -# if old_key in k: -# k = k.replace(old_key, new_key) -# new_checkpoint[k] = v -# return new_checkpoint - - -# def load_smolvla( -# model: torch.nn.Module, -# filename: str | os.PathLike, -# *, -# device: str = "cpu", -# checkpoint_keys_mapping: str = "", -# ) -> torch.nn.Module: -# state_dict = safetensors.torch.load_file(filename, device=device) - -# # Optional user-supplied renames (e.g. "model._orig_mod.//model.") -# if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping: -# state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping) - -# state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys())) - -# # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset -# norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs") -# state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)} - -# missing, unexpected = model.load_state_dict(state_dict, strict=False) - -# if not all(key.startswith(norm_keys) for key in missing) or unexpected: -# raise RuntimeError( -# "SmolVLA %d missing / %d unexpected keys", -# len(missing), -# len(unexpected), -# ) - -# return model - - -# def create_sinusoidal_pos_embedding( -# time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu" -# ) -> Tensor: -# """Computes sine-cosine positional embedding vectors for scalar positions.""" -# if dimension % 2 != 0: -# raise ValueError(f"dimension ({dimension}) must be divisible by 2") - -# if time.ndim != 1: -# raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.") - -# dtype = get_safe_dtype(torch.float64, device.type) -# fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device) -# period = min_period * (max_period / min_period) ** fraction - -# # Compute the outer product -# scaling_factor = 1.0 / period * 2 * math.pi -# sin_input = scaling_factor[None, :] * time[:, None] -# pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1) -# return pos_emb - - -# def make_att_2d_masks(pad_masks, att_masks): -# """Copied from big_vision. - -# Tokens can attend to valid inputs tokens which have a cumulative mask_ar -# smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to -# setup several types of attention, for example: - -# [[1 1 1 1 1 1]]: pure causal attention. - -# [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between -# themselves and the last 3 tokens have a causal attention. The first -# entry could also be a 1 without changing behaviour. - -# [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a -# block can attend all previous blocks and all tokens on the same block. - -# Args: -# input_mask: bool[B, N] true if its part of the input, false if padding. -# mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on -# it and 0 where it shares the same attention mask as the previous token. -# """ -# if att_masks.ndim != 2: -# raise ValueError(att_masks.ndim) -# if pad_masks.ndim != 2: -# raise ValueError(pad_masks.ndim) - -# cumsum = torch.cumsum(att_masks, dim=1) -# att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None] -# pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None] -# att_2d_masks = att_2d_masks & pad_2d_masks -# return att_2d_masks - - -# def resize_with_pad(img, width, height, pad_value=-1): -# # assume no-op when width height fits already -# if img.ndim != 4: -# raise ValueError(f"(b,c,h,w) expected, but {img.shape}") - -# cur_height, cur_width = img.shape[2:] - -# ratio = max(cur_width / width, cur_height / height) -# resized_height = int(cur_height / ratio) -# resized_width = int(cur_width / ratio) -# resized_img = F.interpolate( -# img, size=(resized_height, resized_width), mode="bilinear", align_corners=False -# ) - -# pad_height = max(0, int(height - resized_height)) -# pad_width = max(0, int(width - resized_width)) - -# # pad on left and top of image -# padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value) -# return padded_img - - -# def pad_vector(vector, new_dim): -# """Can be (batch_size x sequence_length x features_dimension) -# or (batch_size x features_dimension) -# """ -# if vector.shape[-1] == new_dim: -# return vector -# shape = list(vector.shape) -# current_dim = shape[-1] -# shape[-1] = new_dim -# new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device) -# new_vector[..., :current_dim] = vector -# return new_vector - - -# def normalize(x, min_val, max_val): -# return (x - min_val) / (max_val - min_val) - - -# def unnormalize(x, min_val, max_val): -# return x * (max_val - min_val) + min_val - - -# def safe_arcsin(value): -# # This ensures that the input stays within -# # [βˆ’1,1] to avoid invalid values for arcsin -# return torch.arcsin(torch.clamp(value, -1.0, 1.0)) - - -# def aloha_gripper_to_angular(value): -# # Aloha transforms the gripper positions into a linear space. The following code -# # reverses this transformation to be consistent with smolvla which is pretrained in -# # angular space. -# # -# # These values are coming from the Aloha code: -# # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED -# value = unnormalize(value, min_val=0.01844, max_val=0.05800) - -# # This is the inverse of the angular to linear transformation inside the Interbotix code. -# def linear_to_radian(linear_position, arm_length, horn_radius): -# value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position) -# return safe_arcsin(value) - -# # The constants are taken from the Interbotix code. -# value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022) - -# # Normalize to [0, 1]. -# # The values 0.4 and 1.5 were measured on an actual Trossen robot. -# return normalize(value, min_val=0.4, max_val=1.5) - - -# def aloha_gripper_from_angular(value): -# # Convert from the gripper position used by smolvla to the gripper position that is used by Aloha. -# # Note that the units are still angular but the range is different. - -# # The values 0.4 and 1.5 were measured on an actual Trossen robot. -# value = unnormalize(value, min_val=0.4, max_val=1.5) - -# # These values are coming from the Aloha code: -# # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE -# return normalize(value, min_val=-0.6213, max_val=1.4910) - - -# def aloha_gripper_from_angular_inv(value): -# # Directly inverts the gripper_from_angular function. -# value = unnormalize(value, min_val=-0.6213, max_val=1.4910) -# return normalize(value, min_val=0.4, max_val=1.5) - - -# class SmolVLAPolicy(PreTrainedPolicy): -# """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot.""" - -# config_class = SmolVLAConfig -# name = "smolvla" - -# def __init__( -# self, -# config: SmolVLAConfig, -# dataset_stats: dict[str, dict[str, Tensor]] | None = None, -# ): -# """ -# Args: -# config: Policy configuration class instance or None, in which case the default instantiation of -# the configuration class is used. -# dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected -# that they will be passed with a call to `load_state_dict` before the policy is used. -# """ - -# super().__init__(config) -# config.validate_features() -# self.config = config -# self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats) -# self.normalize_targets = Normalize( -# config.output_features, config.normalization_mapping, dataset_stats -# ) -# self.unnormalize_outputs = Unnormalize( -# config.output_features, config.normalization_mapping, dataset_stats -# ) - -# self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer -# self.model = VLAFlowMatching(config) -# self.reset() - -# def reset(self): -# """This should be called whenever the environment is reset.""" -# self._queues = { -# ACTION: deque(maxlen=self.config.n_action_steps), -# } - -# # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues -# @classmethod -# def _load_as_safetensor( -# cls, -# model: "SmolVLAPolicy", -# model_file: str, -# map_location: str, -# strict: bool, -# ): -# safetensors.torch.load_model(model, model_file, strict=strict, device=map_location) -# return load_smolvla( -# model, -# model_file, -# device=map_location, -# checkpoint_keys_mapping="model._orig_mod.//model.", -# ) - -# def get_optim_params(self) -> dict: -# return self.parameters() - -# def _get_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: -# # TODO: Check if this for loop is needed. -# # Context: In fact, self.queues contains only ACTION field, and in inference, we don't have action in the batch -# # In the case of offline inference, we have the action in the batch -# # that why without the k != ACTION check, it will raise an error because we are trying to stack -# # on an empty container. -# for k in batch: -# if k in self._queues and k != ACTION: -# batch[k] = torch.stack(list(self._queues[k]), dim=1) - -# images, img_masks = self.prepare_images(batch) -# state = self.prepare_state(batch) -# lang_tokens, lang_masks = self.prepare_language(batch) - -# actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise) - -# # Unpad actions -# original_action_dim = self.config.action_feature.shape[0] -# actions = actions[:, :, :original_action_dim] - -# actions = self.unnormalize_outputs({ACTION: actions})[ACTION] - -# if self.config.adapt_to_pi_aloha: -# actions = self._pi_aloha_encode_actions(actions) - -# return actions - -# def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: -# if self.config.adapt_to_pi_aloha: -# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) - -# batch = self.normalize_inputs(batch) - -# return batch - -# @torch.no_grad() -# def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: -# self.eval() - -# batch = self._prepare_batch(batch) -# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) - -# actions = self._get_action_chunk(batch, noise) -# return actions - -# @torch.no_grad() -# def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor: -# """Select a single action given environment observations. - -# This method wraps `select_actions` in order to return one action at a time for execution in the -# environment. It works by managing the actions in a queue and only calling `select_actions` when the -# queue is empty. -# """ -# self.eval() -# batch = self._prepare_batch(batch) -# self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION]) - -# # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by -# # querying the policy. -# if len(self._queues[ACTION]) == 0: -# actions = self._get_action_chunk(batch, noise) - -# # `self.predict_action_chunk` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue -# # effectively has shape (n_action_steps, batch_size, *), hence the transpose. -# self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps]) - -# return self._queues[ACTION].popleft() - -# def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]: -# """Do a full training forward pass to compute the loss""" -# if self.config.adapt_to_pi_aloha: -# batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE]) -# batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION]) -# batch = self.normalize_inputs(batch) -# batch = self.normalize_targets(batch) -# images, img_masks = self.prepare_images(batch) -# state = self.prepare_state(batch) -# lang_tokens, lang_masks = self.prepare_language(batch) -# actions = self.prepare_action(batch) -# actions_is_pad = batch.get("actions_id_pad") -# loss_dict = {} -# losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time) -# loss_dict["losses_after_forward"] = losses.clone() - -# if actions_is_pad is not None: -# in_episode_bound = ~actions_is_pad -# losses = losses * in_episode_bound.unsqueeze(-1) -# loss_dict["losses_after_in_ep_bound"] = losses.clone() - -# # Remove padding -# losses = losses[:, :, : self.config.max_action_dim] -# loss_dict["losses_after_rm_padding"] = losses.clone() - -# # For backward pass -# loss = losses.mean() -# # For backward pass -# loss_dict["loss"] = loss.item() -# return loss, loss_dict - -# def prepare_images(self, batch): -# """Apply SmolVLA preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and -# convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP. -# """ -# images = [] -# img_masks = [] -# present_img_keys = [key for key in self.config.image_features if key in batch] -# missing_img_keys = [key for key in self.config.image_features if key not in batch] - -# if len(present_img_keys) == 0: -# raise ValueError( -# f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})" -# ) -# # Preprocess image features present in the batch -# for key in present_img_keys: -# img = batch[key][:, -1, :, :, :] if batch[key].ndim == 5 else batch[key] -# if self.config.resize_imgs_with_padding is not None: -# img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0) - -# # Normalize from range [0,1] to [-1,1] as expacted by siglip -# img = img * 2.0 - 1.0 - -# bsize = img.shape[0] -# device = img.device -# if f"{key}_padding_mask" in batch: -# mask = batch[f"{key}_padding_mask"].bool() -# else: -# mask = torch.ones(bsize, dtype=torch.bool, device=device) -# images.append(img) -# img_masks.append(mask) - -# # Create image features not present in the batch -# # as fully 0 padded images. -# for num_empty_cameras in range(len(missing_img_keys)): -# if num_empty_cameras >= self.config.empty_cameras: -# break -# img = torch.ones_like(img) * -1 -# mask = torch.zeros_like(mask) -# images.append(img) -# img_masks.append(mask) -# return images, img_masks - -# def prepare_language(self, batch) -> tuple[Tensor, Tensor]: -# """Tokenize the text input""" -# device = batch[OBS_STATE].device -# tasks = batch["task"] -# if isinstance(tasks, str): -# tasks = [tasks] - -# if len(tasks) == 1: -# tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])] - -# tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks] - -# tokenized_prompt = self.language_tokenizer.__call__( -# tasks, -# padding=self.config.pad_language_to, -# padding_side="right", -# max_length=self.config.tokenizer_max_length, -# return_tensors="pt", -# ) -# lang_tokens = tokenized_prompt["input_ids"].to(device=device) -# lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool) - -# return lang_tokens, lang_masks - -# def _pi_aloha_decode_state(self, state): -# # Flip the joints. -# for motor_idx in [1, 2, 8, 9]: -# state[:, motor_idx] *= -1 -# # Reverse the gripper transformation that is being applied by the Aloha runtime. -# for motor_idx in [6, 13]: -# state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx]) -# return state - -# def _pi_aloha_encode_actions(self, actions): -# # Flip the joints. -# for motor_idx in [1, 2, 8, 9]: -# actions[:, :, motor_idx] *= -1 -# # Reverse the gripper transformation that is being applied by the Aloha runtime. -# for motor_idx in [6, 13]: -# actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx]) -# return actions - -# def _pi_aloha_encode_actions_inv(self, actions): -# # Flip the joints again. -# for motor_idx in [1, 2, 8, 9]: -# actions[:, :, motor_idx] *= -1 -# # Reverse the gripper transformation that is being applied by the Aloha runtime. -# for motor_idx in [6, 13]: -# actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx]) -# return actions - -# def prepare_state(self, batch): -# """Pad state""" -# state = batch[OBS_STATE][:, -1, :] if batch[OBS_STATE].ndim > 2 else batch[OBS_STATE] -# state = pad_vector(state, self.config.max_state_dim) -# return state - -# def prepare_action(self, batch): -# """Pad action""" -# actions = pad_vector(batch[ACTION], self.config.max_action_dim) -# return actions - - -# def pad_tensor(tensor, max_len, pad_value=0): -# """ -# Efficiently pads a tensor along sequence dimension to match max_len. - -# Args: -# tensor (torch.Tensor): Shape (B, L, ...) or (B, L). -# max_len (int): Fixed sequence length. -# pad_value (int/float): Value for padding. - -# Returns: -# torch.Tensor: Shape (B, max_len, ...) or (B, max_len). -# """ -# b, d = tensor.shape[:2] - -# # Create a padded tensor of max_len and copy the existing values -# padded_tensor = torch.full( -# (b, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device -# ) -# padded_tensor[:, :d] = tensor # Efficient in-place copy - -# return padded_tensor - - -# class VLAFlowMatching(nn.Module): -# """ -# SmolVLA - -# [Paper]() - -# Designed by Hugging Face. -# β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -# β”‚ actions β”‚ -# β”‚ β–² β”‚ -# β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€|────┐ β”‚ -# β”‚ | │────► β”‚ β”‚ β”‚ -# β”‚ | β”‚ kv β”‚ β”‚ β”‚ -# β”‚ | │────► β”‚Actionβ”‚ β”‚ -# β”‚ | VLM β”‚cache β”‚Expertβ”‚ | -# β”‚ β”‚ │────► | β”‚ β”‚ -# β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ -# β”‚ β””β–²β”€β”€β–²β”€β”€β”€β–²β”€β”˜ β””β”€β”€β”€β–²β”€β”€β”˜ | -# β”‚ β”‚ | | β”‚ | -# β”‚ | | | noise β”‚ -# β”‚ β”‚ β”‚ state β”‚ -# β”‚ β”‚ language tokens β”‚ -# β”‚ image(s) β”‚ -# β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -# """ - -# def __init__(self, config: SmolVLAConfig): -# super().__init__() -# self.config = config - -# self.vlm_with_expert = SmolVLMWithExpertModel( -# model_id=self.config.vlm_model_name, -# freeze_vision_encoder=self.config.freeze_vision_encoder, -# train_expert_only=self.config.train_expert_only, -# load_vlm_weights=self.config.load_vlm_weights, -# attention_mode=self.config.attention_mode, -# num_expert_layers=self.config.num_expert_layers, -# num_vlm_layers=self.config.num_vlm_layers, -# self_attn_every_n_layers=self.config.self_attn_every_n_layers, -# expert_width_multiplier=self.config.expert_width_multiplier, -# ) -# self.state_proj = nn.Linear( -# self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size -# ) -# self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size) -# self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim) - -# self.action_time_mlp_in = nn.Linear( -# self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size -# ) -# self.action_time_mlp_out = nn.Linear( -# self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size -# ) - -# self.set_requires_grad() -# self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id -# self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id -# self.global_image_start_token = torch.tensor( -# [self.fake_image_token, self.global_image_token], dtype=torch.long -# ) - -# self.add_image_special_tokens = self.config.add_image_special_tokens -# self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long) -# self.prefix_length = self.config.prefix_length - -# def set_requires_grad(self): -# for params in self.state_proj.parameters(): -# params.requires_grad = self.config.train_state_proj - -# def sample_noise(self, shape, device): -# noise = torch.normal( -# mean=0.0, -# std=1.0, -# size=shape, -# dtype=torch.float32, -# device=device, -# ) -# return noise - -# def sample_time(self, bsize, device): -# beta_dist = torch.distributions.Beta(concentration1=1.5, concentration0=1.0) -# time_beta = beta_dist.sample((bsize,)).to(device=device, dtype=torch.float32) -# time = time_beta * 0.999 + 0.001 -# return time - -# def embed_prefix( -# self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None -# ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: -# """Embed images with SigLIP and language tokens with embedding layer to prepare -# for SmolVLM transformer processing. -# """ -# embs = [] -# pad_masks = [] -# att_masks = [] -# for _img_idx, ( -# img, -# img_mask, -# ) in enumerate(zip(images, img_masks, strict=False)): -# if self.add_image_special_tokens: -# image_start_token = ( -# self.vlm_with_expert.embed_language_tokens( -# self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device) -# ) -# .unsqueeze(0) -# .expand(img.shape[0], -1, -1) -# ) -# image_start_mask = torch.ones_like( -# image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device -# ) -# att_masks += [0] * (image_start_mask.shape[-1]) -# embs.append(image_start_token) -# pad_masks.append(image_start_mask) - -# img_emb = self.vlm_with_expert.embed_image(img) -# img_emb = img_emb - -# # Normalize image embeddings -# img_emb_dim = img_emb.shape[-1] -# img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device) - -# bsize, num_img_embs = img_emb.shape[:2] -# img_mask = img_mask[:, None].expand(bsize, num_img_embs) - -# embs.append(img_emb) -# pad_masks.append(img_mask) - -# att_masks += [0] * (num_img_embs) -# if self.add_image_special_tokens: -# image_end_token = ( -# self.vlm_with_expert.embed_language_tokens( -# self.image_end_token.to(device=self.vlm_with_expert.vlm.device) -# ) -# .unsqueeze(0) -# .expand(img.shape[0], -1, -1) -# ) -# image_end_mask = torch.ones_like( -# image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device -# ) -# embs.append(image_end_token) -# pad_masks.append(image_end_mask) -# att_masks += [0] * (image_end_mask.shape[1]) -# lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens) -# # Normalize language embeddings -# lang_emb_dim = lang_emb.shape[-1] -# lang_emb = lang_emb * math.sqrt(lang_emb_dim) - -# embs.append(lang_emb) -# pad_masks.append(lang_masks) - -# num_lang_embs = lang_emb.shape[1] -# att_masks += [0] * num_lang_embs - -# state_emb = self.state_proj(state) -# state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb -# embs.append(state_emb) -# bsize = state_emb.shape[0] -# device = state_emb.device - -# states_seq_len = state_emb.shape[1] -# state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device) -# pad_masks.append(state_mask) - -# # Set attention masks so that image and language inputs do not attend to state or actions -# att_masks += [1] * (states_seq_len) -# embs = torch.cat(embs, dim=1) -# pad_masks = torch.cat(pad_masks, dim=1) -# att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device) -# att_masks = att_masks[None, :] - -# seq_len = pad_masks.shape[1] -# if seq_len < self.prefix_length: -# embs = pad_tensor(embs, self.prefix_length, pad_value=0) -# pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0) -# att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0) - -# att_masks = att_masks.expand(bsize, -1) - -# return embs, pad_masks, att_masks - -# def embed_suffix(self, noisy_actions, timestep): -# """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing.""" -# embs = [] -# pad_masks = [] -# att_masks = [] - -# # Fuse timestep + action information using an MLP -# action_emb = self.action_in_proj(noisy_actions) -# device = action_emb.device -# bsize = action_emb.shape[0] -# dtype = action_emb.dtype -# # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1] -# time_emb = create_sinusoidal_pos_embedding( -# timestep, -# self.vlm_with_expert.expert_hidden_size, -# self.config.min_period, -# self.config.max_period, -# device=device, -# ) -# time_emb = time_emb.type(dtype=dtype) - -# time_emb = time_emb[:, None, :].expand_as(action_emb) -# action_time_emb = torch.cat([action_emb, time_emb], dim=2) - -# action_time_emb = self.action_time_mlp_in(action_time_emb) -# action_time_emb = F.silu(action_time_emb) # swish == silu -# action_time_emb = self.action_time_mlp_out(action_time_emb) - -# # Add to input tokens -# embs.append(action_time_emb) - -# bsize, action_time_dim = action_time_emb.shape[:2] -# action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device) -# pad_masks.append(action_time_mask) - -# # Set attention masks so that image, language and state inputs do not attend to action tokens -# att_masks += [1] * self.config.chunk_size -# embs = torch.cat(embs, dim=1) -# pad_masks = torch.cat(pad_masks, dim=1) -# att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device) -# att_masks = att_masks[None, :].expand(bsize, len(att_masks)) -# # added by jade -# seq_len = pad_masks.shape[1] -# if seq_len < self.config.chunk_size: -# embs = pad_tensor(embs, self.config.chunk_size, pad_value=0) -# pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0) -# att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0) -# return embs, pad_masks, att_masks - -# def forward( -# self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None -# ) -> Tensor: -# """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)""" -# #added by jade -# if actions.ndim == 2: -# actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1) -# if noise is None: -# noise = self.sample_noise(actions.shape, actions.device) - -# if time is None: -# time = self.sample_time(actions.shape[0], actions.device) - -# time_expanded = time[:, None, None] -# x_t = time_expanded * noise + (1 - time_expanded) * actions -# u_t = noise - actions -# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( -# images, img_masks, lang_tokens, lang_masks, state=state -# ) -# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, time) - -# pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1) -# att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1) - -# att_2d_masks = make_att_2d_masks(pad_masks, att_masks) -# position_ids = torch.cumsum(pad_masks, dim=1) - 1 -# (_, suffix_out), _ = self.vlm_with_expert.forward( -# attention_mask=att_2d_masks, -# position_ids=position_ids, -# past_key_values=None, -# inputs_embeds=[prefix_embs, suffix_embs], -# use_cache=False, -# fill_kv_cache=False, -# ) -# # suffix_out = suffix_out[:, -self.config.chunk_size :] -# suffix_out = suffix_out[:, -self.config.chunk_size:, :] -# # Original openpi code, upcast attention output -# suffix_out = suffix_out.to(dtype=torch.float32) -# v_t = self.action_out_proj(suffix_out) -# losses = F.mse_loss(u_t, v_t, reduction="none") -# return losses - -# def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor: -# """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)""" -# bsize = state.shape[0] -# device = state.device - -# if noise is None: -# actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim) -# noise = self.sample_noise(actions_shape, device) - -# prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix( -# images, img_masks, lang_tokens, lang_masks, state=state -# ) -# prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks) -# prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1 -# # Compute image and language key value cache -# _, past_key_values = self.vlm_with_expert.forward( -# attention_mask=prefix_att_2d_masks, -# position_ids=prefix_position_ids, -# past_key_values=None, -# inputs_embeds=[prefix_embs, None], -# use_cache=self.config.use_cache, -# fill_kv_cache=True, -# ) -# dt = -1.0 / self.config.num_steps -# dt = torch.tensor(dt, dtype=torch.float32, device=device) - -# x_t = noise -# time = torch.tensor(1.0, dtype=torch.float32, device=device) -# while time >= -dt / 2: -# expanded_time = time.expand(bsize) -# v_t = self.denoise_step( -# prefix_pad_masks, -# past_key_values, -# x_t, -# expanded_time, -# ) -# # Euler step -# x_t += dt * v_t -# time += dt -# return x_t - -# def denoise_step( -# self, -# prefix_pad_masks, -# past_key_values, -# x_t, -# timestep, -# ): -# """Apply one denoising step of the noise `x_t` at a given timestep.""" -# suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, timestep) - -# suffix_len = suffix_pad_masks.shape[1] -# batch_size = prefix_pad_masks.shape[0] -# prefix_len = prefix_pad_masks.shape[1] -# prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len) - -# suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks) - -# full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2) -# prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] -# position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 - -# outputs_embeds, _ = self.vlm_with_expert.forward( -# attention_mask=full_att_2d_masks, -# position_ids=position_ids, -# past_key_values=past_key_values, -# inputs_embeds=[None, suffix_embs], -# use_cache=self.config.use_cache, -# fill_kv_cache=False, -# ) -# suffix_out = outputs_embeds[1] -# suffix_out = suffix_out[:, -self.config.chunk_size :] -# suffix_out = suffix_out.to(dtype=torch.float32) -# v_t = self.action_out_proj(suffix_out) -# return v_t #!/usr/bin/env python # Copyright 2025 HuggingFace Inc. team. All rights reserved. @@ -1028,7 +76,6 @@ from lerobot.policies.utils import ( ) from lerobot.utils.utils import get_safe_dtype -# OBS_STATE = 'state' # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_") @@ -1348,7 +395,6 @@ class SmolVLAPolicy(PreTrainedPolicy): # Unpad actions original_action_dim = self.config.action_feature.shape[0] - original_action_dim = 7 actions = actions[:, :, :original_action_dim] actions = self.unnormalize_outputs({ACTION: actions})[ACTION] @@ -1892,4 +938,4 @@ class VLAFlowMatching(nn.Module): suffix_out = suffix_out[:, -self.config.chunk_size :] suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) - return v_t + return v_t \ No newline at end of file diff --git a/src/lerobot/policies/smolvla/modeling_smolvla_v2.py b/src/lerobot/policies/smolvla/modeling_smolvla_v2.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt deleted file mode 100644 index f2ad6c76f..000000000 --- a/src/lerobot/policies/smolvla/saver.txt +++ /dev/null @@ -1 +0,0 @@ -c diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index b96fbb8a3..bd131f708 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -132,10 +132,6 @@ def rollout( # Reset the policy and environments. policy.reset() - # added by jade - # for k in list(policy.config.input_features.keys()): - # if k.startswith("observation.image"): - # policy.config.input_features["observation.images." + k.split("observation.", 1)[1]] = policy.config.input_features.pop(k) observation, info = env.reset(seed=seeds) if render_callback is not None: render_callback(env) @@ -171,26 +167,6 @@ def rollout( # Infer "task" from attributes of environments. # TODO: works with SyncVectorEnv but not AsyncVectorEnv observation = add_envs_task(env, observation) - # breakpoint() - # observation = { - # k.replace("observation.images.", "observation.") if k.startswith("observation.images.") else k: v - # for k, v in observation.items() - # # } - # if "observation.image" in observation: - # observation["image"] = observation.pop("observation.image").to( - # device, non_blocking=device.type == "cuda" - # ) - - # if "observation.image2" in observation: - # observation["wrist_image"] = observation.pop("observation.image2").to( - # device, non_blocking=device.type == "cuda" - # ) - - # if "observation.state" in observation: - # observation["state"] = observation.pop("observation.state").to( - # device, non_blocking=device.type == "cuda" - # ) - with torch.inference_mode(): action = policy.select_action(observation) # Convert to CPU / numpy. @@ -550,15 +526,12 @@ def eval_main(cfg: EvalPipelineConfig): logging.info("Making environment.") envs = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) - breakpoint() + logging.info("Making policy.") policy = make_policy( cfg=cfg.policy, env_cfg=cfg.env, ) - breakpoint() - # policy, _ = load_smolvla(cfg.policy, "physical-intelligence/libero", policy) - # rename "image" -> "observation.image" policy.eval() with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(): diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py deleted file mode 100644 index 5b82ef044..000000000 --- a/src/lerobot/scripts/train_2.py +++ /dev/null @@ -1,345 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import time -from contextlib import nullcontext -from pprint import pformat -from typing import Any - -import torch -from termcolor import colored -from torch.amp import GradScaler -from torch.optim import Optimizer - -from lerobot.configs import parser -from lerobot.configs.train import TrainPipelineConfig -from lerobot.datasets.factory import make_dataset -from lerobot.datasets.sampler import EpisodeAwareSampler -from lerobot.datasets.utils import cycle -from lerobot.envs.factory import make_env -from lerobot.optim.factory import make_optimizer_and_scheduler -from lerobot.policies.factory import make_policy -from lerobot.policies.pretrained import PreTrainedPolicy -from lerobot.policies.utils import get_device_from_parameters -from lerobot.scripts.eval import eval_policy, eval_policy_multitask -from lerobot.utils.logging_utils import AverageMeter, MetricsTracker -from lerobot.utils.random_utils import set_seed -from lerobot.utils.train_utils import ( - get_step_checkpoint_dir, - get_step_identifier, - load_training_state, - save_checkpoint, - update_last_checkpoint, -) -from lerobot.utils.utils import ( - format_big_number, - get_safe_torch_device, - has_method, - init_logging, -) -from lerobot.utils.wandb_utils import WandBLogger - - -def update_policy( - train_metrics: MetricsTracker, - policy: PreTrainedPolicy, - batch: Any, - optimizer: Optimizer, - grad_clip_norm: float, - grad_scaler: GradScaler, - lr_scheduler=None, - use_amp: bool = False, - lock=None, -) -> tuple[MetricsTracker, dict]: - start_time = time.perf_counter() - device = get_device_from_parameters(policy) - policy.train() - with torch.autocast(device_type=device.type) if use_amp else nullcontext(): - loss, output_dict = policy.forward(batch) - # TODO(rcadene): policy.unnormalize_outputs(out_dict) - grad_scaler.scale(loss).backward() - - # Unscale the gradient of the optimizer's assigned params in-place **prior to gradient clipping**. - grad_scaler.unscale_(optimizer) - - grad_norm = torch.nn.utils.clip_grad_norm_( - policy.parameters(), - grad_clip_norm, - error_if_nonfinite=False, - ) - - # Optimizer's gradients are already unscaled, so scaler.step does not unscale them, - # although it still skips optimizer.step() if the gradients contain infs or NaNs. - with lock if lock is not None else nullcontext(): - grad_scaler.step(optimizer) - # Updates the scale for next iteration. - grad_scaler.update() - - optimizer.zero_grad() - - # Step through pytorch scheduler at every batch instead of epoch - if lr_scheduler is not None: - lr_scheduler.step() - - if has_method(policy, "update"): - # To possibly update an internal buffer (for instance an Exponential Moving Average like in TDMPC). - policy.update() - - train_metrics.loss = loss.item() - train_metrics.grad_norm = grad_norm.item() - train_metrics.lr = optimizer.param_groups[0]["lr"] - train_metrics.update_s = time.perf_counter() - start_time - return train_metrics, output_dict - - -def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): - """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" - from lerobot.policies.normalize import Normalize, Unnormalize - - if not hasattr(dataset_meta, "stats") or not dataset_meta.stats: - print("⚠️ Dataset has no stats, skipping normalization injection.") - return - - stats = {} - for key, stat_dict in dataset_meta.stats.items(): - stats[key] = { - stat_type: torch.as_tensor(stat_array) if isinstance(stat_array, np.ndarray) else stat_array - for stat_type, stat_array in stat_dict.items() - } - - normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) - normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) - unnormalize_outputs = Unnormalize( - policy.config.output_features, policy.config.normalization_mapping, stats - ) - - policy.normalize_inputs = normalize_inputs - policy.normalize_targets = normalize_targets - policy.unnormalize_outputs = unnormalize_outputs - - print("βœ… Normalization layers injected with dataset stats.") - - -@parser.wrap() -def train(cfg: TrainPipelineConfig): - cfg.validate() - logging.info(pformat(cfg.to_dict())) - - if cfg.wandb.enable and cfg.wandb.project: - wandb_logger = WandBLogger(cfg) - else: - wandb_logger = None - logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"])) - - if cfg.seed is not None: - set_seed(cfg.seed) - - # Check device is available - device = get_safe_torch_device(cfg.policy.device, log=True) - torch.backends.cudnn.benchmark = True - torch.backends.cuda.matmul.allow_tf32 = True - - logging.info("Creating dataset") - dataset = make_dataset(cfg) - - # Create environment used for evaluating checkpoints during training on simulation data. - # On real-world data, no need to create an environment as evaluations are done outside train.py, - # using the eval.py instead, with gym_dora environment and dora-rs. - eval_env = None - if cfg.eval_freq > 0 and cfg.env is not None: - logging.info("Creating env") - eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) - - logging.info("Creating policy") - policy = make_policy( - cfg=cfg.policy, - ds_meta=dataset.meta, - ) - logging.info("Creating optimizer and scheduler") - optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) - grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp) - - step = 0 # number of policy updates (forward + backward + optim) - - if cfg.resume: - step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) - - num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) - num_total_params = sum(p.numel() for p in policy.parameters()) - - logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") - if cfg.env is not None: - logging.info(f"{cfg.env.task=}") - logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") - logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") - logging.info(f"{dataset.num_episodes=}") - logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") - logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") - - # create dataloader for offline training - if hasattr(cfg.policy, "drop_n_last_frames"): - shuffle = False - sampler = EpisodeAwareSampler( - dataset.episode_data_index, - drop_n_last_frames=cfg.policy.drop_n_last_frames, - shuffle=True, - ) - else: - shuffle = True - sampler = None - - dataloader = torch.utils.data.DataLoader( - dataset, - num_workers=cfg.num_workers, - batch_size=cfg.batch_size, - shuffle=shuffle, - sampler=sampler, - pin_memory=device.type == "cuda", - drop_last=False, - ) - dl_iter = cycle(dataloader) - - policy.train() - train_metrics = { - "loss": AverageMeter("loss", ":.3f"), - "grad_norm": AverageMeter("grdn", ":.3f"), - "lr": AverageMeter("lr", ":0.1e"), - "update_s": AverageMeter("updt_s", ":.3f"), - "dataloading_s": AverageMeter("data_s", ":.3f"), - } - - train_tracker = MetricsTracker( - cfg.batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, initial_step=step - ) - - logging.info("Start offline training on a fixed dataset") - for _ in range(step, cfg.steps): - start_time = time.perf_counter() - batch = next(dl_iter) - train_tracker.dataloading_s = time.perf_counter() - start_time - - for key in batch: - if isinstance(batch[key], torch.Tensor): - batch[key] = batch[key].to(device, non_blocking=device.type == "cuda") - - train_tracker, output_dict = update_policy( - train_tracker, - policy, - batch, - optimizer, - cfg.optimizer.grad_clip_norm, - grad_scaler=grad_scaler, - lr_scheduler=lr_scheduler, - use_amp=cfg.policy.use_amp, - ) - - # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we - # increment `step` here. - step += 1 - train_tracker.step() - is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 - is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps - is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 - - if is_log_step: - logging.info(train_tracker) - if wandb_logger: - wandb_log_dict = train_tracker.to_dict() - if output_dict: - wandb_log_dict.update(output_dict) - wandb_logger.log_dict(wandb_log_dict, step) - train_tracker.reset_averages() - - if cfg.save_checkpoint and is_saving_step: - logging.info(f"Checkpoint policy after step {step}") - checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) - save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler) - update_last_checkpoint(checkpoint_dir) - if wandb_logger: - wandb_logger.log_policy(checkpoint_dir) - - if cfg.env and is_eval_step: - step_id = get_step_identifier(step, cfg.steps) - logging.info(f"Eval policy at step {step}") - with ( - torch.no_grad(), - torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(), - ): - if cfg.env.multitask_eval: - eval_info = eval_policy_multitask( - eval_env, - policy, - cfg.eval.n_episodes, - videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", - max_episodes_rendered=4, - start_seed=cfg.seed, - max_parallel_tasks=cfg.env.max_parallel_tasks, - ) - aggregated = eval_info["overall"]["aggregated"] - # Print per-suite stats, log? - for task_group, task_group_info in eval_info.items(): - if task_group == "overall": - continue # Skip the overall stats since we already printed it - print(f"\nAggregated Metrics for {task_group}:") - print(task_group_info["aggregated"]) - else: - eval_info = eval_policy( - eval_env, - policy, - cfg.eval.n_episodes, - videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", - max_episodes_rendered=4, - start_seed=cfg.seed, - ) - aggregated = eval_info["aggregated"] - - eval_metrics = { - "avg_sum_reward": AverageMeter("βˆ‘rwrd", ":.3f"), - "pc_success": AverageMeter("success", ":.1f"), - "eval_s": AverageMeter("eval_s", ":.3f"), - } - eval_tracker = MetricsTracker( - cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step - ) - eval_tracker.eval_s = aggregated.pop("eval_s") - eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward") - eval_tracker.pc_success = aggregated.pop("pc_success") - logging.info(eval_tracker) - if wandb_logger: - wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} - wandb_logger.log_dict(wandb_log_dict, step, mode="eval") - wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval") - - if eval_env: - if cfg.env.multitask_eval: - for _task_group, envs_dict in eval_env.items(): - for _idx, env in envs_dict.items(): - env.close() - else: - eval_env.close() - logging.info("End of training") - - if cfg.policy.push_to_hub: - policy.push_model_to_hub(cfg) - - -def main(): - init_logging() - train() - - -if __name__ == "__main__": - main() diff --git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py deleted file mode 100644 index 1e8a59a64..000000000 --- a/src/lerobot/scripts/train_accelerate.py +++ /dev/null @@ -1,366 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import time -from pprint import pformat -from typing import Any - -import torch -from accelerate import Accelerator -from accelerate.utils import set_seed as accelerate_set_seed -from termcolor import colored -from torch.optim import Optimizer - -from lerobot.configs import parser -from lerobot.configs.train import TrainPipelineConfig -from lerobot.datasets.factory import make_dataset -from lerobot.datasets.sampler import EpisodeAwareSampler -from lerobot.envs.factory import make_env -from lerobot.optim.factory import make_optimizer_and_scheduler -from lerobot.policies.factory import make_policy -from lerobot.policies.pretrained import PreTrainedPolicy -from lerobot.scripts.eval import eval_policy -from lerobot.utils.logging_utils import AverageMeter, MetricsTracker -from lerobot.utils.train_utils import ( - get_step_checkpoint_dir, - get_step_identifier, - load_training_state, - save_checkpoint, - update_last_checkpoint, -) -from lerobot.utils.utils import ( - format_big_number, - has_method, - init_logging, -) - - -def update_policy( - train_metrics: MetricsTracker, - policy: PreTrainedPolicy, - batch: Any, - optimizer: Optimizer, - grad_clip_norm: float, - accelerator: Accelerator, - lr_scheduler=None, -) -> tuple[MetricsTracker, dict]: - start_time = time.perf_counter() - policy.train() - - # Use accelerator's autocast context if mixed precision is enabled - with accelerator.autocast(): - loss, output_dict = policy.forward(batch) - # TODO(rcadene): policy.unnormalize_outputs(out_dict) - - # Use accelerator for backward pass - accelerator.backward(loss) - - # Gradient clipping - accelerator handles unscaling automatically - if accelerator.sync_gradients and grad_clip_norm > 0: - grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm) - else: - grad_norm = torch.tensor(0.0) - - optimizer.step() - lr_scheduler.step() if lr_scheduler is not None else None - optimizer.zero_grad() - - # Update policy-specific buffers if needed - if has_method(policy, "update"): - policy.update() - - # Gather metrics across all processes - loss_value = accelerator.gather(loss.detach()).mean().item() - grad_norm_value = accelerator.gather(grad_norm).mean().item() - - train_metrics.loss = loss_value - train_metrics.grad_norm = grad_norm_value - train_metrics.lr = optimizer.param_groups[0]["lr"] - train_metrics.update_s = time.perf_counter() - start_time - return train_metrics, output_dict - - -@parser.wrap() -def train(cfg: TrainPipelineConfig): - cfg.validate() - logging.info(pformat(cfg.to_dict())) - - # Initialize accelerator - from accelerate.utils import DistributedDataParallelKwargs - - # added by jade 2 lines - ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False) - accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs]) - - from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem - - ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) - accelerator = Accelerator( - mixed_precision="bf16" if cfg.policy.use_amp else "no", - gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps, - log_with="wandb" if cfg.wandb.enable else None, - kwargs_handlers=[ddp_kwargs], - project_dir=cfg.output_dir, - ) - - accelerator.init_trackers( - project_name=cfg.wandb.project, - init_kwargs={ - "wandb": { - "entity": cfg.wandb.entity, - "name": cfg.job_name, - "notes": cfg.wandb.notes, - "tags": cfg_to_group(cfg, return_list=True), - "dir": cfg.output_dir, - "config": cfg.to_dict(), - "save_code": False, - "job_type": "train_eval", - "mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online", - "resume": "must" if cfg.resume else None, - "id": cfg.wandb.run_id - if cfg.wandb.run_id - else (get_wandb_run_id_from_filesystem(cfg.output_dir) if cfg.resume else None), - } - }, - ) - - # Set seed for reproducibility - if cfg.seed is not None: - accelerate_set_seed(cfg.seed) - - # Setup device - accelerator handles device placement - torch.backends.cudnn.benchmark = True - torch.backends.cuda.matmul.allow_tf32 = True - - # Create dataset - if accelerator.is_main_process: - logging.info("Creating dataset") - dataset = make_dataset(cfg) - print("c") - # Create evaluation environment (only on main process) - eval_env = None - if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process: - logging.info("Creating env") - eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs) - - # Create policy - if accelerator.is_main_process: - logging.info("Creating policy") - - # Use accelerator's device instead of cfg.policy.device - with accelerator.main_process_first(): - policy = make_policy( - cfg=cfg.policy, - ds_meta=dataset.meta, - ) - - # Create optimizer and scheduler - if accelerator.is_main_process: - logging.info("Creating optimizer and scheduler") - optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy) - - step = 0 # number of policy updates - - if cfg.resume: - step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler) - - # Prepare dataloader - if hasattr(cfg.policy, "drop_n_last_frames"): - shuffle = False - sampler = EpisodeAwareSampler( - dataset.episode_data_index, - drop_n_last_frames=cfg.policy.drop_n_last_frames, - shuffle=True, - ) - else: - shuffle = True - sampler = None - - dataloader = torch.utils.data.DataLoader( - dataset, - num_workers=cfg.num_workers, - batch_size=cfg.batch_size, - shuffle=shuffle, - sampler=sampler, - pin_memory=True, - drop_last=True, # Important for distributed training - ) - - # Prepare for distributed training - policy, optimizer, dataloader, lr_scheduler = accelerator.prepare( - policy, optimizer, dataloader, lr_scheduler - ) - - # Log training info (only on main process) - if accelerator.is_main_process: - num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) - num_total_params = sum(p.numel() for p in policy.parameters()) - - logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}") - if cfg.env is not None: - logging.info(f"{cfg.env.task=}") - logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})") - logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})") - logging.info(f"{dataset.num_episodes=}") - logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})") - logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})") - logging.info(f"Number of processes: {accelerator.num_processes}") - logging.info(f"Device: {accelerator.device}") - logging.info(f"Mixed precision: {accelerator.mixed_precision}") - - # Create metrics trackers - train_metrics = { - "loss": AverageMeter("loss", ":.3f"), - "grad_norm": AverageMeter("grdn", ":.3f"), - "lr": AverageMeter("lr", ":0.1e"), - "update_s": AverageMeter("updt_s", ":.3f"), - "dataloading_s": AverageMeter("data_s", ":.3f"), - } - - train_tracker = MetricsTracker( - cfg.batch_size * accelerator.num_processes, # Account for all processes - dataset.num_frames, - dataset.num_episodes, - train_metrics, - initial_step=step, - ) - - # Training loop - policy.train() - if accelerator.is_main_process: - logging.info("Start offline training on a fixed dataset") - - # Create iterator from dataloader - dl_iter = iter(dataloader) - - for current_step in range(step, cfg.steps): - start_time = time.perf_counter() - # Get next batch, cycling through dataloader if needed - try: - batch = next(dl_iter) - print("data laoder batch keys: ", batch.keys()) - breakpoint() - except StopIteration: - dl_iter = iter(dataloader) - batch = next(dl_iter) - train_tracker.dataloading_s = time.perf_counter() - start_time - # Update policy - train_tracker, output_dict = update_policy( - train_tracker, - policy, - batch, - optimizer, - cfg.optimizer.grad_clip_norm, - accelerator, - lr_scheduler=lr_scheduler, - ) - - # Increment step counter - step += 1 - train_tracker.step() - - # Determine if we should log, save, or evaluate - is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0 - is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps - is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0 - - # Logging (only on main process) - if is_log_step and accelerator.is_main_process: - logging.info(train_tracker) - wandb_log_dict = train_tracker.to_dict() - if output_dict: - wandb_log_dict.update(output_dict) - for k, v in wandb_log_dict.items(): - accelerator.log({f"{'train'}/{k}": v}, step=step) - train_tracker.reset_averages() - - # Checkpointing (only on main process) - if cfg.save_checkpoint and is_saving_step: - # βœ… all processes wait here - accelerator.wait_for_everyone() - - if accelerator.is_main_process: - logging.info(f"Checkpoint policy after step {step}") - checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step) - - unwrapped_policy = accelerator.unwrap_model(policy) - save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler) - update_last_checkpoint(checkpoint_dir) - - # βœ… all processes sync again after saving - accelerator.wait_for_everyone() - - # if wandb_logger: - # wandb_logger.log_policy(checkpoint_dir) - - # Evaluation (only on main process) - if cfg.env and is_eval_step and accelerator.is_main_process: - step_id = get_step_identifier(step, cfg.steps) - logging.info(f"Eval policy at step {step}") - - # Unwrap model for evaluation - unwrapped_policy = accelerator.unwrap_model(policy) - unwrapped_policy.eval() - - with torch.no_grad(): - eval_info = eval_policy( - eval_env, - unwrapped_policy, - cfg.eval.n_episodes, - videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}", - max_episodes_rendered=4, - start_seed=cfg.seed, - ) - - eval_metrics = { - "avg_sum_reward": AverageMeter("βˆ‘rwrd", ":.3f"), - "pc_success": AverageMeter("success", ":.1f"), - "eval_s": AverageMeter("eval_s", ":.3f"), - } - eval_tracker = MetricsTracker( - cfg.batch_size * accelerator.num_processes, - dataset.num_frames, - dataset.num_episodes, - eval_metrics, - initial_step=step, - ) - eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s") - eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward") - eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success") - logging.info(eval_tracker) - - wandb_log_dict = {**eval_tracker.to_dict(), **eval_info} - for k, v in wandb_log_dict.items(): - accelerator.log({f"{'eval'}/{k}": v}, step=step) - - # Set back to training mode - policy.train() - - # Wait for all processes to finish - accelerator.wait_for_everyone() - - # Cleanup - if eval_env and accelerator.is_main_process: - eval_env.close() - - if accelerator.is_main_process: - logging.info("End of training") - accelerator.end_training() # added by jade - - -if __name__ == "__main__": - init_logging() - train() From 978412346315244e56e2ca40097ff3dcd8c43fbe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:18:35 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lerobot/envs/libero.py | 5 ++--- src/lerobot/policies/smolvla/modeling_smolvla.py | 2 +- src/lerobot/scripts/eval.py | 5 ++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py index 0672f583f..5d0d37c83 100644 --- a/src/lerobot/envs/libero.py +++ b/src/lerobot/envs/libero.py @@ -4,10 +4,9 @@ import logging import math import os from collections import defaultdict -from collections.abc import Callable -from itertools import chain -from typing import Any, Dict, List from collections.abc import Callable, Iterable, Mapping, Sequence +from itertools import chain +from typing import Any import gymnasium as gym import numpy as np diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py index 6bf956aa3..18f2fc58a 100644 --- a/src/lerobot/policies/smolvla/modeling_smolvla.py +++ b/src/lerobot/policies/smolvla/modeling_smolvla.py @@ -938,4 +938,4 @@ class VLAFlowMatching(nn.Module): suffix_out = suffix_out[:, -self.config.chunk_size :] suffix_out = suffix_out.to(dtype=torch.float32) v_t = self.action_out_proj(suffix_out) - return v_t \ No newline at end of file + return v_t diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index bd131f708..4777f57e3 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -52,14 +52,13 @@ import logging import threading import time from collections import defaultdict -from collections.abc import Callable +from collections.abc import Callable, Iterator from contextlib import nullcontext from copy import deepcopy from dataclasses import asdict from pathlib import Path from pprint import pformat -from typing import Dict, List, Tuple, TypedDict -from collections.abc import Iterator +from typing import TypedDict import einops import gymnasium as gym From 8567ab60d840e39fcec8c713a7bf4f3ad5388a4c Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Thu, 11 Sep 2025 14:24:06 +0200 Subject: [PATCH 09/12] remove unces --- src/lerobot/datasets/lerobot_dataset.py | 3 +- src/lerobot/envs/utils.py | 53 ------------------------- src/lerobot/scripts/eval.py | 1 - 3 files changed, 2 insertions(+), 55 deletions(-) diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 875608c59..10b4e731e 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -471,7 +471,8 @@ class LeRobotDataset(torch.utils.data.Dataset): if self.episodes is not None and self.meta._version >= packaging.version.parse("v2.1"): episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes] self.stats = aggregate_stats(episodes_stats) - + + # Load actual data try: if force_cache_sync: raise FileNotFoundError diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index d65f7f29f..94109bf8b 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -80,59 +80,6 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten return return_observations - -def preprocess_observation1( - observations: dict[str, np.ndarray], cfg: dict[str, Any] = None -) -> dict[str, Tensor]: - # TODO(aliberts, rcadene): refactor this to use features from the environment (no hardcoding) - """Convert environment observation to LeRobot format observation. - Args: - observation: Dictionary of observation batches from a Gym vector environment. - Returns: - Dictionary of observation batches with keys renamed to LeRobot format and values as tensors. - """ - # map to expected inputs for the policy - return_observations = {} - image_key = list(cfg.image_features.keys())[0] if cfg else "observation.image" - state_key = cfg.robot_state_feature_key if cfg else "observation.state" - if "pixels" in observations: - if isinstance(observations["pixels"], dict): - # imgs = {f"{image_key}.{key}": img for key, img in observations["pixels"].items()} - imgs = observations["pixels"] # keys should be OBS_IMAGE, OBS_IMAGE_2, OBS_IMAGE_3 - else: - imgs = {f"{image_key}": observations["pixels"]} - - for imgkey, img in imgs.items(): - # TODO(aliberts, rcadene): use transforms.ToTensor()? - img = torch.from_numpy(img) - - # sanity check that images are channel last - _, h, w, c = img.shape - assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}" - - # sanity check that images are uint8 - assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}" - - # convert to channel first of type float32 in range [0,1] - img = einops.rearrange(img, "b h w c -> b c h w").contiguous() - img = img.type(torch.float32) - img /= 255 - - return_observations[imgkey] = img - - if "environment_state" in observations: - return_observations["observation.environment_state"] = torch.from_numpy( - observations["environment_state"] - ).float() - - # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing - # requirement for "agent_pos" - return_observations[state_key] = torch.from_numpy(observations["agent_pos"]).float() - if "task" in observations: - return_observations["task"] = observations["task"] - return return_observations - - def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]: # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is # (need to also refactor preprocess_observation and externalize normalization from policies) diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 4777f57e3..2bc928117 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -155,7 +155,6 @@ def rollout( while not np.all(done) and step < max_steps: # Numpy array to tensor and changing dictionary keys to LeRobot policy format. observation = preprocess_observation(observation) - # observation = preprocess_observation1(observation) if return_observations: all_observations.append(deepcopy(observation)) From f2530570e0352a1b57cea42a685daec04733bb98 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:25:13 +0000 Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lerobot/datasets/lerobot_dataset.py | 2 +- src/lerobot/envs/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py index 10b4e731e..a869cb920 100644 --- a/src/lerobot/datasets/lerobot_dataset.py +++ b/src/lerobot/datasets/lerobot_dataset.py @@ -471,7 +471,7 @@ class LeRobotDataset(torch.utils.data.Dataset): if self.episodes is not None and self.meta._version >= packaging.version.parse("v2.1"): episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes] self.stats = aggregate_stats(episodes_stats) - + # Load actual data try: if force_cache_sync: diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py index 94109bf8b..4aabe7932 100644 --- a/src/lerobot/envs/utils.py +++ b/src/lerobot/envs/utils.py @@ -80,6 +80,7 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten return return_observations + def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]: # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is # (need to also refactor preprocess_observation and externalize normalization from policies) From d09b2a28af6879bd118325f300a31685f2482198 Mon Sep 17 00:00:00 2001 From: Jade Choghari Date: Thu, 11 Sep 2025 14:28:46 +0200 Subject: [PATCH 11/12] remove --- .../policies/smolvla/smolvlm_with_expert.py | 1 - src/lerobot/scripts/eval.py | 42 ------------------- src/lerobot/scripts/train.py | 29 ------------- 3 files changed, 72 deletions(-) diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py index 3b78c99e6..f3d1a693a 100644 --- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py +++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py @@ -78,7 +78,6 @@ class SmolVLMWithExpertModel(nn.Module): model_id, device_map="auto", torch_dtype="bfloat16", - # torch_dtype=torch.float16, low_cpu_mem_usage=True, ) config = self.vlm.config diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 2bc928117..5e8d63f09 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -465,48 +465,6 @@ def _compile_episode_data( return data_dict - -from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata -from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy - - -def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): - """Recreate normalization layers with proper stats from the dataset.""" - from lerobot.policies.normalize import Normalize, Unnormalize - - # Convert numpy stats to the format expected by normalization layers - stats = {} - for key, stat_dict in dataset_meta.stats.items(): - stats[key] = { - stat_type: torch.from_numpy(stat_array) if isinstance(stat_array, np.ndarray) else stat_array - for stat_type, stat_array in stat_dict.items() - } - - # Recreate normalization layers with proper stats - normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) - - normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) - - unnormalize_outputs = Unnormalize( - policy.config.output_features, policy.config.normalization_mapping, stats - ) - - # Replace the normalization layers on the policy - policy.normalize_inputs = normalize_inputs - policy.normalize_targets = normalize_targets - policy.unnormalize_outputs = unnormalize_outputs - - print("Normalization layers recreated with dataset stats.") - - -def load_smolvla(cfg, dataset_repo: str, policy): - from lerobot.datasets.lerobot_dataset import LeRobotDataset - - dataset = LeRobotDataset(dataset_repo, root="/raid/jade/.cache/huggingface/datasets/") - _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta) # only needed if stats are missing - return policy.to("cuda"), dataset - - @parser.wrap() def eval_main(cfg: EvalPipelineConfig): logging.info(pformat(asdict(cfg))) diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index fc8be4ebc..3aea697d0 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -105,35 +105,6 @@ def update_policy( train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict - -# def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata): -# """Recreate normalization layers with dataset stats if missing (Adil's workaround).""" -# from lerobot.policies.normalize import Normalize, Unnormalize - -# if not hasattr(dataset_meta, "stats") or not dataset_meta.stats: -# print("⚠️ Dataset has no stats, skipping normalization injection.") -# return - -# stats = {} -# for key, stat_dict in dataset_meta.stats.items(): -# stats[key] = { -# stat_type: torch.as_tensor(stat_array) -# if isinstance(stat_array, np.ndarray) -# else stat_array -# for stat_type, stat_array in stat_dict.items() -# } - -# normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats) -# normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats) -# unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats) - -# policy.normalize_inputs = normalize_inputs -# policy.normalize_targets = normalize_targets -# policy.unnormalize_outputs = unnormalize_outputs - -# print("βœ… Normalization layers injected with dataset stats.") - - @parser.wrap() def train(cfg: TrainPipelineConfig): cfg.validate() From 8fe977118b1ee4d661bb8762b0f5c1e5137fa36e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:30:08 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lerobot/scripts/eval.py | 1 + src/lerobot/scripts/train.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py index 5e8d63f09..38a825777 100644 --- a/src/lerobot/scripts/eval.py +++ b/src/lerobot/scripts/eval.py @@ -465,6 +465,7 @@ def _compile_episode_data( return data_dict + @parser.wrap() def eval_main(cfg: EvalPipelineConfig): logging.info(pformat(asdict(cfg))) diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py index 3aea697d0..2656ee6a2 100644 --- a/src/lerobot/scripts/train.py +++ b/src/lerobot/scripts/train.py @@ -105,6 +105,7 @@ def update_policy( train_metrics.update_s = time.perf_counter() - start_time return train_metrics, output_dict + @parser.wrap() def train(cfg: TrainPipelineConfig): cfg.validate()