From 61e55830dadc2bbaa56feed5306c5d0d1282bad0 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Thu, 4 Sep 2025 12:12:10 +0200
Subject: [PATCH 01/12] add train

---
 examples/7_train_libero_smolvla.sh            | 90 +++++++++++++++++++
 .../policies/smolvla/modeling_smolvla.py      | 17 +++-
 .../policies/smolvla/smolvlm_with_expert.py   |  3 +-
 3 files changed, 106 insertions(+), 4 deletions(-)
 create mode 100644 examples/7_train_libero_smolvla.sh

diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh
new file mode 100644
index 000000000..3943e3c96
--- /dev/null
+++ b/examples/7_train_libero_smolvla.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# smolvla training
+
+set -euo pipefail
+
+# repo/env
+cd ~/lerobot || exit 1
+# conda activate lerobot
+export LC_ALL=C
+
+
+rm -f core-*
+
+# storage / caches (use RAID to avoid filling $HOME)
+RAID=/raid/jade
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+export HF_DATASETS_OFFLINE=1
+export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+export MUJOCO_GL=egl
+
+# will only use if accelerate is used
+PORT=29522
+
+# =================== CONFIG ===================
+ENV=libero
+TASK=libero_spatial
+REPO_ID=physical-intelligence/libero
+
+POLICY=smolvla
+VLM=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+# Optim / scheduling
+LR=1e-4
+DECAY_LR=2.5e-6
+DECAY_STEPS=30000
+USE_AMP=false
+TRAIN_EXPERT_ONLY=true
+N_ACTION_STEPS=1
+SEED=1000
+
+# Training loop
+OFFLINE_STEPS=100000
+BATCH_SIZE=32
+EVAL_FREQ=0
+SAVE_FREQ=300000
+EVAL_BATCH_SIZE=1
+NUM_EPISODES=1
+
+# GPU selection 0, 1, 2, 3
+export CUDA_VISIBLE_DEVICES=1
+
+# naming/output dir
+TRAIN_DIR=$RAID/logs/lerobot/lerobot_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
+echo "Training dir: $TRAIN_DIR"
+
+# train
+rm -rf "$TRAIN_DIR"
+
+python src/lerobot/scripts/train.py \
+  --policy.type=$POLICY \
+  --policy.vlm_model_name=$VLM \
+  --dataset.repo_id=$REPO_ID \
+  --dataset.root=$HF_DATASETS_CACHE \
+  --env.type=$ENV \
+  --env.task=$TASK \
+  --output_dir=$TRAIN_DIR \
+  --batch_size=$BATCH_SIZE \
+  --steps=$OFFLINE_STEPS \
+  --eval_freq=$EVAL_FREQ \
+  --save_freq=$SAVE_FREQ \
+  --eval.batch_size=$EVAL_BATCH_SIZE \
+  --eval.n_episodes=$NUM_EPISODES \
+  --policy.use_amp=$USE_AMP \
+  --policy.optimizer_lr=$LR \
+  --policy.repo_id=None \
+  --policy.scheduler_decay_lr=$DECAY_LR \
+  --policy.scheduler_decay_steps=$DECAY_STEPS \
+  --policy.n_action_steps=$N_ACTION_STEPS \
+  --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
+  --policy.vlm_model_name=/raid/jade/.cache/huggingface/models/SmolVLM2-2.2B-Instruct \
+  --seed=$SEED \
+  --wandb.enable=false
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py
index 18f2fc58a..95ed993d2 100644
--- a/src/lerobot/policies/smolvla/modeling_smolvla.py
+++ b/src/lerobot/policies/smolvla/modeling_smolvla.py
@@ -63,7 +63,7 @@ import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn
 from transformers import AutoProcessor
 
-from lerobot.constants import ACTION, OBS_STATE
+from lerobot.constants import ACTION
 from lerobot.policies.normalize import (
     Normalize,
     Unnormalize,
@@ -75,7 +75,8 @@ from lerobot.policies.utils import (
     populate_queues,
 )
 from lerobot.utils.utils import get_safe_dtype
-
+OBS_STATE = 'state'
+ACTION = 'actions'
 # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
 _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
 
@@ -824,12 +825,21 @@ class VLAFlowMatching(nn.Module):
         pad_masks = torch.cat(pad_masks, dim=1)
         att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
         att_masks = att_masks[None, :].expand(bsize, len(att_masks))
+        # added by jade
+        seq_len = pad_masks.shape[1]
+        if seq_len < self.config.chunk_size:
+            embs = pad_tensor(embs, self.config.chunk_size, pad_value=0)
+            pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0)
+            att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0)
         return embs, pad_masks, att_masks
 
     def forward(
         self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
     ) -> Tensor:
         """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
+        #added by jade
+        if actions.ndim == 2:
+            actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1)
         if noise is None:
             noise = self.sample_noise(actions.shape, actions.device)
 
@@ -857,7 +867,8 @@ class VLAFlowMatching(nn.Module):
             use_cache=False,
             fill_kv_cache=False,
         )
-        suffix_out = suffix_out[:, -self.config.chunk_size :]
+        # suffix_out = suffix_out[:, -self.config.chunk_size :]
+        suffix_out = suffix_out[:, -self.config.chunk_size:, :]
         # Original openpi code, upcast attention output
         suffix_out = suffix_out.to(dtype=torch.float32)
         v_t = self.action_out_proj(suffix_out)
diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
index f3d1a693a..f6a49dccf 100644
--- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py
+++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
@@ -77,7 +77,8 @@ class SmolVLMWithExpertModel(nn.Module):
             self.vlm = AutoModelForImageTextToText.from_pretrained(
                 model_id,
                 device_map="auto",
-                torch_dtype="bfloat16",
+                # torch_dtype="bfloat16",
+                torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
             )
             config = self.vlm.config

From 5c628f17004ac80c7f09892e908819f7eadf612c Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Wed, 10 Sep 2025 11:32:54 +0200
Subject: [PATCH 02/12] new things

---
 examples/6_evaluate_libero.sh                 |   49 +-
 examples/6_evaluate_libero_2.sh               |   76 +
 examples/7_train_acc.sh                       |   93 +
 examples/7_train_libero_smolvla.sh            |   17 +-
 examples/8_train_smolvla_must.sh              |  141 ++
 examples/checker.py                           |   27 +
 examples/checker2.py                          |   35 +
 examples/convert_data.py                      |  253 +++
 examples/convert_libero.py                    |  126 ++
 examples/evaluate_libero.py                   |  255 +++
 examples/requirements.in                      |    8 +
 examples/script2.py                           |   70 +
 examples/script3.py                           |   64 +
 examples/script4.py                           |    3 +
 log_text.txt                                  | 1765 +++++++++++++++
 src/lerobot/configs/policies.py               |    1 +
 src/lerobot/datasets/lerobot_dataset.py       |    2 +-
 src/lerobot/datasets/utils.py                 |    3 +-
 src/lerobot/policies/factory.py               |    8 +-
 src/lerobot/policies/normalize.py             |  161 ++
 .../policies/smolpi0/configuration_smolpi0.py |  210 ++
 .../policies/smolpi0/flex_attention.py        |  145 ++
 .../policies/smolpi0/modeling_smolpi0.py      | 1021 +++++++++
 .../policies/smolpi0/smolvlm_with_expert.py   |  824 +++++++
 .../policies/smolvla/modeling_smolvla.py      |  971 +++++++-
 .../policies/smolvla/modeling_smolvla_v2.py   |    0
 src/lerobot/policies/smolvla/saver.txt        |    1 +
 .../policies/smolvla/smolvlm_with_expert.py   |    6 +-
 src/lerobot/scripts/eval.py                   |   44 +-
 src/lerobot/scripts/train.py                  |   29 +-
 src/lerobot/scripts/train_2.py                |  343 +++
 src/lerobot/scripts/train_accelerate.py       |  365 +++
 tmux_log.txt                                  | 2008 +++++++++++++++++
 33 files changed, 9085 insertions(+), 39 deletions(-)
 create mode 100644 examples/6_evaluate_libero_2.sh
 create mode 100644 examples/7_train_acc.sh
 create mode 100644 examples/8_train_smolvla_must.sh
 create mode 100644 examples/checker.py
 create mode 100644 examples/checker2.py
 create mode 100644 examples/convert_data.py
 create mode 100644 examples/convert_libero.py
 create mode 100644 examples/evaluate_libero.py
 create mode 100644 examples/requirements.in
 create mode 100644 examples/script2.py
 create mode 100644 examples/script3.py
 create mode 100644 examples/script4.py
 create mode 100644 log_text.txt
 create mode 100644 src/lerobot/policies/smolpi0/configuration_smolpi0.py
 create mode 100644 src/lerobot/policies/smolpi0/flex_attention.py
 create mode 100644 src/lerobot/policies/smolpi0/modeling_smolpi0.py
 create mode 100644 src/lerobot/policies/smolpi0/smolvlm_with_expert.py
 create mode 100644 src/lerobot/policies/smolvla/modeling_smolvla_v2.py
 create mode 100644 src/lerobot/policies/smolvla/saver.txt
 create mode 100644 src/lerobot/scripts/train_2.py
 create mode 100644 src/lerobot/scripts/train_accelerate.py
 create mode 100644 tmux_log.txt

diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh
index 36f8c6473..ad6ca0f13 100644
--- a/examples/6_evaluate_libero.sh
+++ b/examples/6_evaluate_libero.sh
@@ -1,14 +1,45 @@
 #!/bin/bash
 
-unset LEROBOT_HOME
-unset HF_LEROBOT_HOME
+# storage / caches
+RAID=/raid/jade
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+export HF_DATASETS_OFFLINE=1
+export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+export MUJOCO_GL=egl
+export CUDA_VISIBLE_DEVICES=3
+
 # CONFIGURATION
-POLICY_PATH="bicmol/smolvla-libero"
+POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model"
+POLICY_PATH="/raid/jade/models/smolvlamust"
 TASK=libero_spatial
 ENV_TYPE="libero"
-BATCH_SIZE=1
-N_EPISODES=1
+BATCH_SIZE=10
+N_EPISODES=10
+# storage / caches
+RAID=/raid/jade
+N_ACTION_STEPS=1
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+# export HF_DATASETS_OFFLINE=1
+# export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
 export MUJOCO_GL=egl
+export MUJOCO_GL=egl
+unset HF_HUB_OFFLINE
 # RUN EVALUATION
 python src/lerobot/scripts/eval.py \
     --policy.path="$POLICY_PATH" \
@@ -17,3 +48,11 @@ python src/lerobot/scripts/eval.py \
     --eval.n_episodes="$N_EPISODES" \
     --env.multitask_eval=False \
     --env.task=$TASK \
+# python examples/evaluate_libero.py \
+#     --policy_path "$POLICY_PATH" \
+#     --task_suite_name "$TASK" \
+#     --num_steps_wait 10 \
+#     --num_trials_per_task 10 \
+#     --video_out_path "data/libero/videos" \
+#     --device "cuda" \
+#     --seed 7
\ No newline at end of file
diff --git a/examples/6_evaluate_libero_2.sh b/examples/6_evaluate_libero_2.sh
new file mode 100644
index 000000000..9d05c0330
--- /dev/null
+++ b/examples/6_evaluate_libero_2.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# storage / caches
+RAID=/raid/jade
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+export HF_DATASETS_OFFLINE=1
+export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+export MUJOCO_GL=egl
+export CUDA_VISIBLE_DEVICES=3
+
+# CONFIGURATION
+POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model"
+POLICY_PATH="AustineJohnBreaker/smolvla_stratch_libero_spatial"
+TASK=libero_spatial
+ENV_TYPE="libero"
+BATCH_SIZE=10
+N_EPISODES=10
+USE_AMP=false
+N_ACTION_STEPS=1
+SELF_ATTN_EVERY_N_LAYERS=2
+VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+PAD_LANG_TO=longest
+LOAD_VLM_WEIGHTS=true
+NUM_VLM_LAYERS=16
+CHUNK_SIZE=50
+N_OBS_STEPS=1
+NUM_EXPERT_LAYERS=0
+EXPERT_WIDTH_MULTIPLIER=0.5
+
+
+# storage / caches
+RAID=/raid/jade
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+# export HF_DATASETS_OFFLINE=1
+# export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+export MUJOCO_GL=egl
+export MUJOCO_GL=egl
+ADD_IMAGE_TOKENS=true
+unset HF_HUB_OFFLINE
+# RUN EVALUATION
+python src/lerobot/scripts/eval.py \
+    --policy.path="$POLICY_PATH" \
+    --env.type="$ENV_TYPE" \
+    --eval.batch_size="$BATCH_SIZE" \
+    --eval.n_episodes="$N_EPISODES" \
+    --env.multitask_eval=False \
+    --env.task=$TASK \
+    --policy.use_amp=$USE_AMP \
+    --policy.n_action_steps=$N_ACTION_STEPS \
+    # --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
+    --policy.attention_mode=$ATTN_MODE \
+    --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
+    --policy.vlm_model_name=$VLM_NAME \
+    --policy.pad_language_to=$PAD_LANG_TO \
+    --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
+    --policy.num_vlm_layers=$NUM_VLM_LAYERS \
+    --policy.chunk_size=$CHUNK_SIZE \
+    --policy.n_obs_steps=$N_OBS_STEPS \
+    --policy.num_expert_layers=$NUM_EXPERT_LAYERS \
+    --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
diff --git a/examples/7_train_acc.sh b/examples/7_train_acc.sh
new file mode 100644
index 000000000..27f445143
--- /dev/null
+++ b/examples/7_train_acc.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# smolvla training with accelerate
+
+set -euo pipefail
+
+# repo/env
+cd ~/lerobot || exit 1
+# conda activate lerobot
+export LC_ALL=C
+
+rm -f core-*
+
+# storage / caches
+RAID=/raid/jade
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+export HF_DATASETS_OFFLINE=1
+export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+export MUJOCO_GL=egl
+
+# CONFIG
+ENV=libero
+TASK=libero_spatial
+REPO_ID=physical-intelligence/libero
+
+POLICY=smolvla
+VLM=HuggingFaceTB/SmolVLM2-500M-Instruct
+
+# Optim / scheduling
+LR=1e-4
+DECAY_LR=2.5e-6
+DECAY_STEPS=30000
+USE_AMP=true   # set to true for mixed precision
+TRAIN_EXPERT_ONLY=true
+N_ACTION_STEPS=1
+SEED=1000
+
+# Training loop
+OFFLINE_STEPS=100000
+BATCH_SIZE=32
+EVAL_FREQ=0
+SAVE_FREQ=20000
+EVAL_BATCH_SIZE=1
+NUM_EPISODES=1
+
+# number of gpus to use
+NUM_PROCESSES=2
+export CUDA_VISIBLE_DEVICES=1,3
+PORT=29522
+
+# naming/output dir
+TRAIN_DIR=$RAID/logs/lerobot/lerobot_2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
+echo "Training dir: $TRAIN_DIR"
+
+rm -rf "$TRAIN_DIR"
+
+# RUN
+python -m accelerate.commands.launch \
+  --num_processes $NUM_PROCESSES \
+  --num_machines 1 \
+  --main_process_port $PORT \
+  --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \
+  src/lerobot/scripts/train_accelerate.py \
+    --policy.type=$POLICY \
+    --policy.use_amp=True \
+    --policy.vlm_model_name=$VLM \
+    --dataset.repo_id=$REPO_ID \
+    --dataset.root=$HF_DATASETS_CACHE \
+    --env.type=$ENV \
+    --env.task=$TASK \
+    --output_dir=$TRAIN_DIR \
+    --batch_size=$BATCH_SIZE \
+    --steps=$OFFLINE_STEPS \
+    --eval_freq=$EVAL_FREQ \
+    --save_freq=$SAVE_FREQ \
+    --eval.batch_size=$EVAL_BATCH_SIZE \
+    --eval.n_episodes=$NUM_EPISODES \
+    --policy.optimizer_lr=$LR \
+    --policy.repo_id=None \
+    --policy.scheduler_decay_lr=$DECAY_LR \
+    --policy.scheduler_decay_steps=$DECAY_STEPS \
+    --policy.n_action_steps=$N_ACTION_STEPS \
+    --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
+    --policy.vlm_model_name=$VLM \
+    --seed=$SEED \
+    --wandb.enable=false
diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh
index 3943e3c96..f0b9de4e5 100644
--- a/examples/7_train_libero_smolvla.sh
+++ b/examples/7_train_libero_smolvla.sh
@@ -21,8 +21,8 @@ export WANDB_CACHE_DIR=$RAID/.cache/wandb
 export TMPDIR=$RAID/.cache/tmp
 mkdir -p $TMPDIR
 export WANDB_MODE=offline
-export HF_DATASETS_OFFLINE=1
-export HF_HUB_OFFLINE=1
+# export HF_DATASETS_OFFLINE=1
+# export HF_HUB_OFFLINE=1
 export TOKENIZERS_PARALLELISM=false
 export MUJOCO_GL=egl
 
@@ -31,11 +31,11 @@ PORT=29522
 
 # =================== CONFIG ===================
 ENV=libero
-TASK=libero_spatial
+TASK=libero_object
 REPO_ID=physical-intelligence/libero
-
+ROOT=$RAID
 POLICY=smolvla
-VLM=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+VLM=HuggingFaceTB/SmolVLM2-500M-Instruct
 
 # Optim / scheduling
 LR=1e-4
@@ -55,10 +55,10 @@ EVAL_BATCH_SIZE=1
 NUM_EPISODES=1
 
 # GPU selection 0, 1, 2, 3
-export CUDA_VISIBLE_DEVICES=1
+export CUDA_VISIBLE_DEVICES=0
 
 # naming/output dir
-TRAIN_DIR=$RAID/logs/lerobot/lerobot_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
+TRAIN_DIR=$RAID/logs/lerobot/lerobot_solo_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
 echo "Training dir: $TRAIN_DIR"
 
 # train
@@ -68,7 +68,6 @@ python src/lerobot/scripts/train.py \
   --policy.type=$POLICY \
   --policy.vlm_model_name=$VLM \
   --dataset.repo_id=$REPO_ID \
-  --dataset.root=$HF_DATASETS_CACHE \
   --env.type=$ENV \
   --env.task=$TASK \
   --output_dir=$TRAIN_DIR \
@@ -85,6 +84,6 @@ python src/lerobot/scripts/train.py \
   --policy.scheduler_decay_steps=$DECAY_STEPS \
   --policy.n_action_steps=$N_ACTION_STEPS \
   --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
-  --policy.vlm_model_name=/raid/jade/.cache/huggingface/models/SmolVLM2-2.2B-Instruct \
+  --policy.vlm_model_name=$VLM \
   --seed=$SEED \
   --wandb.enable=false
diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh
new file mode 100644
index 000000000..98029f3ad
--- /dev/null
+++ b/examples/8_train_smolvla_must.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+# smolvla training with accelerate
+
+set -euo pipefail
+
+# repo/env
+cd ~/lerobot || exit 1
+# conda activate lerobot
+export LC_ALL=C
+
+rm -f core-*
+
+# storage / caches
+RAID=/raid/jade
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+# export HF_DATASETS_OFFLINE=1
+# export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+export MUJOCO_GL=egl
+
+# CONFIG
+ENV=libero
+TASK=libero_spatial
+REPO_ID=HuggingfaceVLA/libero
+
+POLICY=smolvla
+VLM=HuggingFaceTB/SmolVLM2-500M-Instruct
+
+# Optim / scheduling
+LR=1e-4
+DECAY_LR=2.5e-6
+DECAY_STEPS=30000
+USE_AMP=true   # set to true for mixed precision
+TRAIN_EXPERT_ONLY=true
+N_ACTION_STEPS=1
+SEED=1000
+LOAD_VLM_WEIGHTS=true
+# Training loop
+OFFLINE_STEPS=100000
+BATCH_SIZE=32
+EVAL_FREQ=0
+SAVE_FREQ=20000
+EVAL_BATCH_SIZE=1
+NUM_EPISODES=1
+ADD_IMAGE_TOKENS=tru
+N_OBS_STEPS=1
+ATTN_MODE=cross_attn
+EXPERT_WIDTH_MULTIPLIER=0.5
+# number of gpus to use
+NUM_PROCESSES=2
+NUM_VLM_LAYERS=0
+SELF_ATTN_EVERY_N_LAYERS=2
+CHUNK_SIZE=50
+export CUDA_VISIBLE_DEVICES=0
+PORT=29522
+PREFIX_LENGTH=0
+LOAD_VLM_WEIGHTS=true
+# naming/output dir
+TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
+echo "Training dir: $TRAIN_DIR"
+
+rm -rf "$TRAIN_DIR"
+
+# RUN
+# python -m accelerate.commands.launch \
+#   --num_processes $NUM_PROCESSES \
+#   --num_machines 1 \
+#   --main_process_port $PORT \
+#   --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \
+#   src/lerobot/scripts/train_accelerate.py \
+#     --policy.type=$POLICY \
+#     --policy.use_amp=True \
+#     --policy.vlm_model_name=$VLM \
+#     --dataset.repo_id=$REPO_ID \
+#     --dataset.root=$HF_DATASETS_CACHE \
+#     --env.type=$ENV \
+#     --env.task=$TASK \
+#     --output_dir=$TRAIN_DIR \
+#     --batch_size=$BATCH_SIZE \
+#     --steps=$OFFLINE_STEPS \
+#     --eval_freq=$EVAL_FREQ \
+#     --save_freq=$SAVE_FREQ \
+#     --eval.batch_size=$EVAL_BATCH_SIZE \
+#     --eval.n_episodes=$NUM_EPISODES \
+#     --policy.optimizer_lr=$LR \
+#     --policy.repo_id=None \
+#     --policy.scheduler_decay_lr=$DECAY_LR \
+#     --policy.scheduler_decay_steps=$DECAY_STEPS \
+#     --policy.n_action_steps=$N_ACTION_STEPS \
+#     --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
+#     --policy.vlm_model_name=$VLM \
+#     --policy.n_obs_steps=$N_OBS_STEPS \
+#     --policy.attention_mode=$ATTN_MODE \
+#     --policy.prefix_length=$PREFIX_LENGTH \
+#     --policy.num_vlm_layers=$NUM_VLM_LAYERS \
+#     --policy.chunk_size=$CHUNK_SIZE \
+#     --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
+#     --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
+#     --seed=$SEED \
+#     --wandb.enable=false
+
+
+python src/lerobot/scripts/train.py \
+    --policy.type=$POLICY \
+    --policy.use_amp=False \
+    --policy.vlm_model_name=$VLM \
+    --dataset.repo_id=$REPO_ID \
+    --dataset.root='/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data' \
+    --env.type=$ENV \
+    --env.task=$TASK \
+    --output_dir=$TRAIN_DIR \
+    --batch_size=$BATCH_SIZE \
+    --steps=$OFFLINE_STEPS \
+    --eval_freq=$EVAL_FREQ \
+    --save_freq=$SAVE_FREQ \
+    --eval.batch_size=$EVAL_BATCH_SIZE \
+    --eval.n_episodes=$NUM_EPISODES \
+    --policy.optimizer_lr=$LR \
+    --policy.repo_id=None \
+    --policy.scheduler_decay_lr=$DECAY_LR \
+    --policy.scheduler_decay_steps=$DECAY_STEPS \
+    --policy.n_action_steps=$N_ACTION_STEPS \
+    --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
+    --policy.vlm_model_name=$VLM \
+    --policy.n_obs_steps=$N_OBS_STEPS \
+    --policy.attention_mode=$ATTN_MODE \
+    --policy.prefix_length=$PREFIX_LENGTH \
+    --policy.num_vlm_layers=$NUM_VLM_LAYERS \
+    --policy.chunk_size=$CHUNK_SIZE \
+    --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
+    --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
+    --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
+    --seed=$SEED \
+    --wandb.enable=false
diff --git a/examples/checker.py b/examples/checker.py
new file mode 100644
index 000000000..12377e9b9
--- /dev/null
+++ b/examples/checker.py
@@ -0,0 +1,27 @@
+from huggingface_hub import HfApi
+api = HfApi()
+# api.upload_large_folder(
+#     repo_id="HuggingFaceVLA/libero",
+#     repo_type="dataset",
+#     folder_path="/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero",
+# )
+api.upload_large_folder(
+    repo_id="HuggingFaceVLA/metaworld_mt50",
+    repo_type="dataset",
+    folder_path="/raid/jade/.cache/huggingface/lerobot/metaworld_mt50",
+)
+# repo_id="HuggingFaceVLA/libero"
+# # Upload extra files
+# api.upload_file(
+#     repo_id=repo_id,
+#     repo_type="dataset",
+#     path_or_fileobj="/raid/jade/libero_converted/README.md",
+#     path_in_repo="README.md"
+# )
+
+# api.upload_folder(
+#     repo_id=repo_id,
+#     repo_type="dataset",
+#     folder_path="/raid/jade/libero_converted/meta",
+#     path_in_repo="meta"
+# )
diff --git a/examples/checker2.py b/examples/checker2.py
new file mode 100644
index 000000000..a5825d87f
--- /dev/null
+++ b/examples/checker2.py
@@ -0,0 +1,35 @@
+import pyarrow.parquet as pq
+
+# # First parquet (cached HF version)
+meta1 = pq.read_metadata("/raid/jade/.cache/huggingface/datasets/data/chunk-000/episode_000000.parquet")
+meta1 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet")
+print("First parquet key_value_metadata:")
+print(meta1.metadata)  # low-level file metadata
+# print()
+print("Second")
+# Second parquet (your converted version)
+meta2 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet")
+print("\nSecond parquet key_value_metadata:")
+# print(meta2.metadata)
+
+# from datasets import load_dataset
+# root_dir = "/raid/jade/libero_converted"
+
+# # Load all parquet files under the root_dir recursively
+# ds = load_dataset("parquet", data_files=f"{root_dir}/**/*.parquet")
+
+# print(ds)                 # prints split info
+# print(ds["train"].features)  # check schema/features
+
+# # Peek at one row
+# example = ds["train"][0]
+# print(example.keys())
+# print(type(example["observation.images.image"]))
+# print(type(example["observation.images.image2"]))
+
+import pyarrow.parquet as pq
+
+for ep in ["episode_000019.parquet", "episode_000021.parquet", "episode_000026.parquet"]:
+    path = f"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/{ep}"
+    schema = pq.read_schema(path)
+    print(ep, schema.names)
diff --git a/examples/convert_data.py b/examples/convert_data.py
new file mode 100644
index 000000000..96ce58cb1
--- /dev/null
+++ b/examples/convert_data.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""
+Convert local LeRobot datasets from v2.0 to v2.1 format.
+This script adapts the official converter to work with local datasets.
+"""
+
+import sys
+import argparse
+import logging
+from pathlib import Path
+
+# Add lerobot to path
+sys.path.insert(0, '/home/jade_choghari/lerobot/src')
+
+from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
+from lerobot.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info
+from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def convert_local_dataset(
+    dataset_path: str,
+    num_workers: int = 4,
+    skip_if_converted: bool = True
+):
+    """
+    Convert a local dataset from v2.0 to v2.1 format.
+    
+    Args:
+        dataset_path: Path to the local dataset directory
+        num_workers: Number of workers for parallel processing
+        skip_if_converted: Skip if already has episodes_stats.jsonl
+    """
+    dataset_path = Path(dataset_path)
+    
+    print(f"🔄 Converting local dataset: {dataset_path}")
+    
+    # Check if already converted
+    episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl"
+    if episodes_stats_path.exists() and skip_if_converted:
+        # Check if file is empty
+        file_size = episodes_stats_path.stat().st_size
+        if file_size == 0:
+            print(f"  ⚠️  episodes_stats.jsonl is empty, will regenerate")
+        else:
+            # Check if file has content
+            with open(episodes_stats_path, 'r') as f:
+                content = f.read().strip()
+                if not content:
+                    print(f"  ⚠️  episodes_stats.jsonl has no content, will regenerate")
+                else:
+                    print(f"  ⏭️  Already has episodes_stats.jsonl, skipping")
+                    return True
+    
+    try:
+        # Check if this is a v2.0 dataset that needs conversion
+        episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl"
+        stats_path = dataset_path / "meta" / "stats.json"
+        
+        if not episodes_stats_path.exists() and stats_path.exists():
+            print(f"  🔄 Detected v2.0 dataset, creating temporary episodes_stats.jsonl...")
+            # Create empty episodes_stats.jsonl to allow loading
+            episodes_stats_path.touch()
+            created_temp_file = True
+        else:
+            created_temp_file = False
+        
+        # Load dataset from local path with pyav video backend
+        print(f"  📂 Loading dataset from local path...")
+        # Use a dummy repo_id since we're loading locally
+        dummy_repo_id = f"{dataset_path.parent.name}/{dataset_path.name}"
+        dataset = LeRobotDataset(
+            dummy_repo_id, 
+            root=str(dataset_path), 
+            # video_backend="pyav",
+            # local_files_only=True
+        )
+        
+        # Remove temporary file if we created it
+        if created_temp_file and episodes_stats_path.exists() and episodes_stats_path.stat().st_size == 0:
+            episodes_stats_path.unlink()
+            print(f"  🗑️  Removed temporary episodes_stats.jsonl")
+        
+        # Remove existing episodes_stats if present (ensure clean conversion)
+        episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl"
+        if episodes_stats_path.exists():
+            episodes_stats_path.unlink()
+            print(f"  🗑️  Removed existing episodes_stats.jsonl")
+        
+        # Check if video directory exists before conversion
+        videos_dir = dataset_path / "videos"
+        if not videos_dir.exists():
+            print(f"  ⚠️  No videos directory found - will skip video statistics")
+        
+        # Convert stats
+        print(f"  📊 Computing episode statistics...")
+        convert_stats(dataset, num_workers=num_workers)
+        
+        # Load reference stats for validation if they exist
+        stats_path = dataset.root / STATS_PATH
+        if stats_path.exists():
+            print(f"  ✅ Validating against reference stats...")
+            try:
+                ref_stats = load_stats(dataset.root)
+                check_aggregate_stats(dataset, ref_stats)
+                print(f"  ✅ Stats validation passed!")
+            except AssertionError as e:
+                print(f"  ⚠️  Stats validation failed with minor differences: {e}")
+                print(f"  ⚠️  This is likely due to floating-point precision, continuing anyway...")
+                # Check if the error is just a small numerical difference
+                if "Max absolute difference:" in str(e) and "Max relative difference:" in str(e):
+                    print(f"  ✅ Treating as acceptable numerical precision difference")
+                else:
+                    raise e
+            
+            # Remove old stats.json file
+            print(f"  🗑️  Removing old stats.json")
+            stats_path.unlink()
+        else:
+            print(f"  ⚠️  No reference stats found, skipping validation")
+        
+        # Update codebase version
+        dataset.meta.info["codebase_version"] = CODEBASE_VERSION
+        write_info(dataset.meta.info, dataset.root)
+        
+        print(f"  ✅ Successfully converted to v2.1")
+        return True
+        
+    except Exception as e:
+        print(f"  ❌ Failed to convert: {e}")
+        logger.exception("Conversion failed")
+        return False
+
+def convert_multiple_datasets(
+    base_dirs: list[str],
+    max_datasets: int = None,
+    num_workers: int = 4
+):
+    """Convert multiple datasets from base directories."""
+    
+    datasets_to_convert = []
+    
+    # Scan for datasets needing conversion
+    for base_dir in base_dirs:
+        base_path = Path(base_dir)
+        if not base_path.exists():
+            print(f"⚠️  Directory not found: {base_dir}")
+            continue
+        
+        print(f"🔍 Scanning: {base_dir}")
+        
+        # Walk through author/dataset structure
+        for author_dir in sorted(base_path.iterdir()):
+            if not author_dir.is_dir():
+                continue
+            
+            for dataset_dir in sorted(author_dir.iterdir()):
+                if not dataset_dir.is_dir():
+                    continue
+                
+                # Check if needs conversion
+                episodes_stats_path = dataset_dir / "meta" / "episodes_stats.jsonl" 
+                info_path = dataset_dir / "meta" / "info.json"
+                
+                needs_conversion = False
+                if info_path.exists():
+                    if not episodes_stats_path.exists():
+                        needs_conversion = True
+                        print(f"  📝 Found (missing): {author_dir.name}/{dataset_dir.name}")
+                    else:
+                        # Check if episodes_stats file is empty
+                        try:
+                            file_size = episodes_stats_path.stat().st_size
+                            if file_size == 0:
+                                needs_conversion = True
+                                print(f"  📝 Found (empty): {author_dir.name}/{dataset_dir.name}")
+                            else:
+                                # Check if file has content
+                                with open(episodes_stats_path, 'r') as f:
+                                    content = f.read().strip()
+                                    if not content:
+                                        needs_conversion = True
+                                        print(f"  📝 Found (no content): {author_dir.name}/{dataset_dir.name}")
+                        except Exception as e:
+                            # If we can't read the file, consider it needs conversion
+                            needs_conversion = True
+                            print(f"  📝 Found (read error): {author_dir.name}/{dataset_dir.name}")
+                
+                if needs_conversion:
+                    datasets_to_convert.append(dataset_dir)
+    
+    if not datasets_to_convert:
+        print("🎉 No datasets need conversion!")
+        return
+    
+    if max_datasets:
+        datasets_to_convert = datasets_to_convert[:max_datasets]
+    
+    print(f"\n🚀 Converting {len(datasets_to_convert)} datasets...")
+    
+    successful = 0
+    failed = 0
+    
+    for i, dataset_path in enumerate(datasets_to_convert, 1):
+        print(f"\n[{i}/{len(datasets_to_convert)}] {dataset_path.parent.name}/{dataset_path.name}")
+        
+        success = convert_local_dataset(dataset_path, num_workers=num_workers)
+        if success:
+            successful += 1
+        else:
+            failed += 1
+    
+    print(f"\n📊 Conversion Summary:")
+    print(f"   ✅ Successful: {successful}")
+    print(f"   ❌ Failed: {failed}")
+    print(f"   📈 Success rate: {successful}/{len(datasets_to_convert)} ({100*successful/len(datasets_to_convert):.1f}%)")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert local LeRobot datasets to v2.1 format")
+    parser.add_argument("--dataset", type=str, help="Single dataset path to convert")
+    parser.add_argument("--base-dirs", nargs="+", 
+                       default=["/fsx/dana_aubakirova/vla/community_dataset_v1"],
+                       help="Base directories to scan for datasets")
+    parser.add_argument("--max-datasets", type=int, help="Maximum number of datasets to convert")
+    parser.add_argument("--num-workers", type=int, default=4, help="Number of workers for stats computation")
+    parser.add_argument("--all", action="store_true", help="Convert all datasets in base directories")
+    
+    args = parser.parse_args()
+    
+    if args.dataset:
+        # Convert single dataset
+        success = convert_local_dataset(args.dataset, num_workers=args.num_workers)
+        if success:
+            print(f"\n🎉 Successfully converted: {args.dataset}")
+        else:
+            print(f"\n💥 Failed to convert: {args.dataset}")
+            sys.exit(1)
+    
+    elif args.all:
+        # Convert all datasets
+        convert_multiple_datasets(
+            args.base_dirs, 
+            max_datasets=args.max_datasets,
+            num_workers=args.num_workers
+        )
+    
+    else:
+        parser.print_help()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/convert_libero.py b/examples/convert_libero.py
new file mode 100644
index 000000000..7bfc50eae
--- /dev/null
+++ b/examples/convert_libero.py
@@ -0,0 +1,126 @@
+import os
+import pyarrow.parquet as pq
+import tempfile
+import shutil
+
+# Root directory of converted data
+root_dir = "/raid/jade/libero_converted"
+
+# No renaming
+rename_map = {
+    
+}
+
+# Hugging Face features metadata (constant across all files)
+HF_METADATA = {
+    b"huggingface": b'{"info": {"features": {"observation.images.image": {"_type": "Image"}, "observation.images.image2": {"_type": "Image"}, "state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, "actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, "timestamp": {"dtype": "float32", "_type": "Value"}, "frame_index": {"dtype": "int64", "_type": "Value"}, "episode_index": {"dtype": "int64", "_type": "Value"}, "index": {"dtype": "int64", "_type": "Value"}, "task_index": {"dtype": "int64", "_type": "Value"}}}}'
+}
+
+def patch_parquet(parquet_path, hf_metadata):
+    try:
+        table = pq.read_table(parquet_path)
+
+        # Merge metadata
+        new_meta = dict(table.schema.metadata or {})
+        new_meta.update(hf_metadata)
+
+        # Apply metadata to table
+        table = table.replace_schema_metadata(new_meta)
+
+        # Write safely via temp file
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
+        os.close(tmp_fd)
+        pq.write_table(table, tmp_path)
+        shutil.move(tmp_path, parquet_path)
+
+        print(f"✅ Patched: {parquet_path}")
+        return True
+    except Exception as e:
+        print(f"❌ Failed on {parquet_path}: {e}")
+        return False
+
+# Walk through all chunk dirs and patch parquet files
+for dirpath, _, filenames in os.walk(root_dir):
+    for fname in filenames:
+        if fname.endswith(".parquet"):
+            fpath = os.path.join(dirpath, fname)
+            patch_parquet(fpath, HF_METADATA)#!/usr/bin/env python3
+
+#!/usr/bin/env python3
+import os
+import pyarrow.parquet as pq
+import tempfile
+import shutil
+
+# Explicit list of files to patch
+FILES_TO_PATCH = [
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet",
+]
+
+# Optional renaming map (fill in as needed)
+rename_map = {
+    # "old_column_name": "new_column_name",
+    "image": "observation.images.image",
+    "image2": "observation.images.image2",
+    "actions": "action",
+}
+
+# Hugging Face features metadata (constant across all files)
+HF_METADATA = {
+    b"huggingface": b'{"info": {"features": {'
+    b'"observation.images.image": {"_type": "Image"}, '
+    b'"observation.images.image2": {"_type": "Image"}, '
+    b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, '
+    b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, '
+    b'"timestamp": {"dtype": "float32", "_type": "Value"}, '
+    b'"frame_index": {"dtype": "int64", "_type": "Value"}, '
+    b'"episode_index": {"dtype": "int64", "_type": "Value"}, '
+    b'"index": {"dtype": "int64", "_type": "Value"}, '
+    b'"task_index": {"dtype": "int64", "_type": "Value"}}}}'
+}
+
+def patch_parquet(parquet_path, hf_metadata, rename_map):
+    try:
+        # Load parquet table
+        table = pq.read_table(parquet_path)
+
+        # If renaming is needed
+        if rename_map:
+            schema = table.schema
+            new_names = [
+                rename_map.get(name, name) for name in schema.names
+            ]
+            table = table.rename_columns(new_names)
+
+        # Merge schema metadata
+        new_meta = dict(table.schema.metadata or {})
+        new_meta.update(hf_metadata)
+
+        # Replace metadata in table
+        table = table.replace_schema_metadata(new_meta)
+
+        # Write safely via temp file
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
+        os.close(tmp_fd)
+        pq.write_table(table, tmp_path)
+
+        # Replace original file
+        shutil.move(tmp_path, parquet_path)
+
+        print(f"✅ Patched: {parquet_path}")
+        return True
+    except Exception as e:
+        print(f"❌ Failed on {parquet_path}: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    for fpath in FILES_TO_PATCH:
+        if os.path.exists(fpath):
+            patch_parquet(fpath, HF_METADATA, rename_map)
+        else:
+            print(f"⚠️ File not found: {fpath}")
diff --git a/examples/evaluate_libero.py b/examples/evaluate_libero.py
new file mode 100644
index 000000000..bf99994b6
--- /dev/null
+++ b/examples/evaluate_libero.py
@@ -0,0 +1,255 @@
+"""
+This script demonstrates how to evaluate a pretrained smolVLA policy on the LIBERO benchmark.
+"""
+
+import collections
+import dataclasses
+import logging
+import math
+import pathlib
+
+import cv2
+import draccus
+import imageio
+import numpy as np
+import torch
+from libero.libero import benchmark, get_libero_path
+from libero.libero.envs import OffScreenRenderEnv
+from tqdm import tqdm
+
+from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
+from lerobot.policies.pi0.modeling_pi0 import PI0Policy
+
+LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0]
+LIBERO_ENV_RESOLUTION = 256  # resolution used to render training data
+
+@dataclasses.dataclass
+class Args:
+    """
+    Evaluation arguments for smolVLA on LIBERO.
+    """
+
+    # --- Hugging Face arguments ---
+    policy_path: str = "lerobot/smolvla_base"
+    """Path to the pretrained policy on the Hugging Face Hub or local directory."""
+
+    # --- LIBERO environment-specific parameters ---
+    task_suite_name: str = "libero_spatial"
+    """Task suite. Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90"""
+    num_steps_wait: int = 10
+    """Number of steps to wait for objects to stabilize in sim."""
+    num_trials_per_task: int = 50
+    """Number of rollouts per task."""
+
+    # --- Evaluation arguments ---
+    video_out_path: str = "data/libero/videos"
+    """Path to save videos."""
+    device: str = "cuda"
+    """Device to use for evaluation."""
+
+    seed: int = 7
+    """Random Seed (for reproducibility)"""
+
+
+@draccus.wrap()
+def eval_libero(args: Args) -> None:
+    # Set random seed
+    torch.manual_seed(args.seed)
+    np.random.seed(args.seed)
+
+    # --- Load Policy ---
+    policy = SmolVLAPolicy.from_pretrained(args.policy_path)
+    policy.to(args.device)
+    policy.eval()
+
+    # --- Initialize LIBERO task suite ---
+    benchmark_dict = benchmark.get_benchmark_dict()
+    try:
+        task_suite = benchmark_dict[args.task_suite_name]()
+    except KeyError:
+        raise ValueError(
+            f"Unknown task suite: {args.task_suite_name}. "
+            f"Available options are: {list(benchmark_dict.keys())}"
+        )
+    num_tasks_in_suite = task_suite.n_tasks
+    logging.info(f"Task suite: {args.task_suite_name}")
+
+    pathlib.Path(args.video_out_path).mkdir(parents=True, exist_ok=True)
+
+    if args.task_suite_name == "libero_spatial":
+        max_steps = 220  # longest training demo has 193 steps
+    elif args.task_suite_name == "libero_object":
+        max_steps = 280  # longest training demo has 254 steps
+    elif args.task_suite_name == "libero_goal":
+        max_steps = 300  # longest training demo has 270 steps
+    elif args.task_suite_name == "libero_10":
+        max_steps = 520  # longest training demo has 505 steps
+    elif args.task_suite_name == "libero_90":
+        max_steps = 400  # longest training demo has 373 steps
+    else:
+        # Fallback for custom task suites
+        max_steps = 520
+
+    # --- Evaluation Loop ---
+    total_episodes, total_successes = 0, 0
+    for task_id in tqdm(range(num_tasks_in_suite), desc="Tasks"):
+        # Get task
+        task = task_suite.get_task(task_id)
+
+        # Get default LIBERO initial states
+        initial_states = task_suite.get_task_init_states(task_id)
+        
+        # Initialize LIBERO environment and task description
+        env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args.seed)
+
+        # Start episodes
+        task_episodes, task_successes = 0, 0
+        for episode_idx in tqdm(
+            range(min(args.num_trials_per_task, len(initial_states))),
+            desc=f"Task {task_id}: {task.language}",
+            leave=False,
+        ):
+            logging.info(f"\nTask: {task_description}")
+
+            # Reset environment and policy
+            env.reset()
+            policy.reset()
+
+            # Set initial states
+            obs = env.set_init_state(initial_states[episode_idx])
+
+            # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects
+            # and we need to wait for them to fall
+            for _ in range(args.num_steps_wait):
+                obs, _, _, _ = env.step(LIBERO_DUMMY_ACTION)
+
+            # Setup
+            t = 0
+            frames = []
+            done = False
+
+            # Add initial frame
+            agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1])
+            # frames.append(agentview_image)
+            # import ipdb; ipdb.set_trace()
+            logging.info(f"Starting episode {task_episodes+1}...")
+            while t < max_steps:
+                try:
+                    # Get preprocessed image
+                    # IMPORTANT: rotate 180 degrees to match train preprocessing
+                    wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1])
+                    agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1])
+                    frames.append(agentview_image)
+
+                    # Prepare observations dict
+                    state = np.concatenate(
+                        (
+                            obs["robot0_eef_pos"],
+                            _quat2axisangle(obs["robot0_eef_quat"]),
+                            obs["robot0_gripper_qpos"],
+                        )
+                    )
+                    observation = {
+                        "observation.images.image": torch.from_numpy(agentview_image / 255.0)
+                        .permute(2, 0, 1)
+                        .to(torch.float32)
+                        .to(args.device).unsqueeze(0),
+                        "observation.images.image2": torch.from_numpy(wrist_img / 255.0)
+                        .permute(2, 0, 1)
+                        .to(torch.float32)
+                        .to(args.device).unsqueeze(0),
+                        "observation.state": torch.from_numpy(state).to(torch.float32).to(args.device).unsqueeze(0),
+                        "task": task_description,
+                    }
+
+                    # Query model to get action
+                    with torch.inference_mode():
+                        action_tensor = policy.select_action(observation)
+                    action = action_tensor.cpu().numpy()[0]
+                    action[-1] = 1 -  action[-1]
+
+                    # Execute action in environment
+                    obs, _, done, _ = env.step(action)
+                    if done:
+                        task_successes += 1
+                        total_successes += 1
+                        break
+                    t += 1
+
+                except Exception as e:
+                    logging.error(f"Caught exception: {e}")
+                    break
+
+            task_episodes += 1
+            total_episodes += 1
+
+            # Save a replay video of the episode
+            suffix = "success" if done else "failure"
+            task_segment = task_description.replace(" ", "_").replace("/", "_")
+            video_path = (
+                pathlib.Path(args.video_out_path) / f"rollout_task_{task_id}_episode_{episode_idx}_{task_segment}_{suffix}.mp4"
+            )
+            fps = 30
+            writer = imageio.get_writer(video_path, fps=fps)
+
+            for image in frames:
+                writer.append_data(image)
+            writer.close()
+            logging.info(f"Saved video to {video_path}")
+
+            # Log current results
+            logging.info(f"Success: {done}")
+            if total_episodes > 0:
+                logging.info(f"# episodes completed so far: {total_episodes}")
+                logging.info(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)")
+
+        # Log final results for the task
+        if task_episodes > 0:
+            logging.info(f"Task {task_id} success rate: {float(task_successes) / float(task_episodes):.2f}")
+        if total_episodes > 0:
+            logging.info(f"Cumulative success rate: {float(total_successes) / float(total_episodes):.2f}")
+
+    logging.info("--- Evaluation finished ---")
+    if total_episodes > 0:
+        logging.info(f"Total success rate: {float(total_successes) / float(total_episodes):.2f}")
+    logging.info(f"Total episodes: {total_episodes}")
+    logging.info(f"Total successes: {total_successes}")
+    cv2.destroyAllWindows()
+
+
+def _get_libero_env(task, resolution, seed):
+    """Initializes and returns the LIBERO environment, along with the task description."""
+    task_description = task.language
+    task_bddl_file = pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file
+    env_args = {
+        "bddl_file_name": str(task_bddl_file),
+        "camera_heights": resolution,
+        "camera_widths": resolution,
+    }
+    env = OffScreenRenderEnv(**env_args)
+    env.seed(seed)  # IMPORTANT: seed seems to affect object positions even when using fixed initial state
+    return env, task_description
+
+
+def _quat2axisangle(quat):
+    """
+    Copied from robosuite:
+    https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
+    """
+    # clip quaternion
+    if quat[3] > 1.0:
+        quat[3] = 1.0
+    elif quat[3] < -1.0:
+        quat[3] = -1.0
+
+    den = np.sqrt(1.0 - quat[3] * quat[3])
+    if math.isclose(den, 0.0):
+        # This is (close to) a zero degree rotation, immediately return
+        return np.zeros(3)
+
+    return (quat[:3] * 2.0 * math.acos(quat[3])) / den
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    eval_libero()
\ No newline at end of file
diff --git a/examples/requirements.in b/examples/requirements.in
new file mode 100644
index 000000000..25664608a
--- /dev/null
+++ b/examples/requirements.in
@@ -0,0 +1,8 @@
+imageio[ffmpeg]
+numpy==1.22.4
+tqdm
+tyro
+PyYaml
+opencv-python==4.6.0.66
+robosuite==1.4.1
+matplotlib==3.5.3
\ No newline at end of file
diff --git a/examples/script2.py b/examples/script2.py
new file mode 100644
index 000000000..cbd4da913
--- /dev/null
+++ b/examples/script2.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+import os
+import pyarrow.parquet as pq
+import tempfile
+import shutil
+
+FILES_TO_PATCH = [
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet",
+    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet",
+]
+
+# Column renaming map
+rename_map = {
+    "wrist_image": "observation.images.image2",
+    "actions": "action",
+}
+
+# Hugging Face metadata
+HF_METADATA = {
+    b"huggingface": b'{"info": {"features": {'
+    b'"observation.images.image": {"_type": "Image"}, '
+    b'"observation.images.image2": {"_type": "Image"}, '
+    b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, '
+    b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, '
+    b'"timestamp": {"dtype": "float32", "_type": "Value"}, '
+    b'"frame_index": {"dtype": "int64", "_type": "Value"}, '
+    b'"episode_index": {"dtype": "int64", "_type": "Value"}, '
+    b'"index": {"dtype": "int64", "_type": "Value"}, '
+    b'"task_index": {"dtype": "int64", "_type": "Value"}}}}'
+}
+
+def patch_parquet(parquet_path, hf_metadata, rename_map):
+    try:
+        table = pq.read_table(parquet_path)
+
+        # Apply column renames if needed
+        if rename_map:
+            schema = table.schema
+            new_names = [rename_map.get(name, name) for name in schema.names]
+            table = table.rename_columns(new_names)
+
+        # Merge schema metadata
+        new_meta = dict(table.schema.metadata or {})
+        new_meta.update(hf_metadata)
+
+        # Replace metadata
+        table = table.replace_schema_metadata(new_meta)
+
+        # Write via temp file
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
+        os.close(tmp_fd)
+        pq.write_table(table, tmp_path)
+
+        shutil.move(tmp_path, parquet_path)
+        print(f"✅ Patched: {parquet_path}")
+        return True
+    except Exception as e:
+        print(f"❌ Failed on {parquet_path}: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    for fpath in FILES_TO_PATCH:
+        if os.path.exists(fpath):
+            patch_parquet(fpath, HF_METADATA, rename_map)
+        else:
+            print(f"⚠️ File not found: {fpath}")
diff --git a/examples/script3.py b/examples/script3.py
new file mode 100644
index 000000000..7b4d7957a
--- /dev/null
+++ b/examples/script3.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+import os
+import pyarrow.parquet as pq
+import tempfile
+import shutil
+
+# Root directory containing all parquet files
+ROOT_DIR = "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data"
+
+# Column renaming map (normalize schema to what training expects)
+rename_map = {
+    "state": "observation.state",
+}
+
+# Hugging Face metadata (aligned with expected feature names)
+HF_METADATA = {
+    b"huggingface": b'{"info": {"features": {'
+    b'"observation.images.image": {"_type": "Image"}, '
+    b'"observation.images.image2": {"_type": "Image"}, '
+    b'"observation.state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, '
+    b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, '
+    b'"timestamp": {"dtype": "float32", "_type": "Value"}, '
+    b'"frame_index": {"dtype": "int64", "_type": "Value"}, '
+    b'"episode_index": {"dtype": "int64", "_type": "Value"}, '
+    b'"index": {"dtype": "int64", "_type": "Value"}, '
+    b'"task_index": {"dtype": "int64", "_type": "Value"}}}}'
+}
+
+def patch_parquet(parquet_path, hf_metadata, rename_map):
+    try:
+        # Read the parquet table
+        table = pq.read_table(parquet_path)
+
+        # Apply renames if necessary
+        if rename_map:
+            new_names = [rename_map.get(name, name) for name in table.schema.names]
+            if new_names != table.schema.names:
+                table = table.rename_columns(new_names)
+
+        # Update metadata
+        new_meta = dict(table.schema.metadata or {})
+        new_meta.update(hf_metadata)
+        table = table.replace_schema_metadata(new_meta)
+
+        # Write to temp file then atomically move back
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
+        os.close(tmp_fd)
+        pq.write_table(table, tmp_path)
+        shutil.move(tmp_path, parquet_path)
+
+        # Debug print
+        print(f"✅ Patched: {parquet_path}")
+        print("   Columns:", table.schema.names)
+        return True
+    except Exception as e:
+        print(f"❌ Failed on {parquet_path}: {e}")
+        return False
+
+if __name__ == "__main__":
+    for dirpath, _, filenames in os.walk(ROOT_DIR):
+        for fname in filenames:
+            if fname.endswith(".parquet"):
+                fpath = os.path.join(dirpath, fname)
+                patch_parquet(fpath, HF_METADATA, rename_map)
diff --git a/examples/script4.py b/examples/script4.py
new file mode 100644
index 000000000..2eed60a94
--- /dev/null
+++ b/examples/script4.py
@@ -0,0 +1,3 @@
+from huggingface_hub import HfApi
+hub_api = HfApi()
+hub_api.create_tag("HuggingFaceVLA/libero", tag="v2.1", repo_type="dataset")
diff --git a/log_text.txt b/log_text.txt
new file mode 100644
index 000000000..6676df0eb
--- /dev/null
+++ b/log_text.txt
@@ -0,0 +1,1765 @@
+    self.vlm_with_expert = SmolVLMWithExpertModel(
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/smolvlm_with_expert.py", line 88, in __init__
+    self.processor = AutoProcessor.from_pretrained(model_id)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/models/auto/processing
+_auto.py", line 300, in from_pretrained
+    config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/processing_utils.py",
+line 944, in get_processor_dict
+    resolved_raw_chat_template_file = cached_file(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 32
+1, in cached_file
+    file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 47
+8, in cached_files
+    hf_hub_download(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p
+y", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
+line 1010, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
+line 1073, in _hf_hub_download_to_cache_dir
+    (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_err
+or(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
+line 1546, in _get_metadata_or_catch_error
+    metadata = get_hf_file_metadata(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p
+y", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
+line 1463, in get_hf_file_metadata
+    r = _request_wrapper(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
+line 286, in _request_wrapper
+    response = _request_wrapper(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
+line 309, in _request_wrapper
+    response = http_backoff(method=method, url=url, **params, retry_on_exceptions=(), retry_on_status_codes=(429,))
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li
+ne 310, in http_backoff
+    response = session.request(method=method, url=url, **kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 589, in
+ request
+    resp = self.send(prep, **send_kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 703, in
+ send
+    r = adapter.send(request, **kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li
+ne 96, in send
+    return super().send(request, *args, **kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/adapters.py", line 644, in
+ send
+    resp = conn.urlopen(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 78
+7, in urlopen
+    response = self._make_request(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 53
+4, in _make_request
+    response = conn.getresponse()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connection.py", line 565, i
+n getresponse
+    httplib_response = super().getresponse()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 1375, in getresponse
+    response.begin()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 318, in begin
+    version, status, reason = self._read_status()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 279, in _read_status
+    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/socket.py", line 717, in readinto
+    return self._sock.recv_into(b)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1307, in recv_into
+    return self.read(nbytes, buffer)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1163, in read
+    return self._sslobj.read(len, buffer)
+KeyboardInterrupt
+clea
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 15:50:52 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 15:50:52 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 15:50:52 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 15:50:52 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 15:50:52 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 15:50:52 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 15:50:52 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 67057.8
+5it/s]
+Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5343.9
+4it/s]
+INFO 2025-09-09 15:50:57 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 47393.2
+7it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3797.4
+7it/s]
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44384.1
+7it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 6533.1
+8it/s]
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-09 15:51:30 ts/train.py:168 Creating optimizer and scheduler
+INFO 2025-09-09 15:51:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
+smolvla_lr1e-4bs32steps100000
+INFO 2025-09-09 15:51:30 ts/train.py:182 cfg.env.task='libero_spatial'
+INFO 2025-09-09 15:51:30 ts/train.py:183 cfg.steps=100000 (100K)
+INFO 2025-09-09 15:51:30 ts/train.py:184 dataset.num_frames=273465 (273K)
+INFO 2025-09-09 15:51:30 ts/train.py:185 dataset.num_episodes=1693
+INFO 2025-09-09 15:51:30 ts/train.py:186 num_learnable_params=49103712 (49M)
+INFO 2025-09-09 15:51:30 ts/train.py:187 num_total_params=399268924 (399M)
+INFO 2025-09-09 15:51:30 ts/train.py:225 Start offline training on a fixed dataset
+> /home/jade_choghari/lerobot/src/lerobot/scripts/train.py(230)train()
+-> train_tracker.dataloading_s = time.perf_counter() - start_time
+(Pdb) batch.keys()
+dict_keys(['image', 'wrist_image', 'state', 'actions', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_i
+ndex', 'task'])
+(Pdb) policy.config.input_features
+{'image': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type
+=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256))}
+(Pdb) quit()
+Traceback (most recent call last):
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in <module>
+    main()
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main
+    train()
+  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+    response = fn(cfg, *args, **kwargs)
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train
+    train_tracker.dataloading_s = time.perf_counter() - start_time
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train
+    train_tracker.dataloading_s = time.perf_counter() - start_time
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
+    return self.dispatch_line(frame)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
+    if self.quitting: raise BdbQuit
+bdb.BdbQuit
+clear
+^[[A(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 15:53:49 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 15:53:49 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 15:53:49 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 15:53:49 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 15:53:49 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 15:53:49 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 15:53:49 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 34701.4
+4it/s]
+Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5495.3
+7it/s]
+INFO 2025-09-09 15:53:55 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 41943.0
+4it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5500.7
+3it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2361.6
+6it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5041.2
+3it/s]
+Reducing the number of VLM layers to 16 ...
+> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy()
+-> assert isinstance(policy, nn.Module)
+(Pdb) features
+{'image': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type
+=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'actions': PolicyFeature(type=<FeatureType.ACTION: 'ACTION'>,
+ shape=(7,))}
+(Pdb) ds_meta.features
+{'image': {'dtype': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'wrist_image': {'dtyp
+e': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'state': {'dtype': 'float32', 'shape'
+: (8,), 'names': ['state']}, 'actions': {'dtype': 'float32', 'shape': (7,), 'names': ['actions']}, 'timestamp': {'dt
+ype': 'float32', 'shape': (1,), 'names': None}, 'frame_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'ep
+isode_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'index': {'dtype': 'int64', 'shape': (1,), 'names':
+None}, 'task_index': {'dtype': 'int64', 'shape': (1,), 'names': None}}
+(Pdb) quit()
+
+Traceback (most recent call last):
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in <module>
+    main()
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main
+    train()
+  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+    response = fn(cfg, *args, **kwargs)
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train
+    policy = make_policy(
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
+    assert isinstance(policy, nn.Module)
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
+    assert isinstance(policy, nn.Module)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
+    return self.dispatch_line(frame)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
+    if self.quitting: raise BdbQuit
+bdb.BdbQuit
+clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 15:56:35 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 15:56:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 15:56:35 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 15:56:35 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 15:56:35 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 15:56:35 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 15:56:35 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 78132.9
+5it/s]
+Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 4716.0
+3it/s]
+INFO 2025-09-09 15:56:40 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5259.3
+2it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3477.8
+6it/s]
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 45343.8
+3it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5551.6
+9it/s]
+Reducing the number of VLM layers to 16 ...
+> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy()
+-> assert isinstance(policy, nn.Module)
+(Pdb) features
+{'image': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type
+=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'state': PolicyFeature(type=<FeatureType.STATE: 'STATE'>, sha
+pe=(8,)), 'actions': PolicyFeature(type=<FeatureType.ACTION: 'ACTION'>, shape=(7,))}
+(Pdb) quit()
+Traceback (most recent call last):
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in <module>
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main
+
+  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+    response = fn(cfg, *args, **kwargs)
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train
+    policy = make_policy(
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
+    # policy = torch.compile(policy, mode="reduce-overhead")
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
+    # policy = torch.compile(policy, mode="reduce-overhead")
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
+    return self.dispatch_line(frame)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
+    if self.quitting: raise BdbQuit
+bdb.BdbQuit
+clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 15:58:35 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 15:58:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 15:58:35 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 15:58:35 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 15:58:35 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 15:58:35 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 15:58:35 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 27666.4
+6it/s]
+Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5305.7
+0it/s]
+INFO 2025-09-09 15:58:41 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44384.1
+7it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3192.0
+1it/s]
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44620.2
+6it/s]
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 42799.0
+2it/s]
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-09 15:59:13 ts/train.py:168 Creating optimizer and scheduler
+INFO 2025-09-09 15:59:13 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
+smolvla_lr1e-4bs32steps100000
+INFO 2025-09-09 15:59:13 ts/train.py:182 cfg.env.task='libero_spatial'
+INFO 2025-09-09 15:59:13 ts/train.py:183 cfg.steps=100000 (100K)
+INFO 2025-09-09 15:59:13 ts/train.py:184 dataset.num_frames=273465 (273K)
+INFO 2025-09-09 15:59:13 ts/train.py:185 dataset.num_episodes=1693
+INFO 2025-09-09 15:59:13 ts/train.py:186 num_learnable_params=49103712 (49M)
+INFO 2025-09-09 15:59:13 ts/train.py:187 num_total_params=399268940 (399M)
+INFO 2025-09-09 15:59:13 ts/train.py:225 Start offline training on a fixed dataset
+Traceback (most recent call last):
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
+    main()
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
+    train()
+  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+    response = fn(cfg, *args, **kwargs)
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
+    train_tracker, output_dict = update_policy(
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy
+    loss, output_dict = policy.forward(batch)
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 458, in forward
+    actions = self.prepare_action(batch)
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 580, in prepare_action
+    actions = pad_vector(batch[ACTION], self.config.max_action_dim)
+KeyError: 'action'
+Exception in thread Thread-3 (_pin_memory_loop):
+Traceback (most recent call last):
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
+    self.run()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 953, in run
+    self._target(*self._args, **self._kwargs)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.
+py", line 61, in _pin_memory_loop
+    do_one_step()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.
+py", line 37, in do_one_step
+    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/queues.py", line 122, in get
+    return _ForkingPickler.loads(res)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/multiprocessing/reductions.py
+", line 541, in rebuild_storage_fd
+    fd = df.detach()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in
+detach
+    with _resource_sharer.get_connection(self._id) as conn:
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in
+get_connection
+    c = Client(address, authkey=process.current_process().authkey)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 508, in Clie
+nt
+    answer_challenge(c, authkey)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 752, in answ
+er_challenge
+    message = connection.recv_bytes(256)         # reject large message
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 216, in recv
+_bytes
+    buf = self._recv_bytes(maxlength)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 414, in _rec
+v_bytes
+    buf = self._recv(4)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 379, in _rec
+v
+    chunk = read(handle, remaining)
+ConnectionResetError: [Errno 104] Connection reset by peer
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 15:59:53 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 15:59:53 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 15:59:53 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 15:59:53 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 15:59:53 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 15:59:53 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 15:59:53 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 72147.3
+3it/s]
+Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5076.7
+1it/s]
+INFO 2025-09-09 15:59:58 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 6096.3
+7it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 4348.6
+8it/s]
+Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 46091.2
+5it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3225.1
+5it/s]
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-09 16:00:31 ts/train.py:168 Creating optimizer and scheduler
+INFO 2025-09-09 16:00:31 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
+smolvla_lr1e-4bs32steps100000
+INFO 2025-09-09 16:00:31 ts/train.py:182 cfg.env.task='libero_spatial'
+INFO 2025-09-09 16:00:31 ts/train.py:183 cfg.steps=100000 (100K)
+INFO 2025-09-09 16:00:31 ts/train.py:184 dataset.num_frames=273465 (273K)
+INFO 2025-09-09 16:00:31 ts/train.py:185 dataset.num_episodes=1693
+INFO 2025-09-09 16:00:31 ts/train.py:186 num_learnable_params=49103712 (49M)
+INFO 2025-09-09 16:00:31 ts/train.py:187 num_total_params=399268940 (399M)
+INFO 2025-09-09 16:00:31 ts/train.py:225 Start offline training on a fixed dataset
+Traceback (most recent call last):
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
+    main()
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
+    train()
+  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+    response = fn(cfg, *args, **kwargs)
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
+    train_tracker, output_dict = update_policy(
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy
+    loss, output_dict = policy.forward(batch)
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 461, in forward
+    losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 850, in forward
+    att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
+  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 226, in make_att_2d_mask
+s
+    att_2d_masks = att_2d_masks & pad_2d_masks
+RuntimeError: The size of tensor a (199) must match the size of tensor b (181) at non-singleton dimension 2
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 16:10:03 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 16:10:03 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 16:10:03 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 16:10:03 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 16:10:03 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 16:10:03 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 16:10:03 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 54574.89it/s]
+Loading dataset shards: 100%|████████████████████████████████████| 70/70 [00:00<00:00, 7567.63it/s]
+INFO 2025-09-09 16:10:09 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 40721.40it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 7516.67it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 3158.36it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 6775.94it/s]
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-09 16:10:41 ts/train.py:168 Creating optimizer and scheduler
+INFO 2025-09-09 16:10:41 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
+smolvla_lr1e-4bs32steps100000
+INFO 2025-09-09 16:10:41 ts/train.py:182 cfg.env.task='libero_spatial'
+INFO 2025-09-09 16:10:41 ts/train.py:183 cfg.steps=100000 (100K)
+INFO 2025-09-09 16:10:41 ts/train.py:184 dataset.num_frames=273465 (273K)
+INFO 2025-09-09 16:10:41 ts/train.py:185 dataset.num_episodes=1693
+INFO 2025-09-09 16:10:41 ts/train.py:186 num_learnable_params=49103712 (49M)
+INFO 2025-09-09 16:10:41 ts/train.py:187 num_total_params=399268940 (399M)
+INFO 2025-09-09 16:10:41 ts/train.py:225 Start offline training on a fixed dataset
+Traceback (most recent call last):
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
+    main()
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
+    train()
+  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+    response = fn(cfg, *args, **kwargs)
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
+    train_tracker, output_dict = update_policy(
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy
+    grad_scaler.unscale_(optimizer)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342
+, in unscale_
+    optimizer_state["found_inf_per_device"] = self._unscale_grads_(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283
+, in _unscale_grads_
+    torch._amp_foreach_non_finite_check_and_unscale_(
+RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16'
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 16:12:28 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 16:12:28 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 16:12:28 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 16:12:28 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 16:12:28 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 16:12:28 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 16:12:28 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 87666.13it/s]
+Loading dataset shards: 100%|████████████████████████████████████| 70/70 [00:00<00:00, 4223.20it/s]
+INFO 2025-09-09 16:12:34 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 43690.67it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 4871.43it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 6512.89it/s]
+Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 43018.50it/s]
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-09 16:13:06 ts/train.py:168 Creating optimizer and scheduler
+INFO 2025-09-09 16:13:06 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
+smolvla_lr1e-4bs32steps100000
+INFO 2025-09-09 16:13:06 ts/train.py:182 cfg.env.task='libero_spatial'
+INFO 2025-09-09 16:13:06 ts/train.py:183 cfg.steps=100000 (100K)
+INFO 2025-09-09 16:13:06 ts/train.py:184 dataset.num_frames=273465 (273K)
+INFO 2025-09-09 16:13:06 ts/train.py:185 dataset.num_episodes=1693
+INFO 2025-09-09 16:13:06 ts/train.py:186 num_learnable_params=49103712 (49M)
+INFO 2025-09-09 16:13:06 ts/train.py:187 num_total_params=399268940 (399M)
+INFO 2025-09-09 16:13:06 ts/train.py:225 Start offline training on a fixed dataset
+Traceback (most recent call last):
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
+    main()
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
+    train()
+  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+    response = fn(cfg, *args, **kwargs)
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
+    train_tracker, output_dict = update_policy(
+  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy
+    grad_scaler.unscale_(optimizer)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342
+, in unscale_
+    optimizer_state["found_inf_per_device"] = self._unscale_grads_(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283
+, in _unscale_grads_
+    torch._amp_foreach_non_finite_check_and_unscale_(
+RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16'
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+INFO 2025-09-09 16:13:51 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-09 16:13:51 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-09 16:13:51 ts/train.py:137 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'physical-intelligence/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.5,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': 0,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': False,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-09 16:13:51 ts/train.py:143 Logs will be saved locally.
+INFO 2025-09-09 16:13:51 ts/train.py:153 Creating dataset
+WARNING 2025-09-09 16:13:51 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+WARNING 2025-09-09 16:13:51 ts/utils.py:302
+The dataset you requested (physical-intelligence/libero) is in 2.0 format.
+While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
+stats instead of per-episode stats. Update your dataset stats to the new format using this command:
+```
+python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
+```
+
+If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
+or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
+
+Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 82981.28it/s]
+Loading dataset shards: 100%|████████████████████████████████████| 70/70 [00:00<00:00, 4687.94it/s]
+INFO 2025-09-09 16:13:57 ts/train.py:163 Creating policy
+Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 21345.06it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 4226.00it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 2966.27it/s]
+Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 6497.76it/s]
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-09 16:14:30 ts/train.py:168 Creating optimizer and scheduler
+INFO 2025-09-09 16:14:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
+smolvla_lr1e-4bs32steps100000
+INFO 2025-09-09 16:14:30 ts/train.py:182 cfg.env.task='libero_spatial'
+INFO 2025-09-09 16:14:30 ts/train.py:183 cfg.steps=100000 (100K)
+INFO 2025-09-09 16:14:30 ts/train.py:184 dataset.num_frames=273465 (273K)
+INFO 2025-09-09 16:14:30 ts/train.py:185 dataset.num_episodes=1693
+INFO 2025-09-09 16:14:30 ts/train.py:186 num_learnable_params=49103712 (49M)
+INFO 2025-09-09 16:14:30 ts/train.py:187 num_total_params=399268940 (399M)
+INFO 2025-09-09 16:14:30 ts/train.py:225 Start offline training on a fixed dataset
+INFO 2025-09-09 16:16:20 ts/train.py:255 step:200 smpl:6K ep:40 epch:0.02 loss:1.244 grdn:2.492 lr:1.0e-05 updt_s:0.
+536 data_s:0.007
+INFO 2025-09-09 16:17:56 ts/train.py:255 step:400 smpl:13K ep:79 epch:0.05 loss:0.685 grdn:4.262 lr:3.0e-05 updt_s:0
+.481 data_s:0.000
+INFO 2025-09-09 16:19:33 ts/train.py:255 step:600 smpl:19K ep:119 epch:0.07 loss:0.364 grdn:4.849 lr:5.0e-05 updt_s:
+0.482 data_s:0.000
+INFO 2025-09-09 16:21:10 ts/train.py:255 step:800 smpl:26K ep:158 epch:0.09 loss:0.239 grdn:4.024 lr:7.0e-05 updt_s:
+0.481 data_s:0.000
+INFO 2025-09-09 16:22:46 ts/train.py:255 step:1K smpl:32K ep:198 epch:0.12 loss:0.197 grdn:3.267 lr:9.0e-05 updt_s:0
+.478 data_s:0.000
+INFO 2025-09-09 16:24:22 ts/train.py:255 step:1K smpl:38K ep:238 epch:0.14 loss:0.173 grdn:2.319 lr:1.0e-04 updt_s:0
+.481 data_s:0.000
+INFO 2025-09-09 16:25:59 ts/train.py:255 step:1K smpl:45K ep:277 epch:0.16 loss:0.153 grdn:1.741 lr:1.0e-04 updt_s:0
+.483 data_s:0.000
+INFO 2025-09-09 16:27:36 ts/train.py:255 step:2K smpl:51K ep:317 epch:0.19 loss:0.135 grdn:1.354 lr:9.9e-05 updt_s:0
+.483 data_s:0.000
+INFO 2025-09-09 16:29:14 ts/train.py:255 step:2K smpl:58K ep:357 epch:0.21 loss:0.126 grdn:1.177 lr:9.9e-05 updt_s:0
+.484 data_s:0.000
+
diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py
index f5fa727cf..75863d3fc 100644
--- a/src/lerobot/configs/policies.py
+++ b/src/lerobot/configs/policies.py
@@ -62,6 +62,7 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
     # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
     # automatic gradient scaling is used.
     use_amp: bool = False
+    gradient_accumulation_steps: int = 1
 
     push_to_hub: bool = True
     repo_id: str | None = None
diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index a869cb920..6509993bb 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -472,7 +472,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
             episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes]
             self.stats = aggregate_stats(episodes_stats)
 
-        # Load actual data
         try:
             if force_cache_sync:
                 raise FileNotFoundError
@@ -598,6 +597,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         """hf_dataset contains all the observations, states, actions, rewards, etc."""
         if self.episodes is None:
             path = str(self.root / "data")
+            # added by jade
             hf_dataset = load_dataset("parquet", data_dir=path, split="train")
         else:
             files = [str(self.root / self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes]
diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py
index 078c5351d..daa1de163 100644
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -455,7 +455,8 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea
                 shape = (shape[2], shape[0], shape[1])
         elif key == "observation.environment_state":
             type = FeatureType.ENV
-        elif key.startswith("observation"):
+        # changed by jade
+        elif key.startswith("observation") or key.startswith("state"):
             type = FeatureType.STATE
         elif key.startswith("action"):
             type = FeatureType.ACTION
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index 4b8eeffd1..79461d3a9 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -34,6 +34,7 @@ from lerobot.policies.sac.reward_model.configuration_classifier import RewardCla
 from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
 from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
 from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
+from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
 
 
 def get_policy_class(name: str) -> PreTrainedPolicy:
@@ -74,6 +75,10 @@ def get_policy_class(name: str) -> PreTrainedPolicy:
         from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
 
         return SmolVLAPolicy
+    elif name == "smolpi0":
+        from lerobot.policies.smolpi0.modeling_smolpi0 import SMOLPI0Policy
+
+        return SMOLPI0Policy
     else:
         raise NotImplementedError(f"Policy with name {name} is not implemented.")
 
@@ -97,6 +102,8 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
         return SmolVLAConfig(**kwargs)
     elif policy_type == "reward_classifier":
         return RewardClassifierConfig(**kwargs)
+    elif policy_type == "smolpi0":
+        return SMOLPI0Config(**kwargs)
     else:
         raise ValueError(f"Policy type '{policy_type}' is not available.")
 
@@ -170,7 +177,6 @@ def make_policy(
         policy = policy_cls(**kwargs)
     policy.to(cfg.device)
     assert isinstance(policy, nn.Module)
-
     # policy = torch.compile(policy, mode="reduce-overhead")
 
     return policy
diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py
index 119055873..043265b1b 100644
--- a/src/lerobot/policies/normalize.py
+++ b/src/lerobot/policies/normalize.py
@@ -255,6 +255,83 @@ class Unnormalize(nn.Module):
         return batch
 
 
+class NormalizePerRobotType(nn.Module):
+    """Normalizes data (e.g. "observation.image") for more stable and faster convergence during training."""
+
+    def __init__(
+        self,
+        features: dict[str, PolicyFeature],
+        norm_map: dict[str, NormalizationMode],
+        stats: dict[str, dict[str, Tensor]] | None = None,
+    ):
+        """
+        Args:
+            shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values
+            are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing
+            mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape
+            is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format.
+            modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values
+                are their normalization modes among:
+                    - "mean_std": subtract the mean and divide by standard deviation.
+                    - "min_max": map to [-1, 1] range.
+            stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image")
+                and values are dictionaries of statistic types and their values (e.g.
+                `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for
+                training the model for the first time, these statistics will overwrite the default buffers. If
+                not provided, as expected for finetuning or evaluation, the default buffers should to be
+                overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the
+                dataset is not needed to get the stats, since they are already in the policy state_dict.
+        """
+        super().__init__()
+        self.features = features
+        self.norm_map = norm_map
+        for robot_type in stats.keys():
+            stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type])
+            for key, buffer in stats_buffers.items():
+                setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer)
+
+    # TODO(rcadene): should we remove torch.no_grad?
+    @torch.no_grad
+    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
+        batch = dict(batch)  # shallow copy avoids mutating the input batch
+        assert "robot_type" in batch, "robot_type is not in the batch"
+        robot_types = batch["robot_type"]
+
+        for key, ft in self.features.items():
+            if key not in batch:
+                continue
+
+            norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY)
+            if norm_mode is NormalizationMode.IDENTITY:
+                continue
+            # FIXME(mshukor): make it more efficient
+            buffers = [
+                getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types
+            ]
+            if norm_mode is NormalizationMode.MEAN_STD:
+                mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))],dim=0)
+                std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))],dim=0)
+                if batch[key].ndim == 3:
+                    mean = mean.unsqueeze(1)
+                    std = std.unsqueeze(1)
+                assert not torch.isinf(mean).any(), _no_stats_error_str("mean")
+                assert not torch.isinf(std).any(), _no_stats_error_str("std")
+                batch[key] = (batch[key] - mean) / (std + 1e-8)
+            elif norm_mode is NormalizationMode.MIN_MAX:
+                min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0)
+                max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0)
+                assert not torch.isinf(min).any(), _no_stats_error_str("min")
+                assert not torch.isinf(max).any(), _no_stats_error_str("max")
+                if batch[key].ndim == 3:
+                    min = min.unsqueeze(1)
+                    max = max.unsqueeze(1)
+                # normalize to [0,1]
+                batch[key] = (batch[key] - min) / (max - min + 1e-8)
+                # normalize to [-1, 1]
+                batch[key] = batch[key] * 2 - 1
+            else:
+                raise ValueError(norm_mode)
+        return batch
 # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization
 #       and remove the `Normalize` and `Unnormalize` classes.
 def _initialize_stats_buffers(
@@ -418,3 +495,87 @@ class UnnormalizeBuffer(nn.Module):
             raise ValueError(norm_mode)
 
         return batch
+
+
+class UnnormalizePerRobotType(nn.Module):
+    """
+    Similar to `Normalize` but unnormalizes output data (e.g. `{"action": torch.randn(b,c)}`) in their
+    original range used by the environment.
+    """
+
+    def __init__(
+        self,
+        features: dict[str, PolicyFeature],
+        norm_map: dict[str, NormalizationMode],
+        stats: dict[str, dict[str, Tensor]] | None = None,
+    ):
+        """
+        Args:
+            shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values
+            are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing
+            mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape
+            is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format.
+            modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values
+                are their normalization modes among:
+                    - "mean_std": subtract the mean and divide by standard deviation.
+                    - "min_max": map to [-1, 1] range.
+            stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image")
+                and values are dictionaries of statistic types and their values (e.g.
+                `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for
+                training the model for the first time, these statistics will overwrite the default buffers. If
+                not provided, as expected for finetuning or evaluation, the default buffers should to be
+                overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the
+                dataset is not needed to get the stats, since they are already in the policy state_dict.
+        """
+        super().__init__()
+        self.features = features
+        self.norm_map = norm_map
+        self.stats = stats
+        # `self.buffer_observation_state["mean"]` contains `torch.tensor(state_dim)`
+        for robot_type in stats.keys():
+            stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type])
+            for key, buffer in stats_buffers.items():
+                setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer)
+
+    # TODO(rcadene): should we remove torch.no_grad?
+    @torch.no_grad
+    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
+        batch = dict(batch)  # shallow copy avoids mutating the input batch
+        assert "robot_type" in batch, "robot_type is not in the batch"
+        robot_types = batch["robot_type"]
+
+        for key, ft in self.features.items():
+            if key not in batch:
+                continue
+
+            norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY)
+            if norm_mode is NormalizationMode.IDENTITY:
+                continue
+
+            # buffer = getattr(self, "buffer_" + key.replace(".", "_"))
+            buffers = [
+                getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types
+            ]
+
+            if norm_mode is NormalizationMode.MEAN_STD:
+                mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0)
+                std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0)
+                assert not torch.isinf(mean).any(), _no_stats_error_str("mean")
+                assert not torch.isinf(std).any(), _no_stats_error_str("std")
+                if batch[key].ndim == 3:
+                    mean = mean.unsqueeze(1)
+                    std = std.unsqueeze(1)
+                batch[key] = batch[key] * std + mean
+            elif norm_mode is NormalizationMode.MIN_MAX:
+                min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0)
+                max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0)
+                assert not torch.isinf(min).any(), _no_stats_error_str("min")
+                assert not torch.isinf(max).any(), _no_stats_error_str("max")
+                if batch[key].ndim == 3:
+                    min = min.unsqueeze(1)
+                    max = max.unsqueeze(1)
+                batch[key] = (batch[key] + 1) / 2
+                batch[key] = batch[key] * (max - min) + min
+            else:
+                raise ValueError(norm_mode)
+        return batch
diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py
new file mode 100644
index 000000000..c3605cd82
--- /dev/null
+++ b/src/lerobot/policies/smolpi0/configuration_smolpi0.py
@@ -0,0 +1,210 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+
+from lerobot.optim.optimizers import AdamWConfig
+from lerobot.optim.schedulers import (
+    CosineDecayWithWarmupSchedulerConfig,
+)
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
+
+
+@dataclass
+class PEFTConfig:
+    r: int = 4
+    lora_alpha: int = 16
+    lora_dropout: float = 0.1
+    target_modules: str = "q_proj,v_proj"
+
+
+@PreTrainedConfig.register_subclass("smolpi0")
+@dataclass
+class SMOLPI0Config(PreTrainedConfig):
+    # Input / output structure.
+    n_obs_steps: int = 1
+    chunk_size: int = 50
+    n_action_steps: int = 50
+    n_obs_gap: int = 1
+
+    normalization_mapping: dict[str, NormalizationMode] = field(
+        default_factory=lambda: {
+            "VISUAL": NormalizationMode.IDENTITY,
+            "STATE": NormalizationMode.MEAN_STD,
+            "ACTION": NormalizationMode.MEAN_STD,
+        }
+    )
+
+    # Shorter state and action vectors will be padded
+    max_state_dim: int = 32
+    max_action_dim: int = 32
+
+    # Image preprocessing
+    resize_imgs_with_padding: tuple[int, int] = (512, 512) #(224, 224)
+
+    # Add empty images. Used by pi0_aloha_sim which adds the empty
+    # left and right wrist cameras in addition to the top camera.
+    empty_cameras: int = 0
+
+    # Converts the joint and gripper values from the standard Aloha space to
+    # the space used by the pi internal runtime which was used to train the base model.
+    adapt_to_pi_aloha: bool = False
+
+    # Converts joint dimensions to deltas with respect to the current state before passing to the model.
+    # Gripper dimensions will remain in absolute values.
+    use_delta_joint_actions_aloha: bool = False
+
+    # Tokenizer
+    tokenizer_max_length: int = 48
+
+    # Projector
+    proj_width: int = 480
+
+    # Decoding
+    num_steps: int = 10
+
+    # Attention utils
+    use_cache: bool = True
+    attention_implementation: str = "eager"  # or fa2, flex
+
+    # Finetuning settings
+    freeze_vision_encoder: bool = True
+    train_expert_only: bool = False
+    train_state_proj: bool = True
+
+    # Training presets
+    optimizer_lr: float = 2.5e-5
+    optimizer_betas: tuple[float, float] = (0.9, 0.95)
+    optimizer_eps: float = 1e-8
+    optimizer_weight_decay: float = 1e-10
+    optimizer_grad_clip_norm: float = 10
+    optimizer_lr_vlm: float = 0
+
+    scheduler_warmup_steps: int = 1_000
+    scheduler_decay_steps: int = 30_000
+    scheduler_decay_lr: float = 2.5e-6
+
+    # TODO: Add EMA
+    vlm_model_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+    checkpoint_path: str = None
+    load_vlm_weights: bool = False
+
+    peft_method: str = ""
+    peft_config: PEFTConfig = PEFTConfig()
+    peft_target_model: str = ""
+
+    add_image_special_tokens: bool = False
+    add_prompt_template: bool = False
+    prefix_prompt_template: str = f"<|im_start|>User: What action should the robot take to"
+    suffix_prompt_template: str = f"?\nAssistant:"
+
+    attention_mode: str = "self_attn"
+
+    prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length
+
+    past_obs_keys: str = f"image"
+
+    add_local_special_image_tokens: bool = False
+
+    reverse_images_order: bool = False
+
+    state_to_prefix: bool = False
+
+    pad_language_to: str = "longest" # "max_length"
+
+    num_expert_layers: int = -1
+    num_vlm_layers: int = -1
+
+    causal_action_attention_mask: bool = False
+
+    self_attn_every_n_layers: int = -1
+
+    expert_width_multiplier: float = 0.5
+
+    robot_type: str = ""
+
+    self_attn_only_actions: bool = False
+
+    causal_attention_on_history: bool = False
+
+    predict_relative_actions: bool = False
+    relative_actions_mode: str = "first"
+
+    shuffle_camera_positions: bool = False
+    vlm_img_size: int = -1
+    
+    regression_loss: bool = False
+    
+    def __post_init__(self):
+        super().__post_init__()
+        if self.vlm_img_size > 0:
+            self.resize_imgs_with_padding = (self.vlm_img_size, self.vlm_img_size)
+        """Input validation (not exhaustive)."""
+        if self.n_action_steps > self.chunk_size:
+            raise ValueError(
+                f"The chunk size is the upper bound for the number of action steps per model invocation. Got "
+                f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`."
+            )
+        # if self.n_obs_steps != 1:
+        #     raise ValueError(
+        #         f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`"
+        #     )
+
+        if self.use_delta_joint_actions_aloha:
+            raise NotImplementedError(
+                "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot."
+            )
+
+    def validate_features(self) -> None:
+        # TODO: implement value error
+        # if not self.image_features and not self.env_state_feature:
+        #     raise ValueError("You must provide at least one image or the environment state among the inputs.")
+
+        for i in range(self.empty_cameras):
+            key = f"observation.images.empty_camera_{i}"
+            empty_camera = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(3, 480, 640),
+            )
+            self.input_features[key] = empty_camera
+
+    def get_optimizer_preset(self) -> AdamWConfig:
+        return AdamWConfig(
+            lr=self.optimizer_lr,
+            betas=self.optimizer_betas,
+            eps=self.optimizer_eps,
+            weight_decay=self.optimizer_weight_decay,
+            grad_clip_norm=self.optimizer_grad_clip_norm,
+        )
+
+    def get_scheduler_preset(self):
+        return CosineDecayWithWarmupSchedulerConfig(
+            peak_lr=self.optimizer_lr,
+            decay_lr=self.scheduler_decay_lr,
+            num_warmup_steps=self.scheduler_warmup_steps,
+            num_decay_steps=self.scheduler_decay_steps,
+        )
+
+    @property
+    def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations
+        return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, self.n_obs_gap)][::-1]
+
+    @property
+    def action_delta_indices(self) -> list:
+        return list(range(self.chunk_size))
+
+    @property
+    def reward_delta_indices(self) -> None:
+        return None
diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py
new file mode 100644
index 000000000..13950f743
--- /dev/null
+++ b/src/lerobot/policies/smolpi0/flex_attention.py
@@ -0,0 +1,145 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F  # noqa: N812
+from packaging.version import Version
+
+if Version(torch.__version__) > Version("2.5.0"):
+    # Ffex attention is only available from torch 2.5 onwards
+    from torch.nn.attention.flex_attention import (
+        _mask_mod_signature,
+        _round_up_to_multiple,
+        create_block_mask,
+        create_mask,
+        flex_attention,
+    )
+
+
+@torch.compile(dynamic=False)
+def flex_attention_forward(
+    attention_mask: torch.Tensor,
+    batch_size: int,
+    head_dim: int,
+    query_states: torch.Tensor,
+    key_states: torch.Tensor,
+    value_states: torch.Tensor,
+    scaling=None,
+    num_att_heads: int = 8,
+    num_key_value_heads: int = 1,
+):
+    """
+    This is defined out of classes to make compile happy.
+    """
+
+    original_dtype = query_states.dtype
+    num_key_value_groups = num_att_heads // num_key_value_heads
+    key_states = key_states[:, :, :, None, :]
+    key_states = key_states.expand(
+        batch_size, key_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim
+    )
+    key_states = key_states.reshape(
+        batch_size, key_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim
+    )
+
+    value_states = value_states[:, :, :, None, :]
+    value_states = value_states.expand(
+        batch_size, value_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim
+    )
+    value_states = value_states.reshape(
+        batch_size, value_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim
+    )
+
+    query_states = query_states.transpose(1, 2)
+    key_states = key_states.transpose(1, 2)
+    value_states = value_states.transpose(1, 2)
+
+    # query_states = query_states.to(torch.float32)
+    # key_states = key_states.to(torch.float32)
+    # value_states = value_states.to(torch.float32)
+
+    causal_mask = attention_mask
+    if causal_mask is not None:
+        causal_mask = causal_mask[:, None, :, : key_states.shape[2]]
+
+        if causal_mask.shape[1] == 1 and query_states.shape[1] > 1:
+            causal_mask = causal_mask.expand(-1, query_states.shape[1], -1, -1)
+
+    def precomputed_mask_factory(precomputed_mask: torch.Tensor) -> _mask_mod_signature:
+        def mask_mod(b, h, q_idx, kv_idx):
+            # Danger zone: if b,h,q_idx,kv_idx exceed the shape, device-side assert occurs.
+            return precomputed_mask[b][h][q_idx][kv_idx]
+
+        return mask_mod
+
+    b_mask, h_mask, q_len, kv_len = causal_mask.shape  # The shape of your mask
+
+    block_size = 128 # limitation of flex attention
+    q_len_rounded = _round_up_to_multiple(q_len, block_size)
+    kv_len_rounded = _round_up_to_multiple(kv_len, block_size)
+
+    # *CRITICAL* we do need to expand here, else we get a CUDA index error
+
+    pad_q = q_len_rounded - q_len
+    pad_k = kv_len_rounded - kv_len
+    if pad_q > 0 or pad_k > 0:
+        padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0)
+    else: 
+        padded_causal_mask = causal_mask
+    mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask)
+
+    mask_4d = create_mask(
+        mod_fn=mask_mod_fn_orig,
+        B=b_mask,
+        H=h_mask,
+        Q_LEN=q_len_rounded,
+        KV_LEN=kv_len_rounded,
+        device=causal_mask.device,
+    )
+    
+    mask_mod_fn_padded = precomputed_mask_factory(mask_4d)
+    # FIXME(mshukor): compile mask torch.compile(create_block_mask)
+    create_block_mask_compiled = torch.compile(create_block_mask)
+    block_mask = create_block_mask_compiled(
+        mask_mod=mask_mod_fn_padded,
+        B=b_mask,
+        H=None, # 
+        Q_LEN=q_len_rounded,
+        KV_LEN=kv_len_rounded,
+        BLOCK_SIZE=block_size,
+        device=causal_mask.device,
+        _compile=False,
+    )
+    padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0  else query_states
+    padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states
+    padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states
+    #  mask is applied inside the kernel, ideally more efficiently than score_mod.
+    attn_output, attention_weights = flex_attention(
+        padded_query_states,
+        padded_key_states,
+        padded_value_states,
+        block_mask=block_mask,
+        enable_gqa=True,  # because we shaped query/key states for GQA
+        scale=head_dim**-0.5 if scaling is None else scaling,
+        return_lse=True,
+    )
+
+    attn_output = attn_output.to(dtype=original_dtype)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # [B, Q_LEN, H, head_dim]
+    attn_output = attn_output.reshape(
+        batch_size,
+        -1,
+        attn_output.shape[2] * attn_output.shape[3],  # merges [H, head_dim]
+    )
+    return attn_output[:, :-pad_k, :] if pad_k > 0 else attn_output
diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py
new file mode 100644
index 000000000..fa2d3d5a7
--- /dev/null
+++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py
@@ -0,0 +1,1021 @@
+#!/usr/bin/env python
+
+# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+π0: A Vision-Language-Action Flow Model for General Robot Control
+
+[Paper](https://www.physicalintelligence.company/download/pi0.pdf)
+[Jax code](https://github.com/Physical-Intelligence/openpi)
+
+Designed by Physical Intelligence. Ported from Jax by Hugging Face.
+
+Install pi0 extra dependencies:
+```bash
+pip install -e ".[pi0]"
+```
+
+Example of finetuning the pi0 pretrained model (`pi0_base` in `openpi`):
+```bash
+python lerobot/scripts/train.py \
+--policy.path=lerobot/pi0 \
+--dataset.repo_id=danaaubakirova/koch_test
+```
+
+Example of finetuning the pi0 neural network with PaliGemma and expert Gemma
+pretrained with VLM default parameters before pi0 finetuning:
+```bash
+python lerobot/scripts/train.py \
+--policy.type=pi0 \
+--dataset.repo_id=danaaubakirova/koch_test
+```
+
+Example of using the pi0 pretrained model outside LeRobot training framework:
+```python
+policy = Pi0Policy.from_pretrained("lerobot/pi0")
+```
+
+"""
+
+import math
+from collections import deque
+
+import torch
+import torch.nn.functional as F  # noqa: N812
+from torch import Tensor, nn
+from transformers import AutoProcessor
+
+from lerobot.constants import ACTION, OBS_STATE
+from lerobot.policies.normalize import (
+    Normalize,
+    NormalizePerRobotType,
+    Unnormalize,
+    UnnormalizePerRobotType,
+)
+from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
+from lerobot.policies.smolpi0.smolvlm_with_expert import (
+    SmolVLMWithExpertModel
+)
+from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.utils.utils import get_safe_dtype
+OBS_IMAGE = "observation.image"
+OBS_IMAGES = "observation.images"
+ACTION = "action"
+OBS_IMAGE_2 = "observation.image2"
+OBS_IMAGE_3 = "observation.image3"
+OBS_IMAGE_4 = "observation.image4"
+TASK = "task"
+ROBOT = "robot_type"
+IMAGES_ORDER = {
+    OBS_IMAGE: 0,
+    OBS_IMAGE_2: 1,
+    OBS_IMAGE_3: 2,
+    OBS_IMAGE_4: 3,
+}
+from lerobot.policies.utils import (
+    populate_queues,
+)
+import random
+def create_sinusoidal_pos_embedding(
+    time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
+) -> Tensor:
+    """Computes sine-cosine positional embedding vectors for scalar positions."""
+    if dimension % 2 != 0:
+        raise ValueError(f"dimension ({dimension}) must be divisible by 2")
+
+    if time.ndim != 1:
+        raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
+
+    dtype = get_safe_dtype(torch.float64, device.type)
+    fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
+    period = min_period * (max_period / min_period) ** fraction
+
+    # Compute the outer product
+    scaling_factor = 1.0 / period * 2 * math.pi
+    sin_input = scaling_factor[None, :] * time[:, None]
+    pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
+    return pos_emb
+
+
+def sample_beta(alpha, beta, bsize, device):
+    gamma1 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / alpha)
+    gamma2 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / beta)
+    return gamma1 / (gamma1 + gamma2)
+
+
+def make_att_2d_masks(pad_masks, att_masks):
+    """Copied from big_vision.
+
+    Tokens can attend to valid inputs tokens which have a cumulative mask_ar
+    smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to
+    setup several types of attention, for example:
+
+      [[1 1 1 1 1 1]]: pure causal attention.
+
+      [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
+          themselves and the last 3 tokens have a causal attention. The first
+          entry could also be a 1 without changing behaviour.
+
+      [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
+          block can attend all previous blocks and all tokens on the same block.
+
+    Args:
+      input_mask: bool[B, N] true if its part of the input, false if padding.
+      mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on
+        it and 0 where it shares the same attention mask as the previous token.
+    """
+    if att_masks.ndim != 2:
+        raise ValueError(att_masks.ndim)
+    if pad_masks.ndim != 2:
+        raise ValueError(pad_masks.ndim)
+
+    cumsum = torch.cumsum(att_masks, dim=1)
+    att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
+    pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
+    att_2d_masks = att_2d_masks & pad_2d_masks
+    return att_2d_masks
+
+
+def resize_with_pad(img, width, height, pad_value=-1):
+    # assume no-op when width height fits already
+    if img.ndim != 4:
+        raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
+
+    cur_height, cur_width = img.shape[2:]
+
+    ratio = max(cur_width / width, cur_height / height)
+    resized_height = int(cur_height / ratio)
+    resized_width = int(cur_width / ratio)
+    resized_img = F.interpolate(
+        img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
+    )
+
+    pad_height = max(0, int(height - resized_height))
+    pad_width = max(0, int(width - resized_width))
+
+    # pad on left and top of image
+    padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
+    return padded_img
+
+
+def pad_vector(vector, new_dim):
+    """Can be (batch_size x sequence_length x features_dimension)
+    or (batch_size x features_dimension)
+    """
+    if vector.shape[-1] == new_dim:
+        return vector
+    shape = list(vector.shape)
+    current_dim = shape[-1]
+    shape[-1] = new_dim
+    new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device)
+    new_vector[..., :current_dim] = vector
+    return new_vector
+
+
+def normalize(x, min_val, max_val):
+    return (x - min_val) / (max_val - min_val)
+
+
+def unnormalize(x, min_val, max_val):
+    return x * (max_val - min_val) + min_val
+
+
+def safe_arcsin(value):
+    # This ensures that the input stays within
+    # [−1,1] to avoid invalid values for arcsin
+    return torch.arcsin(torch.clamp(value, -1.0, 1.0))
+
+
+def aloha_gripper_to_angular(value):
+    # Aloha transforms the gripper positions into a linear space. The following code
+    # reverses this transformation to be consistent with pi0 which is pretrained in
+    # angular space.
+    #
+    # These values are coming from the Aloha code:
+    # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED
+    value = unnormalize(value, min_val=0.01844, max_val=0.05800)
+
+    # This is the inverse of the angular to linear transformation inside the Interbotix code.
+    def linear_to_radian(linear_position, arm_length, horn_radius):
+        value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position)
+        return safe_arcsin(value)
+
+    # The constants are taken from the Interbotix code.
+    value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022)
+
+    # Normalize to [0, 1].
+    # The values 0.4 and 1.5 were measured on an actual Trossen robot.
+    return normalize(value, min_val=0.4, max_val=1.5)
+
+
+def aloha_gripper_from_angular(value):
+    # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha.
+    # Note that the units are still angular but the range is different.
+
+    # The values 0.4 and 1.5 were measured on an actual Trossen robot.
+    value = unnormalize(value, min_val=0.4, max_val=1.5)
+
+    # These values are coming from the Aloha code:
+    # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE
+    return normalize(value, min_val=-0.6213, max_val=1.4910)
+
+
+def aloha_gripper_from_angular_inv(value):
+    # Directly inverts the gripper_from_angular function.
+    value = unnormalize(value, min_val=-0.6213, max_val=1.4910)
+    return normalize(value, min_val=0.4, max_val=1.5)
+
+class SMOLPI0Policy(PreTrainedPolicy):
+    """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot."""
+
+    config_class = SMOLPI0Config
+    name = "smolpi0"
+
+    def __init__(
+        self,
+        config: SMOLPI0Config,
+        dataset_stats: dict[str, dict[str, Tensor]] | None = None,
+    ):
+        """
+        Args:
+            config: Policy configuration class instance or None, in which case the default instantiation of
+                    the configuration class is used.
+            dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
+                that they will be passed with a call to `load_state_dict` before the policy is used.
+        """
+
+        super().__init__(config)
+        config.validate_features()
+        self.config = config
+        self.normalize_per_robot_type = getattr(
+            config, "normalize_per_robot_type", False
+        )  # FIXME(mshukor): assert in case of single dataset
+        if self.normalize_per_robot_type:
+            if not dataset_stats:
+                dataset_stats[config.robot_type] = {}
+            self.normalize_inputs = NormalizePerRobotType(
+                config.input_features, config.normalization_mapping, dataset_stats
+            )
+            self.normalize_targets = NormalizePerRobotType(
+                config.output_features, config.normalization_mapping, dataset_stats
+            )
+            self.unnormalize_outputs = UnnormalizePerRobotType(
+                config.output_features, config.normalization_mapping, dataset_stats
+            )
+        else:
+            self.normalize_inputs = Normalize(
+                config.input_features, config.normalization_mapping, dataset_stats
+            )
+            self.normalize_targets = Normalize(
+                config.output_features, config.normalization_mapping, dataset_stats
+            )
+            self.unnormalize_outputs = Unnormalize(
+                config.output_features, config.normalization_mapping, dataset_stats
+            )
+
+        self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer
+        self.model = VLAFlowMatching(config)
+        self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split(",")
+        self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",")
+        self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1
+        self.reset()
+
+    def reset(self):
+        """This should be called whenever the environment is reset."""
+        # self._action_queue = deque([], maxlen=self.config.n_action_steps)
+        self._queues = {
+            ACTION: deque(maxlen=self.config.n_action_steps),
+        }
+        if self.config.n_obs_steps > 1:
+            for k in self.config.input_features:
+                if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]):
+                    self._queues[k] = deque(maxlen=self.config.n_obs_steps)
+    
+    def get_optim_params(self) -> dict:
+        if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr:
+            params = [
+                {
+                    "params": [
+                        p
+                        for n, p in self.named_parameters()
+                        if not ".vlm." in n and p.requires_grad
+                    ]
+                },
+                {
+                    "params": [
+                        p
+                        for n, p in self.named_parameters()
+                        if ".vlm." in n and p.requires_grad
+                    ],
+                    "lr": self.config.optimizer_lr_vlm,
+                },
+            ]
+            return params
+            
+        else:
+            return self.parameters()
+            
+
+    def merge_peft_model_weights(self) -> None:
+        if "lora" in self.config.peft_method:
+            self.model.vlm_with_expert.merge_lora_weights()
+
+    @torch.no_grad
+    def select_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
+        """Select a single action given environment observations.
+
+        This method wraps `select_actions` in order to return one action at a time for execution in the
+        environment. It works by managing the actions in a queue and only calling `select_actions` when the
+        queue is empty.
+        """
+        self.eval()
+
+        if self.config.adapt_to_pi_aloha:
+            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
+
+        batch = self.normalize_inputs(batch)
+
+        images, img_masks = self.prepare_images(batch)
+        state = self.prepare_state(batch)
+        lang_tokens, lang_masks = self.prepare_language(batch)
+
+        actions = self.model.sample_actions(
+            images, img_masks, lang_tokens, lang_masks, state, noise=noise
+        )
+        # Unpad actions
+        original_action_dim = self.config.action_feature.shape[0]
+        actions = actions[:, :, :original_action_dim]
+
+        actions = self.unnormalize_outputs({"action": actions, "robot_type": batch["robot_type"]})["action"]
+
+        if self.config.adapt_to_pi_aloha:
+            actions = self._pi_aloha_encode_actions(actions)
+
+        return actions
+    
+    @torch.no_grad
+    def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
+        """Select a single action given environment observations.
+
+        This method wraps `select_actions` in order to return one action at a time for execution in the
+        environment. It works by managing the actions in a queue and only calling `select_actions` when the
+        queue is empty.
+        """
+        self.eval()
+
+        if self.config.adapt_to_pi_aloha:
+            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
+
+        batch = self.normalize_inputs(batch)
+
+        self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION])
+        # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by
+        # querying the policy.
+        if len(self._queues[ACTION]) == 0:
+            for k in batch:
+                if k in self._queues:
+                    batch[k] = torch.stack(list(self._queues[k]), dim=1)
+            images, img_masks = self.prepare_images(batch)
+            state = self.prepare_state(batch)
+            lang_tokens, lang_masks = self.prepare_language(batch)
+            actions = self.model.sample_actions(
+                images, img_masks, lang_tokens, lang_masks, state, noise=noise
+            )
+            if self.config.predict_relative_actions and actions.ndim == 3:
+                # If the model predicts relative actions, we need to unpad the actions
+                # and then convert them to absolute actions.
+                if self.config.relative_actions_mode == "first":
+                    actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :1]), dim=1)
+                elif self.config.relative_actions_mode == "state":
+                    actions = actions + state.unsqueeze(1)
+                else:
+                    actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :-1]), dim=1)
+            # Unpad actions
+            original_action_dim = self.config.action_feature.shape[0]
+            actions = actions[:, :, :original_action_dim]
+
+            actions = self.unnormalize_outputs({"action": actions})["action"]
+
+            if self.config.adapt_to_pi_aloha:
+                actions = self._pi_aloha_encode_actions(actions)
+
+            # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue
+            # effectively has shape (n_action_steps, batch_size, *), hence the transpose.
+            self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps])
+        return self._queues[ACTION].popleft()
+
+    def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]:
+        """Do a full training forward pass to compute the loss"""
+        if self.config.adapt_to_pi_aloha:
+            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
+            batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
+        batch = self.normalize_inputs(batch)
+        batch = self.normalize_targets(batch)
+        images, img_masks = self.prepare_images(
+            batch
+        )  # FIXME(mshukor): adapte it to take into account already padded images in the batch
+        state = self.prepare_state(batch)
+        lang_tokens, lang_masks = self.prepare_language(batch)
+        actions = self.prepare_action(batch, state=state)
+        actions_is_pad = batch.get("actions_id_pad")
+        loss_dict = {}
+        losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
+        loss_dict["losses_after_forward"] = losses.mean().clone()
+
+        if actions_is_pad is not None:
+            in_episode_bound = ~actions_is_pad
+            losses = losses * in_episode_bound.unsqueeze(-1)
+            loss_dict["losses_after_in_ep_bound"] = losses.mean().clone()
+
+        # Remove padding
+        losses = losses[:, :, : self.config.max_action_dim]
+        loss_dict["losses_after_rm_padding"] = losses.mean().clone()
+
+        # For backward pass
+        loss = losses.mean()
+        # For backward pass
+        loss_dict["loss"] = loss
+        # # For logging
+        # loss_dict["l2_loss"] = loss.item() # remove for torch compile
+        return loss_dict
+
+    def prepare_images(self, batch):
+        """Apply Pi0 preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and
+        convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP.
+        """
+        images = []
+        img_masks = []
+        present_img_keys = [key for key in self.config.image_features if key in batch]
+        missing_img_keys = [key for key in self.config.image_features if key not in batch]
+
+        present_img_keys = sorted(present_img_keys, key=lambda k: IMAGES_ORDER.get(k, float("inf")), reverse=self.config.reverse_images_order)
+        if self.config.shuffle_camera_positions and ACTION in batch: # only during training
+            present_img_keys = random.sample(present_img_keys, len(present_img_keys))
+        if len(present_img_keys) == 0:
+            raise ValueError(
+                f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})"
+            )
+        for i in range(self.num_past_images):
+            # Preprocess image features present in the batch
+            for key in present_img_keys:
+                img = batch[key][:, i, :, :, :] if batch[key].ndim == 5 else batch[key]
+                if self.config.resize_imgs_with_padding is not None:
+                    img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0)
+
+                # Normalize from range [0,1] to [-1,1] as expacted by siglip
+                img = img * 2.0 - 1.0
+
+                bsize = img.shape[0]
+                device = img.device
+                if f"{key}_padding_mask" in batch:
+                    mask = batch[f"{key}_padding_mask"].bool()
+                else:
+                    mask = torch.ones(bsize, dtype=torch.bool, device=device)
+                images.append(img)
+                img_masks.append(mask)
+
+            # Create image features not present in the batch
+            # as fully 0 padded images.
+            for num_empty_cameras in range(len(missing_img_keys)):
+                if num_empty_cameras >= self.config.empty_cameras:
+                    break
+                img = torch.ones_like(img) * -1
+                mask = torch.zeros_like(mask)
+                images.append(img)
+                img_masks.append(mask)
+        return images, img_masks
+
+    def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
+        """Tokenize the text input"""
+        device = batch[OBS_STATE].device
+        tasks = batch["task"]
+        if len(tasks) == 1:
+            tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])]
+
+        if self.config.add_prompt_template:
+            tasks = [f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" for task in tasks]
+        else:
+            tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
+        tokenized_prompt = self.language_tokenizer.__call__(
+            tasks,
+            padding=self.config.pad_language_to,
+            padding_side="right",
+            max_length=self.config.tokenizer_max_length,
+            return_tensors="pt",
+            truncation=True, # FIXME(mshukor)
+        )
+
+        lang_tokens = tokenized_prompt["input_ids"].to(device=device)
+        lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool)
+
+        return lang_tokens, lang_masks
+
+    def _pi_aloha_decode_state(self, state):
+        # Flip the joints.
+        for motor_idx in [1, 2, 8, 9]:
+            state[:, motor_idx] *= -1
+        # Reverse the gripper transformation that is being applied by the Aloha runtime.
+        for motor_idx in [6, 13]:
+            state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx])
+        return state
+
+    def _pi_aloha_encode_actions(self, actions):
+        # Flip the joints.
+        for motor_idx in [1, 2, 8, 9]:
+            actions[:, :, motor_idx] *= -1
+        # Reverse the gripper transformation that is being applied by the Aloha runtime.
+        for motor_idx in [6, 13]:
+            actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx])
+        return actions
+
+    def _pi_aloha_encode_actions_inv(self, actions):
+        # Flip the joints again.
+        for motor_idx in [1, 2, 8, 9]:
+            actions[:, :, motor_idx] *= -1
+        # Reverse the gripper transformation that is being applied by the Aloha runtime.
+        for motor_idx in [6, 13]:
+            actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx])
+        return actions
+
+    def prepare_state(self, batch):
+        """Pad state"""
+        state = batch[OBS_STATE][:, -1, :] if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) else batch[OBS_STATE] # FIXME(mshukor): no state history for now
+        state = pad_vector(state, self.config.max_state_dim)
+        return state
+
+    def prepare_action(self, batch, state=None):
+        """Pad action"""
+        actions = pad_vector(batch[ACTION], self.config.max_action_dim)
+        if self.config.predict_relative_actions and actions.ndim == 3:
+            if self.config.relative_actions_mode == "first":
+                actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1)
+            elif self.config.relative_actions_mode == "state":
+                assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], "Relative action mode 'state' requires the action and state to have the same dimension."
+                if state.ndim == 2:
+                    state = state.unsqueeze(1)
+                actions = actions - state
+            else:
+                actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1)
+        return actions
+
+def pad_tensor(tensor, max_len, pad_value=0):
+    """
+    Efficiently pads a tensor along sequence dimension to match max_len.
+
+    Args:
+        tensor (torch.Tensor): Shape (B, L, ...) or (B, L).
+        max_len (int): Fixed sequence length.
+        pad_value (int/float): Value for padding.
+
+    Returns:
+        torch.Tensor: Shape (B, max_len, ...) or (B, max_len).
+    """
+    B, L = tensor.shape[:2]
+    
+    # Create a padded tensor of max_len and copy the existing values
+    padded_tensor = torch.full((B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device)
+    padded_tensor[:, :L] = tensor  # Efficient in-place copy
+
+    return padded_tensor
+
+class VLAFlowMatching(nn.Module):
+    """
+    π0: A Vision-Language-Action Flow Model for General Robot Control
+
+    [Paper](https://www.physicalintelligence.company/download/pi0.pdf)
+    [Jax code](https://github.com/Physical-Intelligence/openpi)
+
+    Designed by Physical Intelligence. Ported from Jax by Hugging Face.
+    ┌──────────────────────────────┐
+    │               actions        │
+    │               ▲              │
+    │              ┌┴─────┐        │
+    │  kv cache    │Gemma │        │
+    │  ┌──────────►│Expert│        │
+    │  │           │      │        │
+    │ ┌┴────────┐  │x 10  │        │
+    │ │         │  └▲──▲──┘        │
+    │ │   VLM   │   │  │           │
+    │ │         │   │  robot state │
+    │ │         │   noise          │
+    │ └▲──▲─────┘                  │
+    │  │  │                        │
+    │  │  image(s)                 │
+    │  language tokens             │
+    └──────────────────────────────┘
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.vlm_with_expert = SmolVLMWithExpertModel(model_id=self.config.vlm_model_name, 
+            freeze_vision_encoder=self.config.freeze_vision_encoder,
+            train_expert_only=self.config.train_expert_only,
+            attention_implementation=self.config.attention_implementation,
+            load_vlm_weights=self.config.load_vlm_weights,
+            attention_mode=self.config.attention_mode,
+            num_expert_layers=self.config.num_expert_layers,
+            num_vlm_layers=self.config.num_vlm_layers,
+            self_attn_every_n_layers=self.config.self_attn_every_n_layers,
+            expert_width_multiplier=self.config.expert_width_multiplier,
+            self_attn_only_actions=self.config.self_attn_only_actions,
+            )
+        # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert)
+        self.vlm_with_expert.configure_peft(config=self.config)
+        # Projections are float32
+        self.state_to_prefix = self.config.state_to_prefix
+        if self.state_to_prefix:
+            self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size)
+        else:
+            self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size)
+        self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size)
+        self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim)
+
+        self.action_time_mlp_in = nn.Linear(self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size)
+        self.action_time_mlp_out = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size)
+
+        self.set_requires_grad()
+        # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ...
+        if  any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]):
+            if "SmolVLM-Instruct" in self.config.vlm_model_name:
+                self.fake_image_token =  49152
+                self.global_image_token =  [44, 13906, 29, 6266, 46]
+                self.global_image_start_token = torch.tensor([self.fake_image_token] + self.global_image_token, dtype=torch.long)
+            else:
+                self.fake_image_token =  49189
+                self.global_image_token =  49152
+                self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long)
+        else:
+            self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id
+            self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id
+            self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long)
+
+        self.add_image_special_tokens = self.config.add_image_special_tokens
+        self.add_local_special_image_tokens = self.config.add_local_special_image_tokens
+        self.local_image_tokens = [torch.tensor([self.fake_image_token, tok], dtype=torch.long) for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]] # assume 3 x 3 grid
+        
+        self.local_image_start_token = self.global_image_start_token
+        self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long)
+        self.prefix_length = self.config.prefix_length
+        self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",")
+        self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1
+        self.causal_attention_on_history = self.config.causal_attention_on_history
+        
+        
+        
+
+    # def configure_peft(self, model):
+    #     # return model
+    #     self.peft_method = self.config.peft_method
+    #     if "lora" in self.peft_method:
+    #         peft_config = self.config.peft_config
+    #         target_modules = peft_config.target_modules
+    #         if not isinstance(target_modules, list):
+    #             target_modules = target_modules.split(",")
+    #         lora_config = LoraConfig(
+    #             task_type=TaskType.CAUSAL_LM,  # Based on the task type (e.g., language modeling, etc.)
+    #             r=peft_config.r,  # The rank of the low-rank adaptation
+    #             lora_alpha=peft_config.lora_alpha,  # Scaling factor
+    #             lora_dropout=peft_config.lora_dropout,  # Dropout applied to LoRA layers
+    #             target_modules=target_modules,  # The components where LoRA is applied
+    #             exclude_modules=["gemma_expert", "model.gemma_expert.model.layers"], # FIXME(mshukor): this does not work for now
+    #         )
+    #         # LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=1, lora_dropout=0, target_modules=["q_proj"], exclude_modules=["gemma_expert"])
+    #         self.lora_config = lora_config
+    #         # Apply LoRA and ensure only LoRA parameters are trainable
+
+    #         model = get_peft_model(model, lora_config)
+    #         assert self.config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here?
+    #         for name, param in model.named_parameters():
+    #             if (
+    #                 "lora" in name
+    #             ):  # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer
+    #                 param.requires_grad = True
+    #     return model
+
+    def set_requires_grad(self):
+        for params in self.state_proj.parameters():
+            params.requires_grad = self.config.train_state_proj
+
+    def sample_noise(self, shape, device):
+        noise = torch.normal(
+            mean=0.0,
+            std=1.0,
+            size=shape,
+            dtype=torch.float32,
+            device=device,
+        )
+        return noise
+
+    def sample_time(self, bsize, device):
+        time_beta = sample_beta(1.5, 1.0, bsize, device)
+        time = time_beta * 0.999 + 0.001
+        return time.to(dtype=torch.float32, device=device)
+
+    def embed_prefix(
+        self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Embed images with SigLIP and language tokens with embedding layer to prepare
+        for SmolVLM transformer processing.
+        """
+        # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty
+        embs = []
+        pad_masks = []
+        att_masks = []
+        num_images = len(images) // self.num_past_images
+        # TODO: remove for loop
+        for img_idx, (
+            img,
+            img_mask,
+        ) in enumerate(zip(images, img_masks, strict=False)):
+            # FIXME(mshukor):  add special tokens for the history each history_steps or not 
+            if self.add_image_special_tokens:
+                if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1:
+                    local_token_idx = img_idx % num_images
+                    image_start_token = self.vlm_with_expert.embed_language_tokens(self.local_image_tokens[local_token_idx].to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1)
+                else:
+                    image_start_token = self.vlm_with_expert.embed_language_tokens(self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1)
+                image_start_mask = torch.ones_like(image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device)
+                if self.causal_attention_on_history and img_idx % num_images == 0:
+                    att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1)
+                else:
+                    att_masks += [0] * (image_start_mask.shape[-1])
+                embs.append(image_start_token)
+                pad_masks.append(image_start_mask)
+
+            img_emb = self.vlm_with_expert.embed_image(img)
+            img_emb = img_emb #.to(dtype=self.vlm_with_expert.type)
+
+            # Normalize image embeddings
+            img_emb_dim = img_emb.shape[-1]
+            img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device)
+
+            bsize, num_img_embs = img_emb.shape[:2]
+            img_mask = img_mask[:, None].expand(bsize, num_img_embs)
+
+            # FIXME(mshukor): add special image tokens. Assume no tiling fake global images fake
+            # template <|im_start|>User: What actions? image tokens \nAssistant: or processor.apply_chat_template?
+            # processor.fake_image_token
+            # processor.global_image_token
+
+            embs.append(img_emb)
+            pad_masks.append(img_mask)
+
+            att_masks += [0] * (num_img_embs)
+            if self.add_image_special_tokens:
+                if not self.add_local_special_image_tokens or (self.add_local_special_image_tokens and img_idx % num_images == num_images - 1):
+                    image_end_token = self.vlm_with_expert.embed_language_tokens(self.image_end_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1)
+                    image_end_mask = torch.ones_like(image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device)
+                    embs.append(image_end_token)
+                    pad_masks.append(image_end_mask)
+                    att_masks += [0] * (image_end_mask.shape[1])
+        lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens)
+        # Normalize language embeddings
+        lang_emb_dim = lang_emb.shape[-1]
+        lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm?
+
+        embs.append(lang_emb)
+        pad_masks.append(lang_masks)
+
+        # full attention between image and language inputs
+        num_lang_embs = lang_emb.shape[1]
+        att_masks += [0] * num_lang_embs
+
+        if state is not None and self.state_to_prefix:
+            state_emb = self.state_proj(state)
+            state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type)
+            embs.append(state_emb)
+            bsize = state_emb.shape[0]
+            dtype = state_emb.dtype
+            device = state_emb.device
+
+            states_seq_len = state_emb.shape[1]
+            state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device)
+            pad_masks.append(state_mask)
+
+            # Set attention masks so that image and language inputs do not attend to state or actions
+            # att_masks += [1] + [0]*(states_seq_len - 1)
+            att_masks += [1]*(states_seq_len)
+        embs = torch.cat(embs, dim=1)
+        pad_masks = torch.cat(pad_masks, dim=1)
+        att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
+        att_masks = att_masks[None, :]
+
+        seq_len = pad_masks.shape[1]
+        if seq_len < self.prefix_length:
+            embs = pad_tensor(embs, self.prefix_length, pad_value=0)
+            pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0)
+            att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0)
+
+        att_masks = att_masks.expand(bsize, -1)
+
+        return embs, pad_masks, att_masks
+
+    def embed_suffix(self, state, noisy_actions, timestep):
+        """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing."""
+        embs = []
+        pad_masks = []
+        att_masks = []
+
+        # Embed state
+        if not self.state_to_prefix:
+            state_emb = self.state_proj(state)
+            state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type)
+            embs.append(state_emb)
+            bsize = state_emb.shape[0]
+            dtype = state_emb.dtype
+            device = state_emb.device
+
+            states_seq_len = state_emb.shape[1]
+            state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device)
+            pad_masks.append(state_mask)
+
+            # Set attention masks so that image and language inputs do not attend to state or actions
+            att_masks += [1] + [0]*(states_seq_len - 1)
+
+
+        # Fuse timestep + action information using an MLP
+        action_emb = self.action_in_proj(noisy_actions)
+        device = action_emb.device
+        bsize = action_emb.shape[0]
+        dtype = action_emb.dtype
+        # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
+        time_emb = create_sinusoidal_pos_embedding(
+            timestep, self.vlm_with_expert.expert_hidden_size, min_period=4e-3, max_period=4.0, device=device
+        )
+        time_emb = time_emb.type(dtype=dtype)
+
+        time_emb = time_emb[:, None, :].expand_as(action_emb)
+        action_time_emb = torch.cat([action_emb, time_emb], dim=2)
+
+        action_time_emb = self.action_time_mlp_in(action_time_emb)
+        action_time_emb = F.silu(action_time_emb)  # swish == silu
+        action_time_emb = self.action_time_mlp_out(action_time_emb)
+
+        # Add to input tokens
+        embs.append(action_time_emb)
+
+        bsize, action_time_dim = action_time_emb.shape[:2]
+        action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device)
+        pad_masks.append(action_time_mask)
+
+        # Set attention masks so that image, language and state inputs do not attend to action tokens
+        if self.config.causal_action_attention_mask:
+            att_masks += [1] * self.config.chunk_size
+        else:
+            att_masks += [1] + ([0] * (self.config.chunk_size - 1))
+        embs = torch.cat(embs, dim=1)
+        pad_masks = torch.cat(pad_masks, dim=1)
+        att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
+        att_masks = att_masks[None, :].expand(bsize, len(att_masks))
+        return embs, pad_masks, att_masks
+
+    def forward(
+        self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
+    ) -> Tensor:
+        """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
+        if noise is None:
+            noise = self.sample_noise(actions.shape, actions.device)
+
+        if time is None:
+            time = self.sample_time(actions.shape[0], actions.device)
+
+        time_expanded = time[:, None, None]
+        if self.config.regression_loss:
+            # Hack to compare regression to flow matching
+            time = torch.zeros_like(time, dtype=time.dtype, device=time.device)
+            x_t = torch.zeros_like(actions, dtype=actions.dtype, device=actions.device)
+            u_t = actions
+        else:
+            x_t = time_expanded * noise + (1 - time_expanded) * actions
+            u_t = noise - actions
+        prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
+            images, img_masks, lang_tokens, lang_masks, state=state
+        )
+        suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time)
+        
+        pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
+        att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
+
+        att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
+        position_ids = torch.cumsum(pad_masks, dim=1) - 1
+        (_, suffix_out), _ = self.vlm_with_expert.forward(
+            attention_mask=att_2d_masks,
+            position_ids=position_ids,
+            past_key_values=None,
+            inputs_embeds=[prefix_embs, suffix_embs],
+            use_cache=False,
+            fill_kv_cache=False,
+        )
+        suffix_out = suffix_out[:, -self.config.chunk_size :]
+        # Original openpi code, upcast attention output
+        suffix_out = suffix_out.to(dtype=torch.float32)
+        v_t = self.action_out_proj(suffix_out)
+        if self.config.regression_loss:
+            losses = F.l1_loss(u_t, v_t, reduction="none")
+        else:
+            losses = F.mse_loss(u_t, v_t, reduction="none")
+        return losses
+
+    def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor:
+        """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
+        bsize = state.shape[0]
+        device = state.device
+
+        if noise is None:
+            actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim)
+            noise = self.sample_noise(actions_shape, device)
+
+        prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
+            images, img_masks, lang_tokens, lang_masks, state=state
+        )
+        prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
+        prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
+        # Compute image and language key value cache
+        _, past_key_values = self.vlm_with_expert.forward(
+            attention_mask=prefix_att_2d_masks,
+            position_ids=prefix_position_ids,
+            past_key_values=None,
+            inputs_embeds=[prefix_embs, None],
+            use_cache=self.config.use_cache,
+            fill_kv_cache=True,
+        )
+        if self.config.regression_loss:
+            x_t = torch.zeros_like(noise, dtype=torch.float32, device=device)
+            expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device)
+            x_t = self.denoise_step(
+                                state,
+                                prefix_pad_masks,
+                                past_key_values,
+                                x_t,
+                                expanded_time,
+                            )
+        else:
+            dt = -1.0 / self.config.num_steps
+            dt = torch.tensor(dt, dtype=torch.float32, device=device)
+
+            x_t = noise
+            time = torch.tensor(1.0, dtype=torch.float32, device=device)
+            while time >= -dt / 2:
+                expanded_time = time.expand(bsize)
+                v_t = self.denoise_step(
+                    state,
+                    prefix_pad_masks,
+                    past_key_values,
+                    x_t,
+                    expanded_time,
+                )
+
+                # Euler step
+                x_t += dt * v_t
+                time += dt
+        return x_t
+
+    def denoise_step(
+        self,
+        state,
+        prefix_pad_masks,
+        past_key_values,
+        x_t,
+        timestep,
+    ):
+        """Apply one denoising step of the noise `x_t` at a given timestep."""
+        suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, timestep)
+
+        suffix_len = suffix_pad_masks.shape[1]
+        batch_size = prefix_pad_masks.shape[0]
+        prefix_len = prefix_pad_masks.shape[1]
+        prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
+
+        suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
+
+        full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
+        prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
+        position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
+
+        outputs_embeds, _ = self.vlm_with_expert.forward(
+            attention_mask=full_att_2d_masks,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=[None, suffix_embs],
+            use_cache=self.config.use_cache,
+            fill_kv_cache=False,
+        )
+        suffix_out = outputs_embeds[1]
+        suffix_out = suffix_out[:, -self.config.chunk_size :]
+        suffix_out = suffix_out.to(dtype=torch.float32)
+        v_t = self.action_out_proj(suffix_out)
+        return v_t
diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py
new file mode 100644
index 000000000..b910679f1
--- /dev/null
+++ b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py
@@ -0,0 +1,824 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Union
+from functools import partial
+import copy
+
+import torch
+import torch.version
+import torch.nn.functional as F  # noqa: N812
+from peft import LoraConfig, TaskType, get_peft_model
+from pytest import Cache
+from torch import nn
+from transformers import (
+    AutoConfig,
+    GemmaForCausalLM,
+    AutoModelForImageTextToText,
+    AutoProcessor,
+    PretrainedConfig,
+    PreTrainedModel,
+    SmolVLMForConditionalGeneration,
+    AutoModel,
+    AutoModelForVision2Seq,
+)
+from transformers.models.auto import CONFIG_MAPPING
+from transformers import SmolVLMModel, SmolVLMConfig
+from lerobot.policies.smolpi0.flex_attention import flex_attention_forward
+
+def _round_up_to_multiple(x, multiple):
+    return (x + multiple - 1) // multiple * multiple
+
+
+def apply_rope(x, positions, max_wavelength=10_000):
+    """
+    Applies RoPE positions [B, L] to x [B, L, H, D].
+    """
+    d_half = x.shape[-1] // 2
+    device = x.device
+    dtype = x.dtype
+    x = x.to(torch.float32)
+
+    freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device)
+    timescale = max_wavelength**freq_exponents
+    radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32)
+
+    radians = radians[..., None, :]
+
+    sin = torch.sin(radians)  # .to(dtype=dtype)
+    cos = torch.cos(radians)  # .to(dtype=dtype)
+
+    x1, x2 = x.split(d_half, dim=-1)
+    res = torch.empty_like(x)
+    res[..., :d_half] = x1 * cos - x2 * sin
+    res[..., d_half:] = x2 * cos + x1 * sin
+
+    return res.to(dtype)
+
+
+# class SmolVLMWithExpertConfig(PretrainedConfig):
+#     model_type = "SmolVLMWithExpertModel"
+#     sub_configs = {"smolvlm_config": AutoConfig, "lm_expert_config": AutoConfig}
+
+#     def __init__(
+#         self,
+#         smolvlm_config: dict | None = None,
+#         lm_expert_config: dict | None = None,
+#         freeze_vision_encoder: bool = True,
+#         train_expert_only: bool = True,
+#         attention_implementation: str = "eager",
+#         load_vlm_weights: bool = False,
+#         **kwargs,
+#     ):
+#         self.load_vlm_weights = load_vlm_weights
+#         self.freeze_vision_encoder = freeze_vision_encoder
+#         self.train_expert_only = train_expert_only
+#         self.attention_implementation = attention_implementation
+
+#         if smolvlm_config is None:
+#             # Default config from Pi0
+#             self.smolvlm_config = CONFIG_MAPPING["smolvlm"](
+#                 transformers_version="4.48.1",
+#                 _vocab_size=257152,
+#                 bos_token_id=2,
+#                 eos_token_id=1,
+#                 hidden_size=2048,
+#                 image_token_index=257152,
+#                 model_type="smolvlm",
+#                 pad_token_id=0,
+#                 projection_dim=2048,
+#                 text_config={
+#                     "hidden_activation": "gelu_pytorch_tanh",
+#                     "hidden_size": 2048,
+#                     "intermediate_size": 16384,
+#                     "model_type": "gemma",
+#                     "num_attention_heads": 8,
+#                     "num_hidden_layers": 18,
+#                     "num_image_tokens": 256,
+#                     "num_key_value_heads": 1,
+#                     "torch_dtype": "float32",
+#                     "vocab_size": 257152,
+#                 },
+#                 vision_config={
+#                     "hidden_size": 1152,
+#                     "intermediate_size": 4304,
+#                     "model_type": "siglip_vision_model",
+#                     "num_attention_heads": 16,
+#                     "num_hidden_layers": 27,
+#                     "num_image_tokens": 256,
+#                     "patch_size": 14,
+#                     "projection_dim": 2048,
+#                     "projector_hidden_act": "gelu_fast",
+#                     "torch_dtype": "float32",
+#                     "vision_use_head": False,
+#                 },
+#             )
+#         elif isinstance(self.paligemma_config, dict):
+#             # Override Pi0 default config for PaliGemma
+#             if "model_type" not in gemma_expert_config:
+#                 paligemma_config["model_type"] = "paligemma"
+
+#             cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]]
+#             self.paligemma_config = cfg_cls(**paligemma_config)
+
+#         if gemma_expert_config is None:
+#             # Default config from Pi0
+#             self.gemma_expert_config = CONFIG_MAPPING["gemma"](
+#                 attention_bias=False,
+#                 attention_dropout=0.0,
+#                 bos_token_id=2,
+#                 eos_token_id=1,
+#                 head_dim=256,
+#                 hidden_act="gelu_pytorch_tanh",
+#                 hidden_activation="gelu_pytorch_tanh",
+#                 hidden_size=1024,
+#                 initializer_range=0.02,
+#                 intermediate_size=4096,
+#                 max_position_embeddings=8192,
+#                 model_type="gemma",
+#                 num_attention_heads=8,
+#                 num_hidden_layers=18,
+#                 num_key_value_heads=1,
+#                 pad_token_id=0,
+#                 rms_norm_eps=1e-06,
+#                 rope_theta=10000.0,
+#                 torch_dtype="float32",
+#                 transformers_version="4.48.1",
+#                 use_cache=True,
+#                 vocab_size=257152,
+#             )
+#         elif isinstance(self.gemma_expert_config, dict):
+#             # Override Pi0 default config for Gemma Expert
+#             if "model_type" not in gemma_expert_config:
+#                 gemma_expert_config["model_type"] = "gemma"
+
+#             cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]]
+#             self.gemma_expert_config = cfg_cls(**gemma_expert_config)
+
+#         super().__init__(**kwargs)
+
+#     def __post_init__(self):
+#         super().__post_init__()
+#         if self.train_expert_only and not self.freeze_vision_encoder:
+#             raise ValueError(
+#                 "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible."
+#             )
+
+#         if self.attention_implementation not in ["eager", "fa2", "flex"]:
+#             raise ValueError(
+#                 f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'."
+#             )
+
+def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256):
+    hidden_dim = int(2 * hidden_dim / 3)
+    hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+    hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+    return hidden_dim
+    
+
+class SmolVLMWithExpertModel(nn.Module):
+    # config_class = PaliGemmaWithExpertConfig
+
+    def __init__(self, model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", 
+    load_vlm_weights: bool = True, train_expert_only: bool = True, freeze_vision_encoder: bool = False, 
+    attention_implementation: str = "eager", attention_mode: str = "self_attn", num_expert_layers: int = -1, 
+    num_vlm_layers: int = -1, self_attn_every_n_layers: int = -1, expert_width_multiplier: float = 0.5, self_attn_only_actions: bool = False):
+        super().__init__()
+        if load_vlm_weights:
+            print(f"Loading  {model_id} weights ...")
+            if "SmolVLM-" in model_id:
+                self.vlm = AutoModelForVision2Seq.from_pretrained(
+                    model_id,
+                    device_map="cuda",
+                    torch_dtype="bfloat16",
+                    low_cpu_mem_usage=True,
+                )
+            else:
+                # model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+                self.vlm = AutoModelForImageTextToText.from_pretrained(
+                    model_id,
+                    device_map="cuda",
+                    torch_dtype="bfloat16",
+                    low_cpu_mem_usage=True,
+                    # attn_implementation="eager",
+                    # attn_implementation="flash_attention_2"
+                )
+            config = self.vlm.config
+        else:
+            config = AutoConfig.from_pretrained(model_id)
+            self.vlm = SmolVLMForConditionalGeneration(config=config)
+        self.processor = AutoProcessor.from_pretrained(model_id)
+        if num_vlm_layers > 0:
+            print(f"Reducing the number of VLM layers to {num_vlm_layers} ...")
+            self.get_vlm_model().text_model.layers = self.get_vlm_model().text_model.layers[:num_vlm_layers]
+        self.num_vlm_layers = len(self.get_vlm_model().text_model.layers)
+        self.config = config
+        # Smaller lm expert
+        lm_expert_config = copy.deepcopy(config.text_config)
+        hidden_size = lm_expert_config.hidden_size
+        lm_expert_config.hidden_size = int(hidden_size*expert_width_multiplier) #hidden_size // 2
+        lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size*expert_width_multiplier))
+        lm_expert_config.num_hidden_layers = self.num_vlm_layers
+        if num_expert_layers > 0 :
+            assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}"
+            lm_expert_config.num_hidden_layers = num_expert_layers
+        # lm_expert_config.head_dim = lm_expert_config.head_dim * 2
+        self.lm_expert = AutoModel.from_config(lm_expert_config)
+        
+        self.num_expert_layers = len(self.lm_expert.layers)
+        self.self_attn_every_n_layers = self_attn_every_n_layers
+        self.self_attn_only_actions = self_attn_only_actions
+        if "cross" in attention_mode:
+            # Reshape qkv projections to have the same input dimension as the vlm
+            for layer_idx in range(len(self.lm_expert.layers)):
+                if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0:
+                    continue
+                self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear(
+                    config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias
+                )
+                self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear(
+                    config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias
+                )
+        # Remove unused embed_tokens
+        self.lm_expert.embed_tokens = None
+
+        self.num_attention_heads = self.config.text_config.num_attention_heads
+        self.num_key_value_heads = self.config.text_config.num_key_value_heads
+
+        self.freeze_vision_encoder = freeze_vision_encoder
+        self.train_expert_only = train_expert_only
+        self.attention_implementation = attention_implementation
+        self.attention_mode = attention_mode
+        self.expert_hidden_size = lm_expert_config.hidden_size
+        # self.to_bfloat16_like_physical_intelligence()
+        self.set_requires_grad()
+
+    def configure_peft(self, config):
+        # return model
+        self.peft_method = config.peft_method
+        self.peft_target_model = config.peft_target_model
+        if "lora" in self.peft_method:
+            peft_config = config.peft_config
+            target_modules = peft_config.target_modules
+            if not isinstance(target_modules, list):
+                target_modules = target_modules.split(",")
+            lora_config = LoraConfig(
+                task_type=TaskType.CAUSAL_LM,  # Based on the task type (e.g., language modeling, etc.)
+                r=peft_config.r,  # The rank of the low-rank adaptation
+                lora_alpha=peft_config.lora_alpha,  # Scaling factor
+                lora_dropout=peft_config.lora_dropout,  # Dropout applied to LoRA layers
+                target_modules=target_modules,  # The components where LoRA is applied
+                exclude_modules=[
+                    "lm_expert",
+                    "model.lm_expert.model.layers",
+                ],  # FIXME(mshukor): this does not work for now
+            )
+            self.lora_config = lora_config
+            # Apply LoRA and ensure only LoRA parameters are trainable
+            if "text" in self.peft_target_model:
+                self.get_vlm_model().text_model = get_peft_model(self.get_vlm_model().text_model, lora_config)
+            else:
+                self.vlm = get_peft_model(self.vlm, lora_config)
+            # assert config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here?
+            for name, param in self.vlm.named_parameters():
+                if (
+                    "lora" in name and "text_model.model.layers.17" not in name
+                ):  # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer
+                    param.requires_grad = True
+                else:
+                    param.requires_grad = False
+
+    def merge_lora_weights(self):
+        """
+        Merge LoRA weights into the base model.
+        """
+        if "text" in self.peft_target_model:
+            self.get_vlm_model().text_model = self.get_vlm_model().text_model.merge_and_unload()
+        else:
+            self.vlm = self.vlm.merge_and_unload()
+
+    def get_vlm_model(self,):
+        if hasattr(self.vlm.model, "model"): # When using peft
+            return self.vlm.model.model
+        else:
+            return self.vlm.model
+
+    def set_requires_grad(self):
+        if self.freeze_vision_encoder:
+            self.get_vlm_model().vision_model.eval()
+            for params in self.get_vlm_model().vision_model.parameters():
+                params.requires_grad = False
+        if self.train_expert_only:
+            self.vlm.eval()
+            for params in self.vlm.parameters():
+                params.requires_grad = False
+        else:
+            # To avoid unused params issue with distributed training
+            last_layers = [self.num_vlm_layers - 1]
+            if self.num_vlm_layers != self.num_expert_layers and self.num_vlm_layers % self.num_expert_layers == 0:
+                last_layers.append(self.num_vlm_layers - 2)
+            frozen_layers = [
+                            "lm_head",
+                            "text_model.model.norm.weight",
+                        ]
+            for layer in last_layers:
+                frozen_layers.append(f"text_model.model.layers.{layer}.")
+
+            for name, params in self.vlm.named_parameters():
+                if any(
+                    [
+                        k in name
+                        for k in frozen_layers
+                    ]
+                ):
+                    params.requires_grad = False
+        # To avoid unused params issue with distributed training
+        for name, params in self.lm_expert.named_parameters():
+            if any(
+                [
+                    k in name
+                    for k in [
+                        "lm_head",
+                    ]
+                ]
+            ):
+                params.requires_grad = False
+
+    def train(self, mode: bool = True):
+        super().train(mode)
+
+        if self.freeze_vision_encoder:
+            self.get_vlm_model().vision_model.eval()
+
+        if self.train_expert_only:
+            self.vlm.eval()
+
+    # def to_bfloat16_like_physical_intelligence(self):
+    #     self.vlm = self.vlm.to(dtype=torch.bfloat16)
+
+    #     params_to_change_dtype = [
+    #         "language_model.model.layers",
+    #         "gemma_expert.model.layers",
+    #         "vision_tower",
+    #         "multi_modal",
+    #     ]
+    #     for name, param in self.named_parameters():
+    #         if any(selector in name for selector in params_to_change_dtype):
+    #             param.data = param.data.to(dtype=torch.bfloat16)
+
+    def embed_image(self, image: torch.Tensor):
+        patch_attention_mask = None
+        # # FIXME(mshukor): probably not needed as we don't have padded images here
+        # pixel_values = image.unsqueeze(1)
+        # batch_size, num_images, num_channels, height, width = pixel_values.shape
+        # pixel_values = pixel_values
+        # pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
+
+        # # Remove padding images - padding images are full 0.
+        # nb_values_per_image = pixel_values.shape[1:].numel()
+        # real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+
+        # if not any(real_images_inds):
+        #     # no images, leave one empty image.
+        #     real_images_inds[0] = True
+
+        # pixel_values = pixel_values[real_images_inds].contiguous()
+
+        # # Handle the vision attention mask
+
+        # pixel_attention_mask = torch.ones(
+        #     size=[pixel_values.shape[i] for i in (0, 2, 3)],
+        #     dtype=torch.bool,
+        #     device=pixel_values.device,
+        # )
+
+        # patch_size = self.vlm.config.vision_config.patch_size
+        # patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+        # patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+        # patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
+        # FIXME(mshukor): add special image tokens specific to smolvlm
+        # Get sequence from the vision encoder
+        image_hidden_states = self.get_vlm_model().vision_model(
+            pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype),
+            patch_attention_mask=patch_attention_mask,
+        ).last_hidden_state
+        # Modality projection & resampling
+        image_hidden_states = self.get_vlm_model().connector(image_hidden_states)
+        return image_hidden_states
+        
+    def embed_language_tokens(self, tokens: torch.Tensor):
+        return self.get_vlm_model().text_model.get_input_embeddings()(tokens)
+
+    def forward_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values=None) -> list[torch.Tensor]:
+
+        query_states = []
+        key_states = []
+        value_states = []
+        for i, hidden_states in enumerate(inputs_embeds):
+            layer = model_layers[i][layer_idx]
+            if hidden_states is None or layer is None:
+                continue
+            
+            # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype)
+            # hidden_states = hidden_states * normalizer
+            hidden_states = layer.input_layernorm(hidden_states)
+
+            input_shape = hidden_states.shape[:-1]
+            hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
+
+            hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype)
+            query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
+            key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
+            value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
+
+            query_states.append(query_state)
+            key_states.append(key_state)
+            value_states.append(value_state)
+
+        # FIXME(mshukor): self attention always when having only the prefix
+        # B,L,H,D with L sequence length, H number of heads, D head dim
+        # concatenate on the number of embeddings/tokens
+        query_states = torch.cat(query_states, dim=1)
+        key_states = torch.cat(key_states, dim=1)
+        value_states = torch.cat(value_states, dim=1)
+        # FIXME(mshukor): seq should be B, H, L, D ?
+        seq_len = query_states.shape[1]
+        if seq_len < position_ids.shape[1]:
+            _position_ids = position_ids[:, :seq_len]
+            _attention_mask = attention_mask[:, :seq_len, :seq_len]
+        else:
+            _position_ids = position_ids
+            _attention_mask = attention_mask
+
+        if self.self_attn_only_actions:
+            attention_mask_ = _attention_mask.clone()
+            position_ids_ = _position_ids.clone()
+            if inputs_embeds[1] is not None:
+                suffix_len = inputs_embeds[1].shape[1]
+                attention_mask_[:, -suffix_len:, :-suffix_len] = False
+                position_ids_[:, -suffix_len:] = _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None]
+        else:
+            attention_mask_ = _attention_mask
+            position_ids_ = _position_ids
+
+        query_states = apply_rope(query_states, position_ids_) # FIXME(mshukor): this assumes we have always the vlm features?
+        key_states = apply_rope(key_states, position_ids_)
+
+        if use_cache and past_key_values is None:
+            past_key_values = {}
+
+        if use_cache:
+            if fill_kv_cache:
+                past_key_values[layer_idx] = {
+                    "key_states": key_states,
+                    "value_states": value_states,
+                }
+            else:
+                # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
+                # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
+                # the max len, then we (for instance) double the cache size. This implementation already exists
+                # in `transformers`. (molbap)
+                key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
+                value_states = torch.cat(
+                    [past_key_values[layer_idx]["value_states"], value_states], dim=1
+                )
+
+        attention_interface = self.get_attention_interface()
+
+        att_output = attention_interface(
+            attention_mask_, batch_size, head_dim, query_states, key_states, value_states
+        )
+        # att_output = att_output.to(dtype=models[i].dtype)
+
+        return [att_output], past_key_values
+
+
+    def forward_cross_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values = None) -> list[torch.Tensor]:
+
+        attention_interface = self.get_attention_interface()
+
+        att_outputs = []
+        assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}"
+
+        if len(inputs_embeds) == 2 and not past_key_values:
+            # Prefix attention
+            seq_len = inputs_embeds[0].shape[1]
+            position_id, expert_position_id = position_ids[:, :seq_len],  position_ids[:, seq_len:]
+            prefix_attention_mask = attention_mask[:, :seq_len, :seq_len]
+            
+            layer = model_layers[0][layer_idx]
+
+            hidden_states = layer.input_layernorm(inputs_embeds[0])
+
+            input_shape = hidden_states.shape[:-1]
+            hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
+
+            hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype)
+            query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
+            key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
+            value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
+
+            # B,L,H,D with L sequence length, H number of heads, D head dim
+            query_states = apply_rope(query_state, position_id)
+            key_states = apply_rope(key_state, position_id)
+
+            att_output = attention_interface(
+                prefix_attention_mask, batch_size, head_dim, query_states, key_states, value_states
+            )
+            att_outputs.append(att_output)
+        else:
+            expert_position_id = position_ids
+
+        if use_cache and past_key_values is None:
+            past_key_values = {}
+
+        if use_cache:
+            if fill_kv_cache:
+                past_key_values[layer_idx] = {
+                    "key_states": key_states,
+                    "value_states": value_states,
+                }
+            else:
+                # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
+                # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
+                # the max len, then we (for instance) double the cache size. This implementation already exists
+                # in `transformers`. (molbap)
+                key_states = past_key_values[layer_idx]["key_states"]
+                value_states = past_key_values[layer_idx]["value_states"]
+
+
+        # Expert
+        expert_layer = model_layers[1][layer_idx]
+        if expert_layer is not None:
+            expert_hidden_states = expert_layer.input_layernorm(inputs_embeds[1])
+
+            expert_input_shape = expert_hidden_states.shape[:-1]
+            expert_hidden_shape = (*expert_input_shape, -1, expert_layer.self_attn.head_dim)
+
+            expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype)
+            expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape)
+
+
+            _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(*key_states.shape[:2], -1)
+            expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(*_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim) # k_proj should have same dim as kv
+
+            _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(*value_states.shape[:2], -1)
+            expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(*_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim)
+
+            expert_position_id = expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values # start from 0
+            expert_attention_mask = attention_mask[:, -inputs_embeds[1].shape[1]:, :expert_key_states.shape[1]:] # take into account kv
+
+            expert_query_states = apply_rope(expert_query_state, expert_position_id)
+            # expert_key_states = apply_rope(expert_key_state, expert_position_id)
+            
+            att_output = attention_interface(
+                expert_attention_mask, batch_size, head_dim, expert_query_states, expert_key_states, expert_value_states
+            )
+            att_outputs.append(att_output)
+        else:
+            att_outputs.append(None)
+
+        # att_output = att_output.to(dtype=models[i].dtype)
+        return att_outputs, past_key_values
+    
+    def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient?
+        vlm_layers = []
+        expert_layers = []
+        multiple_of = self.num_vlm_layers // self.num_expert_layers
+        for i in range(self.num_vlm_layers):
+            if multiple_of > 0 and i > 0 and i % multiple_of != 0:
+                expert_layer = None
+            else:
+                expert_layer_index = i // multiple_of if multiple_of > 0 else i
+                expert_layer = models[1].layers[expert_layer_index]
+            vlm_layers.append(models[0].layers[i])
+            expert_layers.append(expert_layer)
+        return [vlm_layers, expert_layers]
+    # TODO: break down this huge forward into modules or functions
+    def forward(
+        self,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+        inputs_embeds: List[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        fill_kv_cache: Optional[bool] = None,
+    ):
+        models = [self.get_vlm_model().text_model, self.lm_expert]
+        model_layers = self.get_model_layers(models)
+        for hidden_states in inputs_embeds:
+            # TODO this is very inefficient
+            # dtype is always the same, batch size too (if > 1 len)
+            # device could be trickier in multi gpu edge cases but that's it
+            if hidden_states is None:
+                continue
+            batch_size = hidden_states.shape[0]
+
+        # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train 
+        if self.attention_implementation == "flex":
+            if inputs_embeds[0] is not None and inputs_embeds[1] is not None and attention_mask.shape[-1] == attention_mask.shape[-2] and past_key_values is None: # Now only during training
+                seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1]
+                padded_seq_len = _round_up_to_multiple(seq_len, 128) # FIXME(mshukor): more efficient to have a fixed seq len?
+                b_mask, q_len, kv_len = attention_mask.shape  # The shape of your mask
+                pad = padded_seq_len - q_len
+                attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True)
+                inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0)
+                position_ids = F.pad(position_ids, (0, pad), value=0)
+
+
+        # RMSNorm
+        num_layers = self.num_vlm_layers
+        head_dim = self.vlm.config.text_config.head_dim
+        for layer_idx in range(num_layers):
+            if fill_kv_cache or "cross" not in self.attention_mode or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0):
+                att_outputs, past_key_values = self.forward_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values)
+            else:
+                att_outputs, past_key_values = self.forward_cross_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values)
+            # query_states = []
+            # key_states = []
+            # value_states = []
+            # for i, hidden_states in enumerate(inputs_embeds):
+            #     if hidden_states is None:
+            #         continue
+            #     layer = models[i].layers[layer_idx]
+            #     # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype)
+            #     # hidden_states = hidden_states * normalizer
+            #     hidden_states = layer.input_layernorm(hidden_states)
+
+            #     input_shape = hidden_states.shape[:-1]
+            #     hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
+
+            #     hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype)
+            #     query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
+            #     key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
+            #     value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
+
+            #     query_states.append(query_state)
+            #     key_states.append(key_state)
+            #     value_states.append(value_state)
+
+            # # FIXME(mshukor): self attention always when having only the prefix
+            # # B,L,H,D with L sequence length, H number of heads, D head dim
+            # # concatenate on the number of embeddings/tokens
+            # query_states = torch.cat(query_states, dim=1)
+            # key_states = torch.cat(key_states, dim=1)
+            # value_states = torch.cat(value_states, dim=1)
+            # # FIXME(mshukor): seq should be B, H, L, D ?
+            # query_states = apply_rope(query_states, position_ids)
+            # key_states = apply_rope(key_states, position_ids)
+
+            # if use_cache and past_key_values is None:
+            #     past_key_values = {}
+
+            # if use_cache:
+            #     if fill_kv_cache:
+            #         past_key_values[layer_idx] = {
+            #             "key_states": key_states,
+            #             "value_states": value_states,
+            #         }
+            #     else:
+            #         # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
+            #         # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
+            #         # the max len, then we (for instance) double the cache size. This implementation already exists
+            #         # in `transformers`. (molbap)
+            #         key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
+            #         value_states = torch.cat(
+            #             [past_key_values[layer_idx]["value_states"], value_states], dim=1
+            #         )
+
+            # attention_interface = self.get_attention_interface()
+            # att_output = attention_interface(
+            #     attention_mask, batch_size, head_dim, query_states, key_states, value_states
+            # )
+
+
+            # att_output = att_output.to(dtype=models[i].dtype)
+
+            # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len])
+            outputs_embeds = []
+            start = 0
+            for i, hidden_states in enumerate(inputs_embeds):
+                # layer = models[i].layers[layer_idx]
+                layer = model_layers[i][layer_idx]
+                att_output = att_outputs[i] if i < len(att_outputs) else att_outputs[0] # in case of self_attn
+                if hidden_states is not None:
+                    if layer is None:
+                        outputs_embeds.append(hidden_states)
+                        continue
+                    end = start + hidden_states.shape[1]
+
+                    if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
+                        att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
+                    att_out = att_output[:, start:end]
+                    out_emb = layer.self_attn.o_proj(att_out)
+
+                    # TODO: first dropout (by default 0.0)
+                    # first residual
+                    out_emb += hidden_states
+                    after_first_residual = out_emb.clone()
+
+                    out_emb = layer.post_attention_layernorm(out_emb)
+                    out_emb = layer.mlp(out_emb)
+
+                    # TODO: second dropout (by default 0.0)
+
+                    # second residual
+                    out_emb += after_first_residual
+
+                    outputs_embeds.append(out_emb)
+
+                    start = end if len(att_outputs) == 1 else 0
+                else:
+                    outputs_embeds.append(None)
+
+            inputs_embeds = outputs_embeds
+
+        # final norm
+        outputs_embeds = []
+        for i, hidden_states in enumerate(inputs_embeds):
+            if hidden_states is not None:
+                out_emb = models[i].norm(hidden_states)
+                outputs_embeds.append(out_emb)
+            else:
+                outputs_embeds.append(None)
+        return outputs_embeds, past_key_values
+
+    def get_attention_interface(self):
+        if self.attention_implementation == "fa2":
+            attention_interface = self.flash_attention_forward
+        elif self.attention_implementation == "flex":
+            attention_interface = partial(flex_attention_forward, num_att_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads)
+        else:
+            attention_interface = self.eager_attention_forward
+        return attention_interface
+
+    def flash_attention_forward(
+        self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
+    ):
+        raise NotImplementedError("FA2 is not implemented (yet)")
+
+    def eager_attention_forward(
+        self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
+    ):
+        num_att_heads = self.num_attention_heads
+        num_key_value_heads = self.num_key_value_heads
+        num_key_value_groups = num_att_heads // num_key_value_heads
+
+        # query_states: batch_size, sequence_length, num_att_head, head_dim
+        # key_states: batch_size, sequence_length, num_key_value_head, head_dim
+        # value_states: batch_size, sequence_length, num_key_value_head, head_dim
+        sequence_length = key_states.shape[1]
+
+        key_states = key_states[:, :, :, None, :].expand(
+            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
+        )
+        key_states = key_states.reshape(
+            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
+        )
+
+        value_states = value_states[:, :, :, None, :].expand(
+            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
+        )
+        value_states = value_states.reshape(
+            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
+        )
+
+        # Attention here is upcasted to float32 to match the original eager implementation.
+
+        query_states = query_states.to(dtype=torch.float32)
+        key_states = key_states.to(dtype=torch.float32)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+
+        att_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+        att_weights *= head_dim**-0.5
+
+        att_weights = att_weights.to(dtype=torch.float32)
+        big_neg = torch.finfo(att_weights.dtype).min #-2.3819763e38  # See gemma/modules.py
+        masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg)
+        probs = nn.functional.softmax(masked_att_weights, dim=-1)
+        probs = probs.to(dtype=value_states.dtype)
+
+        # probs: batch_size, num_key_value_head, num_att_head, sequence_length, sequence_length
+        # value_states: batch_size, sequence_length, num_att_heads, head_dim
+
+        att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3))
+
+        att_output = att_output.permute(0, 2, 1, 3)
+        # we use -1 because sequence length can change
+        att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim)
+
+        return att_output
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py
index 95ed993d2..9b7e3520a 100644
--- a/src/lerobot/policies/smolvla/modeling_smolvla.py
+++ b/src/lerobot/policies/smolvla/modeling_smolvla.py
@@ -1,3 +1,955 @@
+# #!/usr/bin/env python
+
+# # Copyright 2025 HuggingFace Inc. team. All rights reserved.
+# #
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# # you may not use this file except in compliance with the License.
+# # You may obtain a copy of the License at
+# #
+# #     http://www.apache.org/licenses/LICENSE-2.0
+# #
+# # Unless required by applicable law or agreed to in writing, software
+# # distributed under the License is distributed on an "AS IS" BASIS,
+# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# # See the License for the specific language governing permissions and
+# # limitations under the License.
+
+# """
+# SmolVLA:
+
+# [Paper](https://huggingface.co/papers/2506.01844)
+
+# Designed by Hugging Face.
+
+# Install smolvla extra dependencies:
+# ```bash
+# pip install -e ".[smolvla]"
+# ```
+
+# Example of finetuning the smolvla pretrained model (`smolvla_base`):
+# ```bash
+# lerobot-train \
+# --policy.path=lerobot/smolvla_base \
+# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \
+# --batch_size=64 \
+# --steps=200000
+# ```
+
+# Example of finetuning a smolVLA. SmolVLA is composed of a pretrained VLM,
+# and an action expert.
+# ```bash
+# lerobot-train \
+# --policy.type=smolvla \
+# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \
+# --batch_size=64 \
+# --steps=200000
+# ```
+
+# Example of using the smolvla pretrained model outside LeRobot training framework:
+# ```python
+# policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base")
+# ```
+
+# """
+
+# import math
+# import os
+# import re
+# from collections import deque
+
+# import safetensors
+# import torch
+# import torch.nn.functional as F  # noqa: N812
+# from torch import Tensor, nn
+# from transformers import AutoProcessor
+
+# from lerobot.constants import ACTION
+# from lerobot.policies.normalize import (
+#     Normalize,
+#     Unnormalize,
+# )
+# from lerobot.policies.pretrained import PreTrainedPolicy
+# from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
+# from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel
+# from lerobot.policies.utils import (
+#     populate_queues,
+# )
+# from lerobot.utils.utils import get_safe_dtype
+# OBS_STATE = 'state'
+# ACTION = 'actions'
+# # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
+# _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
+
+
+# def canonicalise(k: str) -> str:
+#     """
+#     Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a
+#     normalisation-buffer key.
+#     """
+#     return _VARIANT_RE.sub(".buffer_", k)
+
+
+# def standardise_state_dict(
+#     checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True
+# ) -> tuple[dict[str, torch.Tensor], list[str]]:
+#     """
+#     • Re-keys `checkpoint ` so that every entry matches the *reference* key set.
+#     • If several variant keys collapse to the same canonical name we keep the
+#       first one and log the collision.
+#     • Returns the new dict + a list of entries that could not be matched.
+#     """
+#     out, collisions, unmatched = {}, {}, []
+
+#     for k, v in checkpoint.items():
+#         canon = canonicalise(k)
+#         if canon in ref_keys:
+#             if canon in out:  # duplicate after collapsing
+#                 collisions.setdefault(canon, []).append(k)
+#             else:
+#                 out[canon] = v
+#         else:
+#             unmatched.append(k)
+
+#     if verbose:
+#         for canon, variants in collisions.items():
+#             print(f"[standardise_state_dict] '{canon}'  ←  {variants}")
+#         if unmatched:
+#             print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys")
+
+#     out.update({k: checkpoint[k] for k in unmatched})
+#     return out, unmatched
+
+
+# def rename_checkpoint_keys(checkpoint: dict, rename_str: str):
+#     """
+#     Renames keys in a checkpoint dictionary based on the given rename string.
+
+#     Args:
+#         checkpoint (dict): The checkpoint dictionary.
+#         rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2".
+
+#     Returns:
+#         dict: The modified checkpoint with renamed keys.
+#     """
+
+#     rename_dict = dict(pair.split("//") for pair in rename_str.split(","))
+
+#     new_checkpoint = {}
+#     for k, v in checkpoint.items():
+#         for old_key, new_key in rename_dict.items():
+#             if old_key in k:
+#                 k = k.replace(old_key, new_key)
+#         new_checkpoint[k] = v
+#     return new_checkpoint
+
+
+# def load_smolvla(
+#     model: torch.nn.Module,
+#     filename: str | os.PathLike,
+#     *,
+#     device: str = "cpu",
+#     checkpoint_keys_mapping: str = "",
+# ) -> torch.nn.Module:
+#     state_dict = safetensors.torch.load_file(filename, device=device)
+
+#     # Optional user-supplied renames (e.g. "model._orig_mod.//model.")
+#     if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping:
+#         state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping)
+
+#     state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys()))
+
+#     # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset
+#     norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs")
+#     state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)}
+
+#     missing, unexpected = model.load_state_dict(state_dict, strict=False)
+
+#     if not all(key.startswith(norm_keys) for key in missing) or unexpected:
+#         raise RuntimeError(
+#             "SmolVLA %d missing / %d unexpected keys",
+#             len(missing),
+#             len(unexpected),
+#         )
+
+#     return model
+
+
+# def create_sinusoidal_pos_embedding(
+#     time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
+# ) -> Tensor:
+#     """Computes sine-cosine positional embedding vectors for scalar positions."""
+#     if dimension % 2 != 0:
+#         raise ValueError(f"dimension ({dimension}) must be divisible by 2")
+
+#     if time.ndim != 1:
+#         raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
+
+#     dtype = get_safe_dtype(torch.float64, device.type)
+#     fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
+#     period = min_period * (max_period / min_period) ** fraction
+
+#     # Compute the outer product
+#     scaling_factor = 1.0 / period * 2 * math.pi
+#     sin_input = scaling_factor[None, :] * time[:, None]
+#     pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
+#     return pos_emb
+
+
+# def make_att_2d_masks(pad_masks, att_masks):
+#     """Copied from big_vision.
+
+#     Tokens can attend to valid inputs tokens which have a cumulative mask_ar
+#     smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to
+#     setup several types of attention, for example:
+
+#       [[1 1 1 1 1 1]]: pure causal attention.
+
+#       [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
+#           themselves and the last 3 tokens have a causal attention. The first
+#           entry could also be a 1 without changing behaviour.
+
+#       [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
+#           block can attend all previous blocks and all tokens on the same block.
+
+#     Args:
+#       input_mask: bool[B, N] true if its part of the input, false if padding.
+#       mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on
+#         it and 0 where it shares the same attention mask as the previous token.
+#     """
+#     if att_masks.ndim != 2:
+#         raise ValueError(att_masks.ndim)
+#     if pad_masks.ndim != 2:
+#         raise ValueError(pad_masks.ndim)
+
+#     cumsum = torch.cumsum(att_masks, dim=1)
+#     att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
+#     pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
+#     att_2d_masks = att_2d_masks & pad_2d_masks
+#     return att_2d_masks
+
+
+# def resize_with_pad(img, width, height, pad_value=-1):
+#     # assume no-op when width height fits already
+#     if img.ndim != 4:
+#         raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
+
+#     cur_height, cur_width = img.shape[2:]
+
+#     ratio = max(cur_width / width, cur_height / height)
+#     resized_height = int(cur_height / ratio)
+#     resized_width = int(cur_width / ratio)
+#     resized_img = F.interpolate(
+#         img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
+#     )
+
+#     pad_height = max(0, int(height - resized_height))
+#     pad_width = max(0, int(width - resized_width))
+
+#     # pad on left and top of image
+#     padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
+#     return padded_img
+
+
+# def pad_vector(vector, new_dim):
+#     """Can be (batch_size x sequence_length x features_dimension)
+#     or (batch_size x features_dimension)
+#     """
+#     if vector.shape[-1] == new_dim:
+#         return vector
+#     shape = list(vector.shape)
+#     current_dim = shape[-1]
+#     shape[-1] = new_dim
+#     new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device)
+#     new_vector[..., :current_dim] = vector
+#     return new_vector
+
+
+# def normalize(x, min_val, max_val):
+#     return (x - min_val) / (max_val - min_val)
+
+
+# def unnormalize(x, min_val, max_val):
+#     return x * (max_val - min_val) + min_val
+
+
+# def safe_arcsin(value):
+#     # This ensures that the input stays within
+#     # [−1,1] to avoid invalid values for arcsin
+#     return torch.arcsin(torch.clamp(value, -1.0, 1.0))
+
+
+# def aloha_gripper_to_angular(value):
+#     # Aloha transforms the gripper positions into a linear space. The following code
+#     # reverses this transformation to be consistent with smolvla which is pretrained in
+#     # angular space.
+#     #
+#     # These values are coming from the Aloha code:
+#     # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED
+#     value = unnormalize(value, min_val=0.01844, max_val=0.05800)
+
+#     # This is the inverse of the angular to linear transformation inside the Interbotix code.
+#     def linear_to_radian(linear_position, arm_length, horn_radius):
+#         value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position)
+#         return safe_arcsin(value)
+
+#     # The constants are taken from the Interbotix code.
+#     value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022)
+
+#     # Normalize to [0, 1].
+#     # The values 0.4 and 1.5 were measured on an actual Trossen robot.
+#     return normalize(value, min_val=0.4, max_val=1.5)
+
+
+# def aloha_gripper_from_angular(value):
+#     # Convert from the gripper position used by smolvla to the gripper position that is used by Aloha.
+#     # Note that the units are still angular but the range is different.
+
+#     # The values 0.4 and 1.5 were measured on an actual Trossen robot.
+#     value = unnormalize(value, min_val=0.4, max_val=1.5)
+
+#     # These values are coming from the Aloha code:
+#     # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE
+#     return normalize(value, min_val=-0.6213, max_val=1.4910)
+
+
+# def aloha_gripper_from_angular_inv(value):
+#     # Directly inverts the gripper_from_angular function.
+#     value = unnormalize(value, min_val=-0.6213, max_val=1.4910)
+#     return normalize(value, min_val=0.4, max_val=1.5)
+
+
+# class SmolVLAPolicy(PreTrainedPolicy):
+#     """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot."""
+
+#     config_class = SmolVLAConfig
+#     name = "smolvla"
+
+#     def __init__(
+#         self,
+#         config: SmolVLAConfig,
+#         dataset_stats: dict[str, dict[str, Tensor]] | None = None,
+#     ):
+#         """
+#         Args:
+#             config: Policy configuration class instance or None, in which case the default instantiation of
+#                     the configuration class is used.
+#             dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
+#                 that they will be passed with a call to `load_state_dict` before the policy is used.
+#         """
+
+#         super().__init__(config)
+#         config.validate_features()
+#         self.config = config
+#         self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
+#         self.normalize_targets = Normalize(
+#             config.output_features, config.normalization_mapping, dataset_stats
+#         )
+#         self.unnormalize_outputs = Unnormalize(
+#             config.output_features, config.normalization_mapping, dataset_stats
+#         )
+
+#         self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer
+#         self.model = VLAFlowMatching(config)
+#         self.reset()
+
+#     def reset(self):
+#         """This should be called whenever the environment is reset."""
+#         self._queues = {
+#             ACTION: deque(maxlen=self.config.n_action_steps),
+#         }
+
+#     # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues
+#     @classmethod
+#     def _load_as_safetensor(
+#         cls,
+#         model: "SmolVLAPolicy",
+#         model_file: str,
+#         map_location: str,
+#         strict: bool,
+#     ):
+#         safetensors.torch.load_model(model, model_file, strict=strict, device=map_location)
+#         return load_smolvla(
+#             model,
+#             model_file,
+#             device=map_location,
+#             checkpoint_keys_mapping="model._orig_mod.//model.",
+#         )
+
+#     def get_optim_params(self) -> dict:
+#         return self.parameters()
+
+#     def _get_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
+#         # TODO: Check if this for loop is needed.
+#         # Context: In fact, self.queues contains only ACTION field, and in inference, we don't have action in the batch
+#         # In the case of offline inference, we have the action in the batch
+#         # that why without the k != ACTION check, it will raise an error because we are trying to stack
+#         # on an empty container.
+#         for k in batch:
+#             if k in self._queues and k != ACTION:
+#                 batch[k] = torch.stack(list(self._queues[k]), dim=1)
+
+#         images, img_masks = self.prepare_images(batch)
+#         state = self.prepare_state(batch)
+#         lang_tokens, lang_masks = self.prepare_language(batch)
+
+#         actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise)
+
+#         # Unpad actions
+#         original_action_dim = self.config.action_feature.shape[0]
+#         actions = actions[:, :, :original_action_dim]
+
+#         actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
+
+#         if self.config.adapt_to_pi_aloha:
+#             actions = self._pi_aloha_encode_actions(actions)
+
+#         return actions
+
+#     def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
+#         if self.config.adapt_to_pi_aloha:
+#             batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
+
+#         batch = self.normalize_inputs(batch)
+
+#         return batch
+
+#     @torch.no_grad()
+#     def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
+#         self.eval()
+
+#         batch = self._prepare_batch(batch)
+#         self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION])
+
+#         actions = self._get_action_chunk(batch, noise)
+#         return actions
+
+#     @torch.no_grad()
+#     def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
+#         """Select a single action given environment observations.
+
+#         This method wraps `select_actions` in order to return one action at a time for execution in the
+#         environment. It works by managing the actions in a queue and only calling `select_actions` when the
+#         queue is empty.
+#         """
+#         self.eval()
+#         batch = self._prepare_batch(batch)
+#         self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION])
+
+#         # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by
+#         # querying the policy.
+#         if len(self._queues[ACTION]) == 0:
+#             actions = self._get_action_chunk(batch, noise)
+
+#             # `self.predict_action_chunk` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue
+#             # effectively has shape (n_action_steps, batch_size, *), hence the transpose.
+#             self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps])
+
+#         return self._queues[ACTION].popleft()
+
+#     def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]:
+#         """Do a full training forward pass to compute the loss"""
+#         if self.config.adapt_to_pi_aloha:
+#             batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
+#             batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
+#         batch = self.normalize_inputs(batch)
+#         batch = self.normalize_targets(batch)
+#         images, img_masks = self.prepare_images(batch)
+#         state = self.prepare_state(batch)
+#         lang_tokens, lang_masks = self.prepare_language(batch)
+#         actions = self.prepare_action(batch)
+#         actions_is_pad = batch.get("actions_id_pad")
+#         loss_dict = {}
+#         losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
+#         loss_dict["losses_after_forward"] = losses.clone()
+
+#         if actions_is_pad is not None:
+#             in_episode_bound = ~actions_is_pad
+#             losses = losses * in_episode_bound.unsqueeze(-1)
+#             loss_dict["losses_after_in_ep_bound"] = losses.clone()
+
+#         # Remove padding
+#         losses = losses[:, :, : self.config.max_action_dim]
+#         loss_dict["losses_after_rm_padding"] = losses.clone()
+
+#         # For backward pass
+#         loss = losses.mean()
+#         # For backward pass
+#         loss_dict["loss"] = loss.item()
+#         return loss, loss_dict
+
+#     def prepare_images(self, batch):
+#         """Apply SmolVLA preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and
+#         convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP.
+#         """
+#         images = []
+#         img_masks = []
+#         present_img_keys = [key for key in self.config.image_features if key in batch]
+#         missing_img_keys = [key for key in self.config.image_features if key not in batch]
+
+#         if len(present_img_keys) == 0:
+#             raise ValueError(
+#                 f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})"
+#             )
+#         # Preprocess image features present in the batch
+#         for key in present_img_keys:
+#             img = batch[key][:, -1, :, :, :] if batch[key].ndim == 5 else batch[key]
+#             if self.config.resize_imgs_with_padding is not None:
+#                 img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0)
+
+#             # Normalize from range [0,1] to [-1,1] as expacted by siglip
+#             img = img * 2.0 - 1.0
+
+#             bsize = img.shape[0]
+#             device = img.device
+#             if f"{key}_padding_mask" in batch:
+#                 mask = batch[f"{key}_padding_mask"].bool()
+#             else:
+#                 mask = torch.ones(bsize, dtype=torch.bool, device=device)
+#             images.append(img)
+#             img_masks.append(mask)
+
+#         # Create image features not present in the batch
+#         # as fully 0 padded images.
+#         for num_empty_cameras in range(len(missing_img_keys)):
+#             if num_empty_cameras >= self.config.empty_cameras:
+#                 break
+#             img = torch.ones_like(img) * -1
+#             mask = torch.zeros_like(mask)
+#             images.append(img)
+#             img_masks.append(mask)
+#         return images, img_masks
+
+#     def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
+#         """Tokenize the text input"""
+#         device = batch[OBS_STATE].device
+#         tasks = batch["task"]
+#         if isinstance(tasks, str):
+#             tasks = [tasks]
+
+#         if len(tasks) == 1:
+#             tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])]
+
+#         tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
+
+#         tokenized_prompt = self.language_tokenizer.__call__(
+#             tasks,
+#             padding=self.config.pad_language_to,
+#             padding_side="right",
+#             max_length=self.config.tokenizer_max_length,
+#             return_tensors="pt",
+#         )
+#         lang_tokens = tokenized_prompt["input_ids"].to(device=device)
+#         lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool)
+
+#         return lang_tokens, lang_masks
+
+#     def _pi_aloha_decode_state(self, state):
+#         # Flip the joints.
+#         for motor_idx in [1, 2, 8, 9]:
+#             state[:, motor_idx] *= -1
+#         # Reverse the gripper transformation that is being applied by the Aloha runtime.
+#         for motor_idx in [6, 13]:
+#             state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx])
+#         return state
+
+#     def _pi_aloha_encode_actions(self, actions):
+#         # Flip the joints.
+#         for motor_idx in [1, 2, 8, 9]:
+#             actions[:, :, motor_idx] *= -1
+#         # Reverse the gripper transformation that is being applied by the Aloha runtime.
+#         for motor_idx in [6, 13]:
+#             actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx])
+#         return actions
+
+#     def _pi_aloha_encode_actions_inv(self, actions):
+#         # Flip the joints again.
+#         for motor_idx in [1, 2, 8, 9]:
+#             actions[:, :, motor_idx] *= -1
+#         # Reverse the gripper transformation that is being applied by the Aloha runtime.
+#         for motor_idx in [6, 13]:
+#             actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx])
+#         return actions
+
+#     def prepare_state(self, batch):
+#         """Pad state"""
+#         state = batch[OBS_STATE][:, -1, :] if batch[OBS_STATE].ndim > 2 else batch[OBS_STATE]
+#         state = pad_vector(state, self.config.max_state_dim)
+#         return state
+
+#     def prepare_action(self, batch):
+#         """Pad action"""
+#         actions = pad_vector(batch[ACTION], self.config.max_action_dim)
+#         return actions
+
+
+# def pad_tensor(tensor, max_len, pad_value=0):
+#     """
+#     Efficiently pads a tensor along sequence dimension to match max_len.
+
+#     Args:
+#         tensor (torch.Tensor): Shape (B, L, ...) or (B, L).
+#         max_len (int): Fixed sequence length.
+#         pad_value (int/float): Value for padding.
+
+#     Returns:
+#         torch.Tensor: Shape (B, max_len, ...) or (B, max_len).
+#     """
+#     b, d = tensor.shape[:2]
+
+#     # Create a padded tensor of max_len and copy the existing values
+#     padded_tensor = torch.full(
+#         (b, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device
+#     )
+#     padded_tensor[:, :d] = tensor  # Efficient in-place copy
+
+#     return padded_tensor
+
+
+# class VLAFlowMatching(nn.Module):
+#     """
+#     SmolVLA
+
+#     [Paper]()
+
+#     Designed by Hugging Face.
+#     ┌──────────────────────────────┐
+#     │                 actions      │
+#     │                    ▲         │
+#     │ ┌─────────┐      ┌─|────┐    │
+#     │ |         │────► │      │    │
+#     │ |         │ kv   │      │    │
+#     │ |         │────► │Action│    │
+#     │ |   VLM   │cache │Expert│    |
+#     │ │         │────► |      │    │
+#     │ │         │      │      │    │
+#     │ └▲──▲───▲─┘      └───▲──┘    |
+#     │  │  |   |            │       |
+#     │  |  |   |          noise     │
+#     │  │  │ state                  │
+#     │  │ language tokens           │
+#     │  image(s)                    │
+#     └──────────────────────────────┘
+#     """
+
+#     def __init__(self, config: SmolVLAConfig):
+#         super().__init__()
+#         self.config = config
+
+#         self.vlm_with_expert = SmolVLMWithExpertModel(
+#             model_id=self.config.vlm_model_name,
+#             freeze_vision_encoder=self.config.freeze_vision_encoder,
+#             train_expert_only=self.config.train_expert_only,
+#             load_vlm_weights=self.config.load_vlm_weights,
+#             attention_mode=self.config.attention_mode,
+#             num_expert_layers=self.config.num_expert_layers,
+#             num_vlm_layers=self.config.num_vlm_layers,
+#             self_attn_every_n_layers=self.config.self_attn_every_n_layers,
+#             expert_width_multiplier=self.config.expert_width_multiplier,
+#         )
+#         self.state_proj = nn.Linear(
+#             self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size
+#         )
+#         self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size)
+#         self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim)
+
+#         self.action_time_mlp_in = nn.Linear(
+#             self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size
+#         )
+#         self.action_time_mlp_out = nn.Linear(
+#             self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size
+#         )
+
+#         self.set_requires_grad()
+#         self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id
+#         self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id
+#         self.global_image_start_token = torch.tensor(
+#             [self.fake_image_token, self.global_image_token], dtype=torch.long
+#         )
+
+#         self.add_image_special_tokens = self.config.add_image_special_tokens
+#         self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long)
+#         self.prefix_length = self.config.prefix_length
+
+#     def set_requires_grad(self):
+#         for params in self.state_proj.parameters():
+#             params.requires_grad = self.config.train_state_proj
+
+#     def sample_noise(self, shape, device):
+#         noise = torch.normal(
+#             mean=0.0,
+#             std=1.0,
+#             size=shape,
+#             dtype=torch.float32,
+#             device=device,
+#         )
+#         return noise
+
+#     def sample_time(self, bsize, device):
+#         beta_dist = torch.distributions.Beta(concentration1=1.5, concentration0=1.0)
+#         time_beta = beta_dist.sample((bsize,)).to(device=device, dtype=torch.float32)
+#         time = time_beta * 0.999 + 0.001
+#         return time
+
+#     def embed_prefix(
+#         self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None
+#     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+#         """Embed images with SigLIP and language tokens with embedding layer to prepare
+#         for SmolVLM transformer processing.
+#         """
+#         embs = []
+#         pad_masks = []
+#         att_masks = []
+#         for _img_idx, (
+#             img,
+#             img_mask,
+#         ) in enumerate(zip(images, img_masks, strict=False)):
+#             if self.add_image_special_tokens:
+#                 image_start_token = (
+#                     self.vlm_with_expert.embed_language_tokens(
+#                         self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)
+#                     )
+#                     .unsqueeze(0)
+#                     .expand(img.shape[0], -1, -1)
+#                 )
+#                 image_start_mask = torch.ones_like(
+#                     image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device
+#                 )
+#                 att_masks += [0] * (image_start_mask.shape[-1])
+#                 embs.append(image_start_token)
+#                 pad_masks.append(image_start_mask)
+
+#             img_emb = self.vlm_with_expert.embed_image(img)
+#             img_emb = img_emb
+
+#             # Normalize image embeddings
+#             img_emb_dim = img_emb.shape[-1]
+#             img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device)
+
+#             bsize, num_img_embs = img_emb.shape[:2]
+#             img_mask = img_mask[:, None].expand(bsize, num_img_embs)
+
+#             embs.append(img_emb)
+#             pad_masks.append(img_mask)
+
+#             att_masks += [0] * (num_img_embs)
+#             if self.add_image_special_tokens:
+#                 image_end_token = (
+#                     self.vlm_with_expert.embed_language_tokens(
+#                         self.image_end_token.to(device=self.vlm_with_expert.vlm.device)
+#                     )
+#                     .unsqueeze(0)
+#                     .expand(img.shape[0], -1, -1)
+#                 )
+#                 image_end_mask = torch.ones_like(
+#                     image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device
+#                 )
+#                 embs.append(image_end_token)
+#                 pad_masks.append(image_end_mask)
+#                 att_masks += [0] * (image_end_mask.shape[1])
+#         lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens)
+#         # Normalize language embeddings
+#         lang_emb_dim = lang_emb.shape[-1]
+#         lang_emb = lang_emb * math.sqrt(lang_emb_dim)
+
+#         embs.append(lang_emb)
+#         pad_masks.append(lang_masks)
+
+#         num_lang_embs = lang_emb.shape[1]
+#         att_masks += [0] * num_lang_embs
+
+#         state_emb = self.state_proj(state)
+#         state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb
+#         embs.append(state_emb)
+#         bsize = state_emb.shape[0]
+#         device = state_emb.device
+
+#         states_seq_len = state_emb.shape[1]
+#         state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device)
+#         pad_masks.append(state_mask)
+
+#         # Set attention masks so that image and language inputs do not attend to state or actions
+#         att_masks += [1] * (states_seq_len)
+#         embs = torch.cat(embs, dim=1)
+#         pad_masks = torch.cat(pad_masks, dim=1)
+#         att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
+#         att_masks = att_masks[None, :]
+
+#         seq_len = pad_masks.shape[1]
+#         if seq_len < self.prefix_length:
+#             embs = pad_tensor(embs, self.prefix_length, pad_value=0)
+#             pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0)
+#             att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0)
+
+#         att_masks = att_masks.expand(bsize, -1)
+
+#         return embs, pad_masks, att_masks
+
+#     def embed_suffix(self, noisy_actions, timestep):
+#         """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing."""
+#         embs = []
+#         pad_masks = []
+#         att_masks = []
+
+#         # Fuse timestep + action information using an MLP
+#         action_emb = self.action_in_proj(noisy_actions)
+#         device = action_emb.device
+#         bsize = action_emb.shape[0]
+#         dtype = action_emb.dtype
+#         # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
+#         time_emb = create_sinusoidal_pos_embedding(
+#             timestep,
+#             self.vlm_with_expert.expert_hidden_size,
+#             self.config.min_period,
+#             self.config.max_period,
+#             device=device,
+#         )
+#         time_emb = time_emb.type(dtype=dtype)
+
+#         time_emb = time_emb[:, None, :].expand_as(action_emb)
+#         action_time_emb = torch.cat([action_emb, time_emb], dim=2)
+
+#         action_time_emb = self.action_time_mlp_in(action_time_emb)
+#         action_time_emb = F.silu(action_time_emb)  # swish == silu
+#         action_time_emb = self.action_time_mlp_out(action_time_emb)
+
+#         # Add to input tokens
+#         embs.append(action_time_emb)
+
+#         bsize, action_time_dim = action_time_emb.shape[:2]
+#         action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device)
+#         pad_masks.append(action_time_mask)
+
+#         # Set attention masks so that image, language and state inputs do not attend to action tokens
+#         att_masks += [1] * self.config.chunk_size
+#         embs = torch.cat(embs, dim=1)
+#         pad_masks = torch.cat(pad_masks, dim=1)
+#         att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
+#         att_masks = att_masks[None, :].expand(bsize, len(att_masks))
+#         # added by jade
+#         seq_len = pad_masks.shape[1]
+#         if seq_len < self.config.chunk_size:
+#             embs = pad_tensor(embs, self.config.chunk_size, pad_value=0)
+#             pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0)
+#             att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0)
+#         return embs, pad_masks, att_masks
+
+#     def forward(
+#         self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
+#     ) -> Tensor:
+#         """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
+#         #added by jade
+#         if actions.ndim == 2:
+#             actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1)
+#         if noise is None:
+#             noise = self.sample_noise(actions.shape, actions.device)
+
+#         if time is None:
+#             time = self.sample_time(actions.shape[0], actions.device)
+
+#         time_expanded = time[:, None, None]
+#         x_t = time_expanded * noise + (1 - time_expanded) * actions
+#         u_t = noise - actions
+#         prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
+#             images, img_masks, lang_tokens, lang_masks, state=state
+#         )
+#         suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, time)
+
+#         pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
+#         att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
+
+#         att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
+#         position_ids = torch.cumsum(pad_masks, dim=1) - 1
+#         (_, suffix_out), _ = self.vlm_with_expert.forward(
+#             attention_mask=att_2d_masks,
+#             position_ids=position_ids,
+#             past_key_values=None,
+#             inputs_embeds=[prefix_embs, suffix_embs],
+#             use_cache=False,
+#             fill_kv_cache=False,
+#         )
+#         # suffix_out = suffix_out[:, -self.config.chunk_size :]
+#         suffix_out = suffix_out[:, -self.config.chunk_size:, :]
+#         # Original openpi code, upcast attention output
+#         suffix_out = suffix_out.to(dtype=torch.float32)
+#         v_t = self.action_out_proj(suffix_out)
+#         losses = F.mse_loss(u_t, v_t, reduction="none")
+#         return losses
+
+#     def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor:
+#         """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
+#         bsize = state.shape[0]
+#         device = state.device
+
+#         if noise is None:
+#             actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim)
+#             noise = self.sample_noise(actions_shape, device)
+
+#         prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
+#             images, img_masks, lang_tokens, lang_masks, state=state
+#         )
+#         prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
+#         prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
+#         # Compute image and language key value cache
+#         _, past_key_values = self.vlm_with_expert.forward(
+#             attention_mask=prefix_att_2d_masks,
+#             position_ids=prefix_position_ids,
+#             past_key_values=None,
+#             inputs_embeds=[prefix_embs, None],
+#             use_cache=self.config.use_cache,
+#             fill_kv_cache=True,
+#         )
+#         dt = -1.0 / self.config.num_steps
+#         dt = torch.tensor(dt, dtype=torch.float32, device=device)
+
+#         x_t = noise
+#         time = torch.tensor(1.0, dtype=torch.float32, device=device)
+#         while time >= -dt / 2:
+#             expanded_time = time.expand(bsize)
+#             v_t = self.denoise_step(
+#                 prefix_pad_masks,
+#                 past_key_values,
+#                 x_t,
+#                 expanded_time,
+#             )
+#             # Euler step
+#             x_t += dt * v_t
+#             time += dt
+#         return x_t
+
+#     def denoise_step(
+#         self,
+#         prefix_pad_masks,
+#         past_key_values,
+#         x_t,
+#         timestep,
+#     ):
+#         """Apply one denoising step of the noise `x_t` at a given timestep."""
+#         suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, timestep)
+
+#         suffix_len = suffix_pad_masks.shape[1]
+#         batch_size = prefix_pad_masks.shape[0]
+#         prefix_len = prefix_pad_masks.shape[1]
+#         prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
+
+#         suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
+
+#         full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
+#         prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
+#         position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
+
+#         outputs_embeds, _ = self.vlm_with_expert.forward(
+#             attention_mask=full_att_2d_masks,
+#             position_ids=position_ids,
+#             past_key_values=past_key_values,
+#             inputs_embeds=[None, suffix_embs],
+#             use_cache=self.config.use_cache,
+#             fill_kv_cache=False,
+#         )
+#         suffix_out = outputs_embeds[1]
+#         suffix_out = suffix_out[:, -self.config.chunk_size :]
+#         suffix_out = suffix_out.to(dtype=torch.float32)
+#         v_t = self.action_out_proj(suffix_out)
+#         return v_t
 #!/usr/bin/env python
 
 # Copyright 2025 HuggingFace Inc. team. All rights reserved.
@@ -63,7 +1015,7 @@ import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn
 from transformers import AutoProcessor
 
-from lerobot.constants import ACTION
+from lerobot.constants import ACTION, OBS_STATE
 from lerobot.policies.normalize import (
     Normalize,
     Unnormalize,
@@ -75,8 +1027,7 @@ from lerobot.policies.utils import (
     populate_queues,
 )
 from lerobot.utils.utils import get_safe_dtype
-OBS_STATE = 'state'
-ACTION = 'actions'
+
 # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
 _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
 
@@ -825,21 +1776,12 @@ class VLAFlowMatching(nn.Module):
         pad_masks = torch.cat(pad_masks, dim=1)
         att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
         att_masks = att_masks[None, :].expand(bsize, len(att_masks))
-        # added by jade
-        seq_len = pad_masks.shape[1]
-        if seq_len < self.config.chunk_size:
-            embs = pad_tensor(embs, self.config.chunk_size, pad_value=0)
-            pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0)
-            att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0)
         return embs, pad_masks, att_masks
 
     def forward(
         self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
     ) -> Tensor:
         """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
-        #added by jade
-        if actions.ndim == 2:
-            actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1)
         if noise is None:
             noise = self.sample_noise(actions.shape, actions.device)
 
@@ -867,8 +1809,7 @@ class VLAFlowMatching(nn.Module):
             use_cache=False,
             fill_kv_cache=False,
         )
-        # suffix_out = suffix_out[:, -self.config.chunk_size :]
-        suffix_out = suffix_out[:, -self.config.chunk_size:, :]
+        suffix_out = suffix_out[:, -self.config.chunk_size :]
         # Original openpi code, upcast attention output
         suffix_out = suffix_out.to(dtype=torch.float32)
         v_t = self.action_out_proj(suffix_out)
@@ -949,4 +1890,4 @@ class VLAFlowMatching(nn.Module):
         suffix_out = suffix_out[:, -self.config.chunk_size :]
         suffix_out = suffix_out.to(dtype=torch.float32)
         v_t = self.action_out_proj(suffix_out)
-        return v_t
+        return v_t
\ No newline at end of file
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla_v2.py b/src/lerobot/policies/smolvla/modeling_smolvla_v2.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt
new file mode 100644
index 000000000..3410062ba
--- /dev/null
+++ b/src/lerobot/policies/smolvla/saver.txt
@@ -0,0 +1 @@
+c
\ No newline at end of file
diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
index f6a49dccf..e4cd7acac 100644
--- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py
+++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
@@ -77,8 +77,8 @@ class SmolVLMWithExpertModel(nn.Module):
             self.vlm = AutoModelForImageTextToText.from_pretrained(
                 model_id,
                 device_map="auto",
-                # torch_dtype="bfloat16",
-                torch_dtype=torch.float16,
+                torch_dtype="bfloat16",
+                # torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
             )
             config = self.vlm.config
@@ -547,4 +547,4 @@ class SmolVLMWithExpertModel(nn.Module):
         # we use -1 because sequence length can change
         att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim)
 
-        return att_output
+        return att_output
\ No newline at end of file
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 3145bed35..92a3bf833 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -458,6 +458,43 @@ def _compile_episode_data(
     data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1)
 
     return data_dict
+from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
+from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
+def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
+    """Recreate normalization layers with proper stats from the dataset."""
+    from lerobot.policies.normalize import Normalize, Unnormalize
+
+    # Convert numpy stats to the format expected by normalization layers
+    stats = {}
+    for key, stat_dict in dataset_meta.stats.items():
+        stats[key] = {
+            stat_type: torch.from_numpy(stat_array) if isinstance(stat_array, np.ndarray) else stat_array
+            for stat_type, stat_array in stat_dict.items()
+        }
+
+    # Recreate normalization layers with proper stats
+    normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
+
+    normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+
+    unnormalize_outputs = Unnormalize(
+        policy.config.output_features, policy.config.normalization_mapping, stats
+    )
+
+    # Replace the normalization layers on the policy
+    policy.normalize_inputs = normalize_inputs
+    policy.normalize_targets = normalize_targets
+    policy.unnormalize_outputs = unnormalize_outputs
+
+    print("Normalization layers recreated with dataset stats.")
+
+
+def load_smolvla(cfg, dataset_repo: str):
+    from lerobot.datasets.lerobot_dataset import LeRobotDataset
+    dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/')
+    policy = make_policy(cfg=cfg, ds_meta=dataset.meta)
+    _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta)  # only needed if stats are missing
+    return policy, dataset
 
 
 @parser.wrap()
@@ -466,7 +503,9 @@ def eval_main(cfg: EvalPipelineConfig):
 
     # Check device is available
     device = get_safe_torch_device(cfg.policy.device, log=True)
-
+    #login to hf
+    from huggingface_hub import login
+    login()
     torch.backends.cudnn.benchmark = True
     torch.backends.cuda.matmul.allow_tf32 = True
     set_seed(cfg.seed)
@@ -481,6 +520,9 @@ def eval_main(cfg: EvalPipelineConfig):
         cfg=cfg.policy,
         env_cfg=cfg.env,
     )
+    # breakpoint()
+    load_smolvla(cfg.policy, "physical-intelligence/libero")
+    # breakpoint()
     policy.eval()
     with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext():
         if cfg.env.multitask_eval:
diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py
index d5d562518..74219fc38 100644
--- a/src/lerobot/scripts/train.py
+++ b/src/lerobot/scripts/train.py
@@ -104,6 +104,32 @@ def update_policy(
     train_metrics.update_s = time.perf_counter() - start_time
     return train_metrics, output_dict
 
+# def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
+#     """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
+#     from lerobot.policies.normalize import Normalize, Unnormalize
+
+#     if not hasattr(dataset_meta, "stats") or not dataset_meta.stats:
+#         print("⚠️ Dataset has no stats, skipping normalization injection.")
+#         return
+
+#     stats = {}
+#     for key, stat_dict in dataset_meta.stats.items():
+#         stats[key] = {
+#             stat_type: torch.as_tensor(stat_array)
+#             if isinstance(stat_array, np.ndarray)
+#             else stat_array
+#             for stat_type, stat_array in stat_dict.items()
+#         }
+
+#     normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
+#     normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+#     unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+
+#     policy.normalize_inputs = normalize_inputs
+#     policy.normalize_targets = normalize_targets
+#     policy.unnormalize_outputs = unnormalize_outputs
+
+#     print("✅ Normalization layers injected with dataset stats.")
 
 @parser.wrap()
 def train(cfg: TrainPipelineConfig):
@@ -126,7 +152,6 @@ def train(cfg: TrainPipelineConfig):
 
     logging.info("Creating dataset")
     dataset = make_dataset(cfg)
-
     # Create environment used for evaluating checkpoints during training on simulation data.
     # On real-world data, no need to create an environment as evaluations are done outside train.py,
     # using the eval.py instead, with gym_dora environment and dora-rs.
@@ -140,7 +165,6 @@ def train(cfg: TrainPipelineConfig):
         cfg=cfg.policy,
         ds_meta=dataset.meta,
     )
-
     logging.info("Creating optimizer and scheduler")
     optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
     grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp)
@@ -203,7 +227,6 @@ def train(cfg: TrainPipelineConfig):
         start_time = time.perf_counter()
         batch = next(dl_iter)
         train_tracker.dataloading_s = time.perf_counter() - start_time
-
         for key in batch:
             if isinstance(batch[key], torch.Tensor):
                 batch[key] = batch[key].to(device, non_blocking=device.type == "cuda")
diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py
new file mode 100644
index 000000000..26a9e7aea
--- /dev/null
+++ b/src/lerobot/scripts/train_2.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import time
+from contextlib import nullcontext
+from pprint import pformat
+from typing import Any
+
+import torch
+from termcolor import colored
+from torch.amp import GradScaler
+from torch.optim import Optimizer
+
+from lerobot.configs import parser
+from lerobot.configs.train import TrainPipelineConfig
+from lerobot.datasets.factory import make_dataset
+from lerobot.datasets.sampler import EpisodeAwareSampler
+from lerobot.datasets.utils import cycle
+from lerobot.envs.factory import make_env
+from lerobot.optim.factory import make_optimizer_and_scheduler
+from lerobot.policies.factory import make_policy
+from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.policies.utils import get_device_from_parameters
+from lerobot.scripts.eval import eval_policy, eval_policy_multitask
+from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
+from lerobot.utils.random_utils import set_seed
+from lerobot.utils.train_utils import (
+    get_step_checkpoint_dir,
+    get_step_identifier,
+    load_training_state,
+    save_checkpoint,
+    update_last_checkpoint,
+)
+from lerobot.utils.utils import (
+    format_big_number,
+    get_safe_torch_device,
+    has_method,
+    init_logging,
+)
+from lerobot.utils.wandb_utils import WandBLogger
+
+
+def update_policy(
+    train_metrics: MetricsTracker,
+    policy: PreTrainedPolicy,
+    batch: Any,
+    optimizer: Optimizer,
+    grad_clip_norm: float,
+    grad_scaler: GradScaler,
+    lr_scheduler=None,
+    use_amp: bool = False,
+    lock=None,
+) -> tuple[MetricsTracker, dict]:
+    start_time = time.perf_counter()
+    device = get_device_from_parameters(policy)
+    policy.train()
+    with torch.autocast(device_type=device.type) if use_amp else nullcontext():
+        loss, output_dict = policy.forward(batch)
+        # TODO(rcadene): policy.unnormalize_outputs(out_dict)
+    grad_scaler.scale(loss).backward()
+
+    # Unscale the gradient of the optimizer's assigned params in-place **prior to gradient clipping**.
+    grad_scaler.unscale_(optimizer)
+
+    grad_norm = torch.nn.utils.clip_grad_norm_(
+        policy.parameters(),
+        grad_clip_norm,
+        error_if_nonfinite=False,
+    )
+
+    # Optimizer's gradients are already unscaled, so scaler.step does not unscale them,
+    # although it still skips optimizer.step() if the gradients contain infs or NaNs.
+    with lock if lock is not None else nullcontext():
+        grad_scaler.step(optimizer)
+    # Updates the scale for next iteration.
+    grad_scaler.update()
+
+    optimizer.zero_grad()
+
+    # Step through pytorch scheduler at every batch instead of epoch
+    if lr_scheduler is not None:
+        lr_scheduler.step()
+
+    if has_method(policy, "update"):
+        # To possibly update an internal buffer (for instance an Exponential Moving Average like in TDMPC).
+        policy.update()
+
+    train_metrics.loss = loss.item()
+    train_metrics.grad_norm = grad_norm.item()
+    train_metrics.lr = optimizer.param_groups[0]["lr"]
+    train_metrics.update_s = time.perf_counter() - start_time
+    return train_metrics, output_dict
+
+def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
+    """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
+    from lerobot.policies.normalize import Normalize, Unnormalize
+
+    if not hasattr(dataset_meta, "stats") or not dataset_meta.stats:
+        print("⚠️ Dataset has no stats, skipping normalization injection.")
+        return
+
+    stats = {}
+    for key, stat_dict in dataset_meta.stats.items():
+        stats[key] = {
+            stat_type: torch.as_tensor(stat_array)
+            if isinstance(stat_array, np.ndarray)
+            else stat_array
+            for stat_type, stat_array in stat_dict.items()
+        }
+
+    normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
+    normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+    unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+
+    policy.normalize_inputs = normalize_inputs
+    policy.normalize_targets = normalize_targets
+    policy.unnormalize_outputs = unnormalize_outputs
+
+    print("✅ Normalization layers injected with dataset stats.")
+
+@parser.wrap()
+def train(cfg: TrainPipelineConfig):
+    cfg.validate()
+    logging.info(pformat(cfg.to_dict()))
+
+    if cfg.wandb.enable and cfg.wandb.project:
+        wandb_logger = WandBLogger(cfg)
+    else:
+        wandb_logger = None
+        logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
+
+    if cfg.seed is not None:
+        set_seed(cfg.seed)
+
+    # Check device is available
+    device = get_safe_torch_device(cfg.policy.device, log=True)
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+
+    logging.info("Creating dataset")
+    dataset = make_dataset(cfg)
+
+    # Create environment used for evaluating checkpoints during training on simulation data.
+    # On real-world data, no need to create an environment as evaluations are done outside train.py,
+    # using the eval.py instead, with gym_dora environment and dora-rs.
+    eval_env = None
+    if cfg.eval_freq > 0 and cfg.env is not None:
+        logging.info("Creating env")
+        eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
+
+    logging.info("Creating policy")
+    policy = make_policy(
+        cfg=cfg.policy,
+        ds_meta=dataset.meta,
+    )
+    logging.info("Creating optimizer and scheduler")
+    optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
+    grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp)
+
+    step = 0  # number of policy updates (forward + backward + optim)
+
+    if cfg.resume:
+        step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler)
+
+    num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)
+    num_total_params = sum(p.numel() for p in policy.parameters())
+
+    logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
+    if cfg.env is not None:
+        logging.info(f"{cfg.env.task=}")
+    logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
+    logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
+    logging.info(f"{dataset.num_episodes=}")
+    logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
+    logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
+
+    # create dataloader for offline training
+    if hasattr(cfg.policy, "drop_n_last_frames"):
+        shuffle = False
+        sampler = EpisodeAwareSampler(
+            dataset.episode_data_index,
+            drop_n_last_frames=cfg.policy.drop_n_last_frames,
+            shuffle=True,
+        )
+    else:
+        shuffle = True
+        sampler = None
+
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=cfg.num_workers,
+        batch_size=cfg.batch_size,
+        shuffle=shuffle,
+        sampler=sampler,
+        pin_memory=device.type == "cuda",
+        drop_last=False,
+    )
+    dl_iter = cycle(dataloader)
+
+    policy.train()
+    train_metrics = {
+        "loss": AverageMeter("loss", ":.3f"),
+        "grad_norm": AverageMeter("grdn", ":.3f"),
+        "lr": AverageMeter("lr", ":0.1e"),
+        "update_s": AverageMeter("updt_s", ":.3f"),
+        "dataloading_s": AverageMeter("data_s", ":.3f"),
+    }
+
+    train_tracker = MetricsTracker(
+        cfg.batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, initial_step=step
+    )
+
+    logging.info("Start offline training on a fixed dataset")
+    for _ in range(step, cfg.steps):
+        start_time = time.perf_counter()
+        batch = next(dl_iter)
+        train_tracker.dataloading_s = time.perf_counter() - start_time
+
+        for key in batch:
+            if isinstance(batch[key], torch.Tensor):
+                batch[key] = batch[key].to(device, non_blocking=device.type == "cuda")
+
+        train_tracker, output_dict = update_policy(
+            train_tracker,
+            policy,
+            batch,
+            optimizer,
+            cfg.optimizer.grad_clip_norm,
+            grad_scaler=grad_scaler,
+            lr_scheduler=lr_scheduler,
+            use_amp=cfg.policy.use_amp,
+        )
+
+        # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
+        # increment `step` here.
+        step += 1
+        train_tracker.step()
+        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0
+        is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
+        is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0
+
+        if is_log_step:
+            logging.info(train_tracker)
+            if wandb_logger:
+                wandb_log_dict = train_tracker.to_dict()
+                if output_dict:
+                    wandb_log_dict.update(output_dict)
+                wandb_logger.log_dict(wandb_log_dict, step)
+            train_tracker.reset_averages()
+
+        if cfg.save_checkpoint and is_saving_step:
+            logging.info(f"Checkpoint policy after step {step}")
+            checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step)
+            save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler)
+            update_last_checkpoint(checkpoint_dir)
+            if wandb_logger:
+                wandb_logger.log_policy(checkpoint_dir)
+
+        if cfg.env and is_eval_step:
+            step_id = get_step_identifier(step, cfg.steps)
+            logging.info(f"Eval policy at step {step}")
+            with (
+                torch.no_grad(),
+                torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(),
+            ):
+                if cfg.env.multitask_eval:
+                    eval_info = eval_policy_multitask(
+                        eval_env,
+                        policy,
+                        cfg.eval.n_episodes,
+                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
+                        max_episodes_rendered=4,
+                        start_seed=cfg.seed,
+                        max_parallel_tasks=cfg.env.max_parallel_tasks,
+                    )
+                    aggregated = eval_info["overall"]["aggregated"]
+                    # Print per-suite stats, log?
+                    for task_group, task_group_info in eval_info.items():
+                        if task_group == "overall":
+                            continue  # Skip the overall stats since we already printed it
+                        print(f"\nAggregated Metrics for {task_group}:")
+                        print(task_group_info["aggregated"])
+                else:
+                    eval_info = eval_policy(
+                        eval_env,
+                        policy,
+                        cfg.eval.n_episodes,
+                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
+                        max_episodes_rendered=4,
+                        start_seed=cfg.seed,
+                    )
+                    aggregated = eval_info["aggregated"]
+
+            eval_metrics = {
+                "avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),
+                "pc_success": AverageMeter("success", ":.1f"),
+                "eval_s": AverageMeter("eval_s", ":.3f"),
+            }
+            eval_tracker = MetricsTracker(
+                cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step
+            )
+            eval_tracker.eval_s = aggregated.pop("eval_s")
+            eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward")
+            eval_tracker.pc_success = aggregated.pop("pc_success")
+            logging.info(eval_tracker)
+            if wandb_logger:
+                wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
+                wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
+                wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval")
+
+    if eval_env:
+        if cfg.env.multitask_eval:
+            for _task_group, envs_dict in eval_env.items():
+                for _idx, env in envs_dict.items():
+                    env.close()
+        else:
+            eval_env.close()
+    logging.info("End of training")
+
+    if cfg.policy.push_to_hub:
+        policy.push_model_to_hub(cfg)
+
+
+def main():
+    init_logging()
+    train()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py
new file mode 100644
index 000000000..e205f138f
--- /dev/null
+++ b/src/lerobot/scripts/train_accelerate.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import time
+from pprint import pformat
+from typing import Any
+
+import torch
+from accelerate import Accelerator
+from accelerate.utils import set_seed as accelerate_set_seed
+from termcolor import colored
+from torch.optim import Optimizer
+
+from lerobot.datasets.factory import make_dataset
+from lerobot.datasets.sampler import EpisodeAwareSampler
+from lerobot.envs.factory import make_env
+from lerobot.optim.factory import make_optimizer_and_scheduler
+from lerobot.policies.factory import make_policy
+from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
+from lerobot.utils.train_utils import (
+    get_step_checkpoint_dir,
+    get_step_identifier,
+    load_training_state,
+    save_checkpoint,
+    update_last_checkpoint,
+)
+from lerobot.utils.utils import (
+    format_big_number,
+    has_method,
+    init_logging,
+)
+from lerobot.configs import parser
+from lerobot.configs.train import TrainPipelineConfig
+from lerobot.scripts.eval import eval_policy
+
+
+def update_policy(
+    train_metrics: MetricsTracker,
+    policy: PreTrainedPolicy,
+    batch: Any,
+    optimizer: Optimizer,
+    grad_clip_norm: float,
+    accelerator: Accelerator,
+    lr_scheduler=None,
+) -> tuple[MetricsTracker, dict]:
+    start_time = time.perf_counter()
+    policy.train()
+
+    # Use accelerator's autocast context if mixed precision is enabled
+    with accelerator.autocast():
+        loss, output_dict = policy.forward(batch)
+        # TODO(rcadene): policy.unnormalize_outputs(out_dict)
+
+    # Use accelerator for backward pass
+    accelerator.backward(loss)
+
+    # Gradient clipping - accelerator handles unscaling automatically
+    if accelerator.sync_gradients and grad_clip_norm > 0:
+        grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
+    else:
+        grad_norm = torch.tensor(0.0)
+
+    optimizer.step()
+    lr_scheduler.step() if lr_scheduler is not None else None
+    optimizer.zero_grad()
+
+    # Update policy-specific buffers if needed
+    if has_method(policy, "update"):
+        policy.update()
+
+    # Gather metrics across all processes
+    loss_value = accelerator.gather(loss.detach()).mean().item()
+    grad_norm_value = accelerator.gather(grad_norm).mean().item()
+
+    train_metrics.loss = loss_value
+    train_metrics.grad_norm = grad_norm_value
+    train_metrics.lr = optimizer.param_groups[0]["lr"]
+    train_metrics.update_s = time.perf_counter() - start_time
+    return train_metrics, output_dict
+
+
+@parser.wrap()
+def train(cfg: TrainPipelineConfig):
+    cfg.validate()
+    logging.info(pformat(cfg.to_dict()))
+
+    # Initialize accelerator
+    from accelerate.utils import DistributedDataParallelKwargs
+    # added by jade 2 lines
+    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)
+    accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs])
+
+    from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem
+
+    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+    accelerator = Accelerator(
+        mixed_precision="bf16" if cfg.policy.use_amp else "no",
+        gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps,
+        log_with="wandb" if cfg.wandb.enable else None,
+        kwargs_handlers=[ddp_kwargs],
+        project_dir=cfg.output_dir,
+    )
+
+    accelerator.init_trackers(
+        project_name=cfg.wandb.project,
+        init_kwargs={
+            "wandb": {
+                "entity": cfg.wandb.entity,
+                "name": cfg.job_name,
+                "notes": cfg.wandb.notes,
+                "tags": cfg_to_group(cfg, return_list=True),
+                "dir": cfg.output_dir,
+                "config": cfg.to_dict(),
+                "save_code": False,
+                "job_type": "train_eval",
+                "mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online",
+                "resume": "must" if cfg.resume else None,
+                "id": cfg.wandb.run_id
+                if cfg.wandb.run_id
+                else (get_wandb_run_id_from_filesystem(cfg.output_dir) if cfg.resume else None),
+            }
+        },
+    )
+
+    # Set seed for reproducibility
+    if cfg.seed is not None:
+        accelerate_set_seed(cfg.seed)
+
+    # Setup device - accelerator handles device placement
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+
+    # Create dataset
+    if accelerator.is_main_process:
+        logging.info("Creating dataset")
+    dataset = make_dataset(cfg)
+    print("c")
+    # Create evaluation environment (only on main process)
+    eval_env = None
+    if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process:
+        logging.info("Creating env")
+        eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
+
+    # Create policy
+    if accelerator.is_main_process:
+        logging.info("Creating policy")
+
+    # Use accelerator's device instead of cfg.policy.device
+    with accelerator.main_process_first():
+        policy = make_policy(
+            cfg=cfg.policy,
+            ds_meta=dataset.meta,
+        )
+
+    # Create optimizer and scheduler
+    if accelerator.is_main_process:
+        logging.info("Creating optimizer and scheduler")
+    optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
+
+    step = 0  # number of policy updates
+
+    if cfg.resume:
+        step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler)
+
+    # Prepare dataloader
+    if hasattr(cfg.policy, "drop_n_last_frames"):
+        shuffle = False
+        sampler = EpisodeAwareSampler(
+            dataset.episode_data_index,
+            drop_n_last_frames=cfg.policy.drop_n_last_frames,
+            shuffle=True,
+        )
+    else:
+        shuffle = True
+        sampler = None
+
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=cfg.num_workers,
+        batch_size=cfg.batch_size,
+        shuffle=shuffle,
+        sampler=sampler,
+        pin_memory=True,
+        drop_last=True,  # Important for distributed training
+    )
+
+    # Prepare for distributed training
+    policy, optimizer, dataloader, lr_scheduler = accelerator.prepare(
+        policy, optimizer, dataloader, lr_scheduler
+    )
+
+    # Log training info (only on main process)
+    if accelerator.is_main_process:
+        num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)
+        num_total_params = sum(p.numel() for p in policy.parameters())
+
+        logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
+        if cfg.env is not None:
+            logging.info(f"{cfg.env.task=}")
+        logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
+        logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
+        logging.info(f"{dataset.num_episodes=}")
+        logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
+        logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
+        logging.info(f"Number of processes: {accelerator.num_processes}")
+        logging.info(f"Device: {accelerator.device}")
+        logging.info(f"Mixed precision: {accelerator.mixed_precision}")
+
+    # Create metrics trackers
+    train_metrics = {
+        "loss": AverageMeter("loss", ":.3f"),
+        "grad_norm": AverageMeter("grdn", ":.3f"),
+        "lr": AverageMeter("lr", ":0.1e"),
+        "update_s": AverageMeter("updt_s", ":.3f"),
+        "dataloading_s": AverageMeter("data_s", ":.3f"),
+    }
+
+    train_tracker = MetricsTracker(
+        cfg.batch_size * accelerator.num_processes,  # Account for all processes
+        dataset.num_frames,
+        dataset.num_episodes,
+        train_metrics,
+        initial_step=step,
+    )
+
+    # Training loop
+    policy.train()
+    if accelerator.is_main_process:
+        logging.info("Start offline training on a fixed dataset")
+
+    # Create iterator from dataloader
+    dl_iter = iter(dataloader)
+
+    for current_step in range(step, cfg.steps):
+        start_time = time.perf_counter()
+        # Get next batch, cycling through dataloader if needed
+        try:
+            batch = next(dl_iter)
+            print("data laoder batch keys: ", batch.keys())
+            breakpoint()
+        except StopIteration:
+            dl_iter = iter(dataloader)
+            batch = next(dl_iter)
+        train_tracker.dataloading_s = time.perf_counter() - start_time
+        # Update policy
+        train_tracker, output_dict = update_policy(
+            train_tracker,
+            policy,
+            batch,
+            optimizer,
+            cfg.optimizer.grad_clip_norm,
+            accelerator,
+            lr_scheduler=lr_scheduler,
+        )
+
+        # Increment step counter
+        step += 1
+        train_tracker.step()
+
+        # Determine if we should log, save, or evaluate
+        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0
+        is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
+        is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0
+
+        # Logging (only on main process)
+        if is_log_step and accelerator.is_main_process:
+            logging.info(train_tracker)
+            wandb_log_dict = train_tracker.to_dict()
+            if output_dict:
+                wandb_log_dict.update(output_dict)
+            for k, v in wandb_log_dict.items():
+                accelerator.log({f"{'train'}/{k}": v}, step=step)
+            train_tracker.reset_averages()
+
+        # Checkpointing (only on main process)
+        if cfg.save_checkpoint and is_saving_step:
+            # ✅ all processes wait here
+            accelerator.wait_for_everyone()
+
+            if accelerator.is_main_process:
+                logging.info(f"Checkpoint policy after step {step}")
+                checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step)
+
+                unwrapped_policy = accelerator.unwrap_model(policy)
+                save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler)
+                update_last_checkpoint(checkpoint_dir)
+
+            # ✅ all processes sync again after saving
+            accelerator.wait_for_everyone()
+
+            # if wandb_logger:
+            #     wandb_logger.log_policy(checkpoint_dir)
+
+        # Evaluation (only on main process)
+        if cfg.env and is_eval_step and accelerator.is_main_process:
+            step_id = get_step_identifier(step, cfg.steps)
+            logging.info(f"Eval policy at step {step}")
+
+            # Unwrap model for evaluation
+            unwrapped_policy = accelerator.unwrap_model(policy)
+            unwrapped_policy.eval()
+
+            with torch.no_grad():
+                eval_info = eval_policy(
+                    eval_env,
+                    unwrapped_policy,
+                    cfg.eval.n_episodes,
+                    videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
+                    max_episodes_rendered=4,
+                    start_seed=cfg.seed,
+                )
+
+            eval_metrics = {
+                "avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),
+                "pc_success": AverageMeter("success", ":.1f"),
+                "eval_s": AverageMeter("eval_s", ":.3f"),
+            }
+            eval_tracker = MetricsTracker(
+                cfg.batch_size * accelerator.num_processes,
+                dataset.num_frames,
+                dataset.num_episodes,
+                eval_metrics,
+                initial_step=step,
+            )
+            eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s")
+            eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward")
+            eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success")
+            logging.info(eval_tracker)
+
+            wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
+            for k, v in wandb_log_dict.items():
+                accelerator.log({f"{'eval'}/{k}": v}, step=step)
+
+            # Set back to training mode
+            policy.train()
+
+    # Wait for all processes to finish
+    accelerator.wait_for_everyone()
+
+    # Cleanup
+    if eval_env and accelerator.is_main_process:
+        eval_env.close()
+
+    if accelerator.is_main_process:
+        logging.info("End of training")
+        accelerator.end_training() # added by jade
+
+
+if __name__ == "__main__":
+    init_logging()
+    train()
diff --git a/tmux_log.txt b/tmux_log.txt
new file mode 100644
index 000000000..4936578bc
--- /dev/null
+++ b/tmux_log.txt
@@ -0,0 +1,2008 @@
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-08 13:23:15 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-08 13:23:15 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-08 13:23:15 ccelerate.py:99 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'HuggingFaceVLA/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.75,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': -1,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
+ 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
+.
+WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
+ 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
+.
+INFO 2025-09-08 13:23:15 celerate.py:149 Creating dataset
+Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 35414.48it/s]
+Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 5660.00it/s]
+Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 43760.67it/s]
+Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 5629.72it/s]
+c
+INFO 2025-09-08 13:23:22 celerate.py:160 Creating policy
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+c
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
+ UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
+ser.
+  warnings.warn(  # warn only once
+[rank1]:[W908 13:23:22.785516795 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 as device used b
+y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
+ can pecify device_id in init_process_group() to force use of a particular device.
+Reducing the number of VLM layers to 16 ...
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
+ UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
+ser.
+  warnings.warn(  # warn only once
+[rank0]:[W908 13:23:43.028071493 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 as device used b
+y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
+ can pecify device_id in init_process_group() to force use of a particular device.
+INFO 2025-09-08 13:23:43 celerate.py:171 Creating optimizer and scheduler
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-08 13:24:04 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla
+_lr1e-4bs32steps100000
+INFO 2025-09-08 13:24:04 celerate.py:213 cfg.env.task='libero_spatial'
+INFO 2025-09-08 13:24:04 celerate.py:214 cfg.steps=100000 (100K)
+INFO 2025-09-08 13:24:04 celerate.py:215 dataset.num_frames=273465 (273K)
+INFO 2025-09-08 13:24:04 celerate.py:216 dataset.num_episodes=1693
+INFO 2025-09-08 13:24:04 celerate.py:217 num_learnable_params=99880992 (100M)
+INFO 2025-09-08 13:24:04 celerate.py:218 num_total_params=450046220 (450M)
+INFO 2025-09-08 13:24:04 celerate.py:219 Number of processes: 2
+INFO 2025-09-08 13:24:04 celerate.py:220 Device: cuda:0
+INFO 2025-09-08 13:24:04 celerate.py:221 Mixed precision: bf16
+INFO 2025-09-08 13:24:04 celerate.py:243 Start offline training on a fixed dataset
+
+bach:  dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp
+', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima
+ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task'])
+> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train()
+-> train_tracker, output_dict = update_policy(
+(Pdb)
+bach:  dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp
+', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima
+ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task'])
+> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train()
+-> train_tracker, output_dict = update_policy(
+(Pdb) batch.keys()[rank0]:[W908 13:24:43.868440913 reducer.cpp:1430] Warning: find_unused_parameters=True was specif
+ied in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra tr
+aversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never h
+as any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false
+positive if your model has flow control causing later iterations to have unused parameters. (function operator())
+policy.config.input_features
+*** SyntaxError: invalid syntax
+(Pdb) policy.config.input_features
+*** AttributeError: 'DistributedDataParallel' object has no attribute 'config'
+(Pdb) policy
+DistributedDataParallel(
+  (module): SmolVLAPolicy(
+    (normalize_inputs): Normalize(
+      (buffer_observation_state): ParameterDict(
+          (mean): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)]
+          (std): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)]
+      )
+    )
+    (normalize_targets): Normalize(
+      (buffer_action): ParameterDict(
+          (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
+          (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
+      )
+    )
+    (unnormalize_outputs): Unnormalize(
+      (buffer_action): ParameterDict(
+          (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
+          (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
+      )
+    )
+    (model): VLAFlowMatching(
+      (vlm_with_expert): SmolVLMWithExpertModel(
+        (vlm): SmolVLMForConditionalGeneration(
+          (model): SmolVLMModel(
+            (vision_model): SmolVLMVisionTransformer(
+              (embeddings): SmolVLMVisionEmbeddings(
+                (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
+                (position_embedding): Embedding(1024, 768)
+              )
+              (encoder): SmolVLMEncoder(
+                (layers): ModuleList(
+                  (0-11): 12 x SmolVLMEncoderLayer(
+                    (self_attn): SmolVLMVisionAttention(
+                      (k_proj): Linear(in_features=768, out_features=768, bias=True)
+                      (v_proj): Linear(in_features=768, out_features=768, bias=True)
+                      (q_proj): Linear(in_features=768, out_features=768, bias=True)
+                      (out_proj): Linear(in_features=768, out_features=768, bias=True)
+                    )
+                    (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
+                    (mlp): SmolVLMVisionMLP(
+                      (activation_fn): PytorchGELUTanh()
+                      (fc1): Linear(in_features=768, out_features=3072, bias=True)
+                      (fc2): Linear(in_features=3072, out_features=768, bias=True)
+                    )
+                    (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
+                  )
+                )
+              )
+              (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
+            )
+            (connector): SmolVLMConnector(
+              (modality_projection): SmolVLMSimpleMLP(
+                (proj): Linear(in_features=12288, out_features=960, bias=False)
+              )
+            )
+            (text_model): LlamaModel(
+              (embed_tokens): Embedding(49280, 960, padding_idx=2)
+              (layers): ModuleList(
+                (0-15): 16 x LlamaDecoderLayer(
+                  (self_attn): LlamaAttention(
+                    (q_proj): Linear(in_features=960, out_features=960, bias=False)
+                    (k_proj): Linear(in_features=960, out_features=320, bias=False)
+                    (v_proj): Linear(in_features=960, out_features=320, bias=False)
+                    (o_proj): Linear(in_features=960, out_features=960, bias=False)
+                  )
+                  (mlp): LlamaMLP(
+                    (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
+                    (up_proj): Linear(in_features=960, out_features=2560, bias=False)
+                    (down_proj): Linear(in_features=2560, out_features=960, bias=False)
+                    (act_fn): SiLU()
+                  )
+                  (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
+                  (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
+                )
+              )
+              (norm): LlamaRMSNorm((960,), eps=1e-05)
+              (rotary_emb): LlamaRotaryEmbedding()
+            )
+          )
+          (lm_head): Linear(in_features=960, out_features=49280, bias=False)
+        )
+        (lm_expert): LlamaModel(
+          (embed_tokens): None
+          (layers): ModuleList(
+            (0): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (1): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (2): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (3): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (4): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (5): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (6): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (7): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (8): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (9): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (10): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (11): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (12): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (13): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (14): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=720, out_features=320, bias=False)
+                (v_proj): Linear(in_features=720, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+            (15): LlamaDecoderLayer(
+              (self_attn): LlamaAttention(
+                (q_proj): Linear(in_features=720, out_features=960, bias=False)
+                (k_proj): Linear(in_features=320, out_features=320, bias=False)
+                (v_proj): Linear(in_features=320, out_features=320, bias=False)
+                (o_proj): Linear(in_features=960, out_features=720, bias=False)
+              )
+              (mlp): LlamaMLP(
+                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
+                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
+                (act_fn): SiLU()
+              )
+              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
+            )
+          )
+          (norm): LlamaRMSNorm((720,), eps=1e-05)
+          (rotary_emb): LlamaRotaryEmbedding()
+        )
+      )
+      (state_proj): Linear(in_features=32, out_features=960, bias=True)
+      (action_in_proj): Linear(in_features=32, out_features=720, bias=True)
+      (action_out_proj): Linear(in_features=720, out_features=32, bias=True)
+      (action_time_mlp_in): Linear(in_features=1440, out_features=720, bias=True)
+      (action_time_mlp_out): Linear(in_features=720, out_features=720, bias=True)
+    )
+  )
+)
+(Pdb) policy.config
+*** AttributeError: 'DistributedDataParallel' object has no attribute 'config'
+(Pdb) policy.input_features
+*** AttributeError: 'DistributedDataParallel' object has no attribute 'input_features'
+(Pdb) quit()
+[rank1]: Traceback (most recent call last):
+[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 368, in <module>
+[rank1]:     train()
+[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
+[rank1]:     response = fn(cfg, *args, **kwargs)
+[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train
+[rank1]:     train_tracker, output_dict = update_policy(
+[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train
+[rank1]:     train_tracker, output_dict = update_policy(
+[rank1]:   File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
+[rank1]:     return self.dispatch_line(frame)
+[rank1]:   File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
+[rank1]:     if self.quitting: raise BdbQuit
+[rank1]: bdb.BdbQuit
+W0908 13:25:34.274000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 776
+663 closing signal SIGTERM
+E0908 13:25:34.589000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1
+) local_rank: 1 (pid: 776664) of binary: /home/jade_choghari/miniconda3/envs/lerobot/bin/python
+Traceback (most recent call last):
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
+e 1245, in <module>
+    main()
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
+e 1241, in main
+    launch_command(args)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
+e 1226, in launch_command
+    multi_gpu_launcher(args)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
+e 853, in multi_gpu_launcher
+    distrib_run.run(args)
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/run.py", line 883
+, in run
+    elastic_launch(
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py",
+ line 139, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py",
+ line 270, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+src/lerobot/scripts/train_accelerate.py FAILED
+------------------------------------------------------------
+Failures:
+  <NO_OTHER_FAILURES>
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2025-09-08_13:25:34
+  host      : hf-dgx-01
+  rank      : 1 (local_rank: 1)
+  exitcode  : 1 (pid: 776664)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/7_train_acc.sh
+Training dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000
+The following values were not passed to `accelerate launch` and had defaults used instead:
+                More than one GPU was found, enabling multi-GPU training.
+                If this was unintended please pass in `--num_processes=1`.
+        `--dynamo_backend` was set to a value of `'no'`
+To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/utils/launch.py:238: UserWarning
+: Port `29522` is already in use. Accelerate will attempt to launch in a standalone-like mode by finding an open por
+t automatically for this session. If this current attempt fails, or for more control in future runs, please specify
+a different port (e.g., `--main_process_port <your_chosen_port>`) or use `--main_process_port 0` for automatic selec
+tion in your launch command or Accelerate config file.
+  warnings.warn(
+INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'HuggingFaceVLA/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.75,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': -1,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda.
+WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
+INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32,
+ 'dataset': {'episodes': None,
+             'image_transforms': {'enable': False,
+                                  'max_num_transforms': 3,
+                                  'random_order': False,
+                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
+                                                                                   1.2]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'contrast': {'kwargs': {'contrast': [0.8,
+                                                                               1.2]},
+                                                       'type': 'ColorJitter',
+                                                       'weight': 1.0},
+                                          'hue': {'kwargs': {'hue': [-0.05,
+                                                                     0.05]},
+                                                  'type': 'ColorJitter',
+                                                  'weight': 1.0},
+                                          'saturation': {'kwargs': {'saturation': [0.5,
+                                                                                   1.5]},
+                                                         'type': 'ColorJitter',
+                                                         'weight': 1.0},
+                                          'sharpness': {'kwargs': {'sharpness': [0.5,
+                                                                                 1.5]},
+                                                        'type': 'SharpnessJitter',
+                                                        'weight': 1.0}}},
+             'repo_id': 'HuggingFaceVLA/libero',
+             'revision': None,
+             'root': '/raid/jade/.cache/huggingface/datasets',
+             'use_imagenet_stats': True,
+             'video_backend': 'torchcodec'},
+ 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
+         'episode_length': 520,
+         'features': {'action': {'shape': [7],
+                                 'type': <FeatureType.ACTION: 'ACTION'>},
+                      'agent_pos': {'shape': [8],
+                                    'type': <FeatureType.STATE: 'STATE'>},
+                      'pixels/agentview_image': {'shape': [360, 360, 3],
+                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
+                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
+                                                                    360,
+                                                                    3],
+                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
+         'features_map': {'action': 'action',
+                          'agent_pos': 'observation.state',
+                          'pixels/agentview_image': 'observation.images.image',
+                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
+         'fps': 30,
+         'init_states': True,
+         'max_parallel_tasks': 5,
+         'multitask_eval': True,
+         'obs_type': 'pixels_agent_pos',
+         'render_mode': 'rgb_array',
+         'task': 'libero_spatial',
+         'type': 'libero'},
+ 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
+ 'eval_freq': 0,
+ 'job_name': 'libero_smolvla',
+ 'log_freq': 200,
+ 'num_workers': 4,
+ 'optimizer': {'betas': [0.9, 0.95],
+               'eps': 1e-08,
+               'grad_clip_norm': 10,
+               'lr': 0.0001,
+               'type': 'adamw',
+               'weight_decay': 1e-10},
+ 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000',
+ 'policy': {'adapt_to_pi_aloha': False,
+            'add_image_special_tokens': False,
+            'attention_mode': 'cross_attn',
+            'chunk_size': 50,
+            'device': 'cuda',
+            'empty_cameras': 0,
+            'expert_width_multiplier': 0.75,
+            'freeze_vision_encoder': True,
+            'gradient_accumulation_steps': 1,
+            'input_features': {},
+            'license': None,
+            'load_vlm_weights': False,
+            'max_action_dim': 32,
+            'max_period': 4.0,
+            'max_state_dim': 32,
+            'min_period': 0.004,
+            'n_action_steps': 1,
+            'n_obs_steps': 1,
+            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
+                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
+            'num_expert_layers': -1,
+            'num_steps': 10,
+            'num_vlm_layers': 16,
+            'optimizer_betas': [0.9, 0.95],
+            'optimizer_eps': 1e-08,
+            'optimizer_grad_clip_norm': 10,
+            'optimizer_lr': 0.0001,
+            'optimizer_weight_decay': 1e-10,
+            'output_features': {},
+            'pad_language_to': 'longest',
+            'prefix_length': -1,
+            'private': None,
+            'push_to_hub': True,
+            'repo_id': 'None',
+            'resize_imgs_with_padding': [512, 512],
+            'scheduler_decay_lr': 2.5e-06,
+            'scheduler_decay_steps': 30000,
+            'scheduler_warmup_steps': 1000,
+            'self_attn_every_n_layers': 2,
+            'tags': None,
+            'tokenizer_max_length': 48,
+            'train_expert_only': True,
+            'train_state_proj': True,
+            'type': 'smolvla',
+            'use_amp': True,
+            'use_cache': True,
+            'use_delta_joint_actions_aloha': False,
+            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
+ 'resume': False,
+ 'save_checkpoint': True,
+ 'save_freq': 20000,
+ 'scheduler': {'decay_lr': 2.5e-06,
+               'num_decay_steps': 30000,
+               'num_warmup_steps': 1000,
+               'peak_lr': 0.0001,
+               'type': 'cosine_decay_with_warmup'},
+ 'seed': 1000,
+ 'steps': 100000,
+ 'use_policy_training_preset': True,
+ 'wandb': {'disable_artifact': False,
+           'enable': False,
+           'entity': None,
+           'mode': None,
+           'notes': None,
+           'project': 'lerobot',
+           'run_id': None}}
+WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
+ 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
+.
+WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
+ 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
+.
+INFO 2025-09-08 13:33:47 celerate.py:149 Creating dataset
+Resolving data files: 100%|████████████████████████████████| 1693/1693 [00:00<00:00, 103295.66it/s]
+Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 5229.81it/s]
+Resolving data files: 100%|████████████████████████████████| 1693/1693 [00:00<00:00, 360601.09it/s]
+Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 4881.54it/s]
+c
+INFO 2025-09-08 13:33:53 celerate.py:160 Creating policy
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+c
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
+ UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
+ser.
+  warnings.warn(  # warn only once
+[rank1]:[W908 13:33:54.613597516 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 as device used b
+y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
+ can pecify device_id in init_process_group() to force use of a particular device.
+Reducing the number of VLM layers to 16 ...
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
+ UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
+ser.
+  warnings.warn(  # warn only once
+[rank0]:[W908 13:34:15.806448425 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 as device used b
+y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
+ can pecify device_id in init_process_group() to force use of a particular device.
+INFO 2025-09-08 13:34:15 celerate.py:171 Creating optimizer and scheduler
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
+g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+Reducing the number of VLM layers to 16 ...
+INFO 2025-09-08 13:34:36 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla
+_lr1e-4bs32steps100000
+INFO 2025-09-08 13:34:36 celerate.py:213 cfg.env.task='libero_spatial'
+INFO 2025-09-08 13:34:36 celerate.py:214 cfg.steps=100000 (100K)
+INFO 2025-09-08 13:34:36 celerate.py:215 dataset.num_frames=273465 (273K)
+INFO 2025-09-08 13:34:36 celerate.py:216 dataset.num_episodes=1693
+INFO 2025-09-08 13:34:36 celerate.py:217 num_learnable_params=99880992 (100M)
+INFO 2025-09-08 13:34:36 celerate.py:218 num_total_params=450046220 (450M)
+INFO 2025-09-08 13:34:36 celerate.py:219 Number of processes: 2
+INFO 2025-09-08 13:34:36 celerate.py:220 Device: cuda:0
+INFO 2025-09-08 13:34:36 celerate.py:221 Mixed precision: bf16
+INFO 2025-09-08 13:34:36 celerate.py:243 Start offline training on a fixed dataset
+[rank1]:[W908 13:34:39.454560620 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru
+ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut
+ograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused para
+meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m
+odel has flow control causing later iterations to have unused parameters. (function operator())
+[rank0]:[W908 13:34:40.502702504 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru
+ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut
+ograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused para
+meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m
+odel has flow control causing later iterations to have unused parameters. (function operator())
+INFO 2025-09-08 13:36:23 celerate.py:281 step:200 smpl:13K ep:79 epch:0.05 loss:0.963 grdn:2.699 lr:2.0e-05 updt_s:0
+.506 data_s:0.027
+INFO 2025-09-08 13:38:09 celerate.py:281 step:400 smpl:26K ep:158 epch:0.09 loss:0.389 grdn:3.127 lr:6.0e-05 updt_s:
+0.525 data_s:0.003
+INFO 2025-09-08 13:39:53 celerate.py:281 step:600 smpl:38K ep:238 epch:0.14 loss:0.261 grdn:2.618 lr:9.5e-05 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 13:41:37 celerate.py:281 step:800 smpl:51K ep:317 epch:0.19 loss:0.231 grdn:1.684 lr:9.9e-05 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 13:43:21 celerate.py:281 step:1K smpl:64K ep:396 epch:0.23 loss:0.211 grdn:1.258 lr:9.9e-05 updt_s:0
+.514 data_s:0.003
+INFO 2025-09-08 13:45:05 celerate.py:281 step:1K smpl:77K ep:475 epch:0.28 loss:0.198 grdn:1.032 lr:9.9e-05 updt_s:0
+.517 data_s:0.003
+INFO 2025-09-08 13:46:49 celerate.py:281 step:1K smpl:90K ep:555 epch:0.33 loss:0.182 grdn:0.880 lr:9.8e-05 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 13:48:33 celerate.py:281 step:2K smpl:102K ep:634 epch:0.37 loss:0.167 grdn:0.744 lr:9.8e-05 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 13:50:17 celerate.py:281 step:2K smpl:115K ep:713 epch:0.42 loss:0.157 grdn:0.680 lr:9.7e-05 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 13:52:01 celerate.py:281 step:2K smpl:128K ep:792 epch:0.47 loss:0.147 grdn:0.612 lr:9.6e-05 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 13:53:44 celerate.py:281 step:2K smpl:141K ep:872 epch:0.51 loss:0.142 grdn:0.576 lr:9.5e-05 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-08 13:55:27 celerate.py:281 step:2K smpl:154K ep:951 epch:0.56 loss:0.136 grdn:0.523 lr:9.4e-05 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 13:57:11 celerate.py:281 step:3K smpl:166K ep:1K epch:0.61 loss:0.132 grdn:0.509 lr:9.3e-05 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 13:58:57 celerate.py:281 step:3K smpl:179K ep:1K epch:0.66 loss:0.126 grdn:0.492 lr:9.2e-05 updt_s:0
+.525 data_s:0.003
+INFO 2025-09-08 14:00:43 celerate.py:281 step:3K smpl:192K ep:1K epch:0.70 loss:0.124 grdn:0.467 lr:9.1e-05 updt_s:0
+.525 data_s:0.003
+INFO 2025-09-08 14:02:26 celerate.py:281 step:3K smpl:205K ep:1K epch:0.75 loss:0.119 grdn:0.438 lr:9.0e-05 updt_s:0
+.508 data_s:0.003
+INFO 2025-09-08 14:04:27 celerate.py:281 step:3K smpl:218K ep:1K epch:0.80 loss:0.118 grdn:0.426 lr:8.9e-05 updt_s:0
+.564 data_s:0.039
+INFO 2025-09-08 14:06:10 celerate.py:281 step:4K smpl:230K ep:1K epch:0.84 loss:0.116 grdn:0.422 lr:8.7e-05 updt_s:0
+.511 data_s:0.004
+INFO 2025-09-08 14:07:55 celerate.py:281 step:4K smpl:243K ep:2K epch:0.89 loss:0.113 grdn:0.395 lr:8.6e-05 updt_s:0
+.517 data_s:0.003
+INFO 2025-09-08 14:09:38 celerate.py:281 step:4K smpl:256K ep:2K epch:0.94 loss:0.111 grdn:0.401 lr:8.5e-05 updt_s:0
+.511 data_s:0.003
+INFO 2025-09-08 14:11:21 celerate.py:281 step:4K smpl:269K ep:2K epch:0.98 loss:0.110 grdn:0.380 lr:8.3e-05 updt_s:0
+.511 data_s:0.003
+INFO 2025-09-08 14:13:08 celerate.py:281 step:4K smpl:282K ep:2K epch:1.03 loss:0.109 grdn:0.381 lr:8.2e-05 updt_s:0
+.413 data_s:0.119
+INFO 2025-09-08 14:14:52 celerate.py:281 step:5K smpl:294K ep:2K epch:1.08 loss:0.107 grdn:0.387 lr:8.0e-05 updt_s:0
+.373 data_s:0.146
+INFO 2025-09-08 14:16:36 celerate.py:281 step:5K smpl:307K ep:2K epch:1.12 loss:0.107 grdn:0.366 lr:7.8e-05 updt_s:0
+.446 data_s:0.072
+INFO 2025-09-08 14:18:19 celerate.py:281 step:5K smpl:320K ep:2K epch:1.17 loss:0.105 grdn:0.347 lr:7.6e-05 updt_s:0
+.468 data_s:0.045
+INFO 2025-09-08 14:20:01 celerate.py:281 step:5K smpl:333K ep:2K epch:1.22 loss:0.103 grdn:0.350 lr:7.5e-05 updt_s:0
+.510 data_s:0.003
+INFO 2025-09-08 14:21:46 celerate.py:281 step:5K smpl:346K ep:2K epch:1.26 loss:0.101 grdn:0.336 lr:7.3e-05 updt_s:0
+.512 data_s:0.011
+INFO 2025-09-08 14:23:30 celerate.py:281 step:6K smpl:358K ep:2K epch:1.31 loss:0.102 grdn:0.345 lr:7.1e-05 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 14:25:15 celerate.py:281 step:6K smpl:371K ep:2K epch:1.36 loss:0.100 grdn:0.333 lr:6.9e-05 updt_s:0
+.521 data_s:0.003
+INFO 2025-09-08 14:26:59 celerate.py:281 step:6K smpl:384K ep:2K epch:1.40 loss:0.100 grdn:0.328 lr:6.7e-05 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 14:28:43 celerate.py:281 step:6K smpl:397K ep:2K epch:1.45 loss:0.099 grdn:0.319 lr:6.5e-05 updt_s:0
+.512 data_s:0.003
+INFO 2025-09-08 14:30:26 celerate.py:281 step:6K smpl:410K ep:3K epch:1.50 loss:0.098 grdn:0.313 lr:6.3e-05 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 14:32:11 celerate.py:281 step:7K smpl:422K ep:3K epch:1.54 loss:0.097 grdn:0.319 lr:6.1e-05 updt_s:0
+.519 data_s:0.004
+INFO 2025-09-08 14:33:55 celerate.py:281 step:7K smpl:435K ep:3K epch:1.59 loss:0.097 grdn:0.312 lr:5.9e-05 updt_s:0
+.506 data_s:0.010
+INFO 2025-09-08 14:35:39 celerate.py:281 step:7K smpl:448K ep:3K epch:1.64 loss:0.097 grdn:0.307 lr:5.7e-05 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 14:37:23 celerate.py:281 step:7K smpl:461K ep:3K epch:1.69 loss:0.095 grdn:0.294 lr:5.5e-05 updt_s:0
+.518 data_s:0.003
+INFO 2025-09-08 14:39:07 celerate.py:281 step:7K smpl:474K ep:3K epch:1.73 loss:0.095 grdn:0.299 lr:5.3e-05 updt_s:0
+.507 data_s:0.007
+INFO 2025-09-08 14:40:52 celerate.py:281 step:8K smpl:486K ep:3K epch:1.78 loss:0.094 grdn:0.283 lr:5.1e-05 updt_s:0
+.523 data_s:0.003
+INFO 2025-09-08 14:42:36 celerate.py:281 step:8K smpl:499K ep:3K epch:1.83 loss:0.093 grdn:0.284 lr:4.9e-05 updt_s:0
+.517 data_s:0.003
+INFO 2025-09-08 14:44:22 celerate.py:281 step:8K smpl:512K ep:3K epch:1.87 loss:0.092 grdn:0.284 lr:4.7e-05 updt_s:0
+.465 data_s:0.060
+INFO 2025-09-08 14:46:06 celerate.py:281 step:8K smpl:525K ep:3K epch:1.92 loss:0.093 grdn:0.292 lr:4.5e-05 updt_s:0
+.456 data_s:0.066
+INFO 2025-09-08 14:47:49 celerate.py:281 step:8K smpl:538K ep:3K epch:1.97 loss:0.093 grdn:0.290 lr:4.3e-05 updt_s:0
+.510 data_s:0.003
+INFO 2025-09-08 14:49:37 celerate.py:281 step:9K smpl:550K ep:3K epch:2.01 loss:0.092 grdn:0.283 lr:4.1e-05 updt_s:0
+.419 data_s:0.117
+INFO 2025-09-08 14:51:20 celerate.py:281 step:9K smpl:563K ep:3K epch:2.06 loss:0.092 grdn:0.275 lr:3.9e-05 updt_s:0
+.463 data_s:0.053
+INFO 2025-09-08 14:53:05 celerate.py:281 step:9K smpl:576K ep:4K epch:2.11 loss:0.090 grdn:0.272 lr:3.7e-05 updt_s:0
+.517 data_s:0.003
+INFO 2025-09-08 14:54:49 celerate.py:281 step:9K smpl:589K ep:4K epch:2.15 loss:0.090 grdn:0.268 lr:3.5e-05 updt_s:0
+.506 data_s:0.013
+INFO 2025-09-08 14:56:32 celerate.py:281 step:9K smpl:602K ep:4K epch:2.20 loss:0.090 grdn:0.271 lr:3.3e-05 updt_s:0
+.513 data_s:0.003
+INFO 2025-09-08 14:58:17 celerate.py:281 step:10K smpl:614K ep:4K epch:2.25 loss:0.090 grdn:0.268 lr:3.1e-05 updt_s:
+0.520 data_s:0.003
+INFO 2025-09-08 15:00:02 celerate.py:281 step:10K smpl:627K ep:4K epch:2.29 loss:0.089 grdn:0.261 lr:3.0e-05 updt_s:
+0.519 data_s:0.003
+INFO 2025-09-08 15:01:48 celerate.py:281 step:10K smpl:640K ep:4K epch:2.34 loss:0.090 grdn:0.271 lr:2.8e-05 updt_s:
+0.526 data_s:0.003
+INFO 2025-09-08 15:03:33 celerate.py:281 step:10K smpl:653K ep:4K epch:2.39 loss:0.089 grdn:0.262 lr:2.6e-05 updt_s:
+0.521 data_s:0.003
+INFO 2025-09-08 15:05:18 celerate.py:281 step:10K smpl:666K ep:4K epch:2.43 loss:0.090 grdn:0.264 lr:2.4e-05 updt_s:
+0.519 data_s:0.003
+INFO 2025-09-08 15:07:32 celerate.py:281 step:11K smpl:678K ep:4K epch:2.48 loss:0.089 grdn:0.255 lr:2.3e-05 updt_s:
+0.663 data_s:0.004
+INFO 2025-09-08 15:09:21 celerate.py:281 step:11K smpl:691K ep:4K epch:2.53 loss:0.090 grdn:0.263 lr:2.1e-05 updt_s:
+0.514 data_s:0.030
+INFO 2025-09-08 15:11:06 celerate.py:281 step:11K smpl:704K ep:4K epch:2.57 loss:0.088 grdn:0.254 lr:1.9e-05 updt_s:
+0.517 data_s:0.006
+INFO 2025-09-08 15:12:51 celerate.py:281 step:11K smpl:717K ep:4K epch:2.62 loss:0.088 grdn:0.252 lr:1.8e-05 updt_s:
+0.517 data_s:0.005
+INFO 2025-09-08 15:14:38 celerate.py:281 step:11K smpl:730K ep:5K epch:2.67 loss:0.088 grdn:0.251 lr:1.6e-05 updt_s:
+0.532 data_s:0.003
+INFO 2025-09-08 15:16:23 celerate.py:281 step:12K smpl:742K ep:5K epch:2.71 loss:0.088 grdn:0.253 lr:1.5e-05 updt_s:
+0.520 data_s:0.003
+INFO 2025-09-08 15:18:08 celerate.py:281 step:12K smpl:755K ep:5K epch:2.76 loss:0.087 grdn:0.244 lr:1.4e-05 updt_s:
+0.521 data_s:0.003
+INFO 2025-09-08 15:19:54 celerate.py:281 step:12K smpl:768K ep:5K epch:2.81 loss:0.088 grdn:0.247 lr:1.2e-05 updt_s:
+0.524 data_s:0.003
+INFO 2025-09-08 15:21:39 celerate.py:281 step:12K smpl:781K ep:5K epch:2.86 loss:0.087 grdn:0.242 lr:1.1e-05 updt_s:
+0.520 data_s:0.003
+INFO 2025-09-08 15:23:32 celerate.py:281 step:12K smpl:794K ep:5K epch:2.90 loss:0.088 grdn:0.243 lr:1.0e-05 updt_s:
+0.560 data_s:0.003
+INFO 2025-09-08 15:25:48 celerate.py:281 step:13K smpl:806K ep:5K epch:2.95 loss:0.087 grdn:0.240 lr:9.0e-06 updt_s:
+0.674 data_s:0.005
+INFO 2025-09-08 15:28:02 celerate.py:281 step:13K smpl:819K ep:5K epch:3.00 loss:0.088 grdn:0.245 lr:8.0e-06 updt_s:
+0.662 data_s:0.004
+INFO 2025-09-08 15:31:06 celerate.py:281 step:13K smpl:832K ep:5K epch:3.04 loss:0.086 grdn:0.236 lr:7.1e-06 updt_s:
+0.688 data_s:0.231
+INFO 2025-09-08 15:32:52 celerate.py:281 step:13K smpl:845K ep:5K epch:3.09 loss:0.087 grdn:0.231 lr:6.3e-06 updt_s:
+0.521 data_s:0.003
+INFO 2025-09-08 15:35:46 celerate.py:281 step:13K smpl:858K ep:5K epch:3.14 loss:0.088 grdn:0.235 lr:5.6e-06 updt_s:
+0.637 data_s:0.232
+INFO 2025-09-08 15:37:34 celerate.py:281 step:14K smpl:870K ep:5K epch:3.18 loss:0.087 grdn:0.238 lr:4.9e-06 updt_s:
+0.514 data_s:0.025
+INFO 2025-09-08 15:39:18 celerate.py:281 step:14K smpl:883K ep:5K epch:3.23 loss:0.087 grdn:0.226 lr:4.3e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 15:41:02 celerate.py:281 step:14K smpl:896K ep:6K epch:3.28 loss:0.087 grdn:0.230 lr:3.8e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 15:42:45 celerate.py:281 step:14K smpl:909K ep:6K epch:3.32 loss:0.086 grdn:0.229 lr:3.4e-06 updt_s:
+0.507 data_s:0.008
+INFO 2025-09-08 15:44:29 celerate.py:281 step:14K smpl:922K ep:6K epch:3.37 loss:0.087 grdn:0.229 lr:3.0e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 15:46:12 celerate.py:281 step:15K smpl:934K ep:6K epch:3.42 loss:0.087 grdn:0.228 lr:2.8e-06 updt_s:
+0.502 data_s:0.011
+INFO 2025-09-08 15:47:56 celerate.py:281 step:15K smpl:947K ep:6K epch:3.46 loss:0.086 grdn:0.232 lr:2.6e-06 updt_s:
+0.515 data_s:0.004
+INFO 2025-09-08 15:49:39 celerate.py:281 step:15K smpl:960K ep:6K epch:3.51 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:
+0.365 data_s:0.147
+INFO 2025-09-08 15:51:22 celerate.py:281 step:15K smpl:973K ep:6K epch:3.56 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:
+0.333 data_s:0.179
+INFO 2025-09-08 15:53:07 celerate.py:281 step:15K smpl:986K ep:6K epch:3.60 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:
+0.357 data_s:0.164
+INFO 2025-09-08 15:54:50 celerate.py:281 step:16K smpl:998K ep:6K epch:3.65 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:
+0.365 data_s:0.151
+INFO 2025-09-08 15:56:35 celerate.py:281 step:16K smpl:1M ep:6K epch:3.70 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0.
+450 data_s:0.071
+INFO 2025-09-08 15:58:19 celerate.py:281 step:16K smpl:1M ep:6K epch:3.74 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0.
+495 data_s:0.023
+INFO 2025-09-08 16:00:04 celerate.py:281 step:16K smpl:1M ep:6K epch:3.79 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0.
+393 data_s:0.131
+INFO 2025-09-08 16:01:48 celerate.py:281 step:16K smpl:1M ep:6K epch:3.84 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0.
+515 data_s:0.003
+INFO 2025-09-08 16:03:32 celerate.py:281 step:17K smpl:1M ep:7K epch:3.88 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
+518 data_s:0.003
+INFO 2025-09-08 16:05:17 celerate.py:281 step:17K smpl:1M ep:7K epch:3.93 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
+397 data_s:0.125
+INFO 2025-09-08 16:12:41 celerate.py:281 step:17K smpl:1M ep:7K epch:3.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:1.
+496 data_s:0.719
+INFO 2025-09-08 16:14:28 celerate.py:281 step:17K smpl:1M ep:7K epch:4.03 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0.
+413 data_s:0.124
+INFO 2025-09-08 16:16:13 celerate.py:281 step:17K smpl:1M ep:7K epch:4.07 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0.
+469 data_s:0.050
+INFO 2025-09-08 16:17:57 celerate.py:281 step:18K smpl:1M ep:7K epch:4.12 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0.
+375 data_s:0.145
+INFO 2025-09-08 16:19:42 celerate.py:281 step:18K smpl:1M ep:7K epch:4.17 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0.
+333 data_s:0.189
+INFO 2025-09-08 16:21:26 celerate.py:281 step:18K smpl:1M ep:7K epch:4.21 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
+334 data_s:0.184
+INFO 2025-09-08 16:23:09 celerate.py:281 step:18K smpl:1M ep:7K epch:4.26 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0.
+331 data_s:0.185
+INFO 2025-09-08 16:24:53 celerate.py:281 step:18K smpl:1M ep:7K epch:4.31 loss:0.088 grdn:0.236 lr:2.5e-06 updt_s:0.
+333 data_s:0.182
+INFO 2025-09-08 16:26:38 celerate.py:281 step:19K smpl:1M ep:7K epch:4.35 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0.
+337 data_s:0.188
+INFO 2025-09-08 16:28:22 celerate.py:281 step:19K smpl:1M ep:7K epch:4.40 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0.
+420 data_s:0.099
+INFO 2025-09-08 16:30:06 celerate.py:281 step:19K smpl:1M ep:8K epch:4.45 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
+444 data_s:0.075
+INFO 2025-09-08 16:31:49 celerate.py:281 step:19K smpl:1M ep:8K epch:4.49 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0.
+475 data_s:0.036
+INFO 2025-09-08 16:33:33 celerate.py:281 step:19K smpl:1M ep:8K epch:4.54 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0.
+379 data_s:0.139
+INFO 2025-09-08 16:35:17 celerate.py:281 step:20K smpl:1M ep:8K epch:4.59 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0.
+348 data_s:0.171
+INFO 2025-09-08 16:37:01 celerate.py:281 step:20K smpl:1M ep:8K epch:4.63 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
+332 data_s:0.185
+/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
+ UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
+ser.
+  warnings.warn(  # warn only once
+INFO 2025-09-08 16:38:46 celerate.py:281 step:20K smpl:1M ep:8K epch:4.68 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0.
+486 data_s:0.037
+INFO 2025-09-08 16:38:46 celerate.py:295 Checkpoint policy after step 20000
+INFO 2025-09-08 16:40:30 celerate.py:281 step:20K smpl:1M ep:8K epch:4.73 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
+509 data_s:0.003
+INFO 2025-09-08 16:42:16 celerate.py:281 step:20K smpl:1M ep:8K epch:4.77 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0.
+527 data_s:0.003
+INFO 2025-09-08 16:44:01 celerate.py:281 step:21K smpl:1M ep:8K epch:4.82 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0.
+519 data_s:0.003
+INFO 2025-09-08 16:45:45 celerate.py:281 step:21K smpl:1M ep:8K epch:4.87 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0.
+504 data_s:0.013
+INFO 2025-09-08 16:47:29 celerate.py:281 step:21K smpl:1M ep:8K epch:4.91 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0.
+509 data_s:0.011
+INFO 2025-09-08 16:49:19 celerate.py:281 step:21K smpl:1M ep:8K epch:4.96 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0.
+544 data_s:0.003
+INFO 2025-09-08 16:51:04 celerate.py:281 step:21K smpl:1M ep:8K epch:5.01 loss:0.086 grdn:0.225 lr:2.5e-06 updt_s:0.
+488 data_s:0.039
+INFO 2025-09-08 16:52:51 celerate.py:281 step:22K smpl:1M ep:9K epch:5.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
+430 data_s:0.099
+INFO 2025-09-08 16:54:36 celerate.py:281 step:22K smpl:1M ep:9K epch:5.10 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0.
+521 data_s:0.003
+INFO 2025-09-08 16:56:23 celerate.py:281 step:22K smpl:1M ep:9K epch:5.15 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0.
+521 data_s:0.014
+INFO 2025-09-08 16:58:09 celerate.py:281 step:22K smpl:1M ep:9K epch:5.20 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0.
+525 data_s:0.003
+INFO 2025-09-08 17:00:04 celerate.py:281 step:22K smpl:1M ep:9K epch:5.24 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0.
+568 data_s:0.003
+INFO 2025-09-08 17:02:00 celerate.py:281 step:23K smpl:1M ep:9K epch:5.29 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0.
+575 data_s:0.003
+INFO 2025-09-08 17:03:49 celerate.py:281 step:23K smpl:1M ep:9K epch:5.34 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0.
+513 data_s:0.030
+INFO 2025-09-08 17:05:39 celerate.py:281 step:23K smpl:1M ep:9K epch:5.38 loss:0.085 grdn:0.227 lr:2.5e-06 updt_s:0.
+523 data_s:0.027
+INFO 2025-09-08 17:07:26 celerate.py:281 step:23K smpl:1M ep:9K epch:5.43 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0.
+529 data_s:0.003
+INFO 2025-09-08 17:09:12 celerate.py:281 step:23K smpl:1M ep:9K epch:5.48 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
+526 data_s:0.003
+INFO 2025-09-08 17:10:55 celerate.py:281 step:24K smpl:2M ep:9K epch:5.52 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:0.
+443 data_s:0.072
+INFO 2025-09-08 17:12:40 celerate.py:281 step:24K smpl:2M ep:9K epch:5.57 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0.
+518 data_s:0.004
+INFO 2025-09-08 17:14:25 celerate.py:281 step:24K smpl:2M ep:10K epch:5.62 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0
+.521 data_s:0.003
+INFO 2025-09-08 17:16:11 celerate.py:281 step:24K smpl:2M ep:10K epch:5.66 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0
+.523 data_s:0.003
+INFO 2025-09-08 17:17:55 celerate.py:281 step:24K smpl:2M ep:10K epch:5.71 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0
+.515 data_s:0.005
+INFO 2025-09-08 17:19:39 celerate.py:281 step:25K smpl:2M ep:10K epch:5.76 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0
+.415 data_s:0.106
+INFO 2025-09-08 17:21:24 celerate.py:281 step:25K smpl:2M ep:10K epch:5.80 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.507 data_s:0.016
+INFO 2025-09-08 17:23:08 celerate.py:281 step:25K smpl:2M ep:10K epch:5.85 loss:0.085 grdn:0.229 lr:2.5e-06 updt_s:0
+.514 data_s:0.003
+INFO 2025-09-08 17:24:54 celerate.py:281 step:25K smpl:2M ep:10K epch:5.90 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0
+.518 data_s:0.008
+INFO 2025-09-08 17:26:41 celerate.py:281 step:25K smpl:2M ep:10K epch:5.94 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
+.529 data_s:0.003
+INFO 2025-09-08 17:28:24 celerate.py:281 step:26K smpl:2M ep:10K epch:5.99 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0
+.513 data_s:0.003
+INFO 2025-09-08 17:30:11 celerate.py:281 step:26K smpl:2M ep:10K epch:6.04 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0
+.370 data_s:0.164
+INFO 2025-09-08 17:31:55 celerate.py:281 step:26K smpl:2M ep:10K epch:6.08 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
+.385 data_s:0.132
+INFO 2025-09-08 17:33:39 celerate.py:281 step:26K smpl:2M ep:10K epch:6.13 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
+.450 data_s:0.069
+INFO 2025-09-08 17:35:24 celerate.py:281 step:26K smpl:2M ep:10K epch:6.18 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0
+.468 data_s:0.052
+INFO 2025-09-08 17:37:07 celerate.py:281 step:27K smpl:2M ep:11K epch:6.23 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0
+.514 data_s:0.004
+INFO 2025-09-08 17:38:52 celerate.py:281 step:27K smpl:2M ep:11K epch:6.27 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
+.519 data_s:0.003
+INFO 2025-09-08 17:40:40 celerate.py:281 step:27K smpl:2M ep:11K epch:6.32 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.534 data_s:0.003
+INFO 2025-09-08 17:42:57 celerate.py:281 step:27K smpl:2M ep:11K epch:6.37 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
+.678 data_s:0.007
+INFO 2025-09-08 17:46:13 celerate.py:281 step:27K smpl:2M ep:11K epch:6.41 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.968 data_s:0.009
+INFO 2025-09-08 17:49:20 celerate.py:281 step:28K smpl:2M ep:11K epch:6.46 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
+.895 data_s:0.037
+INFO 2025-09-08 17:51:22 celerate.py:281 step:28K smpl:2M ep:11K epch:6.51 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
+.604 data_s:0.003
+INFO 2025-09-08 17:53:07 celerate.py:281 step:28K smpl:2M ep:11K epch:6.55 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
+.521 data_s:0.003
+INFO 2025-09-08 17:54:51 celerate.py:281 step:28K smpl:2M ep:11K epch:6.60 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 17:56:36 celerate.py:281 step:28K smpl:2M ep:11K epch:6.65 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
+.519 data_s:0.003
+INFO 2025-09-08 17:58:21 celerate.py:281 step:29K smpl:2M ep:11K epch:6.69 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:0
+.521 data_s:0.003
+INFO 2025-09-08 18:00:06 celerate.py:281 step:29K smpl:2M ep:11K epch:6.74 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
+.513 data_s:0.011
+INFO 2025-09-08 18:01:50 celerate.py:281 step:29K smpl:2M ep:11K epch:6.79 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
+.476 data_s:0.041
+INFO 2025-09-08 18:03:34 celerate.py:281 step:29K smpl:2M ep:12K epch:6.83 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
+.506 data_s:0.012
+INFO 2025-09-08 18:05:21 celerate.py:281 step:29K smpl:2M ep:12K epch:6.88 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0
+.455 data_s:0.075
+INFO 2025-09-08 18:07:04 celerate.py:281 step:30K smpl:2M ep:12K epch:6.93 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.514 data_s:0.003
+INFO 2025-09-08 18:08:47 celerate.py:281 step:30K smpl:2M ep:12K epch:6.97 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
+.509 data_s:0.003
+INFO 2025-09-08 18:10:33 celerate.py:281 step:30K smpl:2M ep:12K epch:7.02 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
+.422 data_s:0.105
+INFO 2025-09-08 18:12:19 celerate.py:281 step:30K smpl:2M ep:12K epch:7.07 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
+.347 data_s:0.182
+INFO 2025-09-08 18:14:05 celerate.py:281 step:30K smpl:2M ep:12K epch:7.11 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
+.473 data_s:0.053
+INFO 2025-09-08 18:15:52 celerate.py:281 step:31K smpl:2M ep:12K epch:7.16 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0
+.531 data_s:0.005
+INFO 2025-09-08 18:17:37 celerate.py:281 step:31K smpl:2M ep:12K epch:7.21 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
+.520 data_s:0.003
+INFO 2025-09-08 18:19:22 celerate.py:281 step:31K smpl:2M ep:12K epch:7.26 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
+.500 data_s:0.020
+INFO 2025-09-08 18:21:06 celerate.py:281 step:31K smpl:2M ep:12K epch:7.30 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0
+.511 data_s:0.009
+INFO 2025-09-08 18:22:50 celerate.py:281 step:31K smpl:2M ep:12K epch:7.35 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0
+.518 data_s:0.003
+INFO 2025-09-08 18:24:33 celerate.py:281 step:32K smpl:2M ep:13K epch:7.40 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
+.507 data_s:0.007
+INFO 2025-09-08 18:26:16 celerate.py:281 step:32K smpl:2M ep:13K epch:7.44 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0
+.463 data_s:0.047
+INFO 2025-09-08 18:27:59 celerate.py:281 step:32K smpl:2M ep:13K epch:7.49 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:0
+.509 data_s:0.007
+INFO 2025-09-08 18:29:43 celerate.py:281 step:32K smpl:2M ep:13K epch:7.54 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
+.514 data_s:0.003
+INFO 2025-09-08 18:31:26 celerate.py:281 step:32K smpl:2M ep:13K epch:7.58 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.511 data_s:0.003
+INFO 2025-09-08 18:33:11 celerate.py:281 step:33K smpl:2M ep:13K epch:7.63 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.518 data_s:0.003
+INFO 2025-09-08 18:34:57 celerate.py:281 step:33K smpl:2M ep:13K epch:7.68 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
+.512 data_s:0.016
+INFO 2025-09-08 18:36:41 celerate.py:281 step:33K smpl:2M ep:13K epch:7.72 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0
+.517 data_s:0.003
+INFO 2025-09-08 18:38:25 celerate.py:281 step:33K smpl:2M ep:13K epch:7.77 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 18:40:07 celerate.py:281 step:33K smpl:2M ep:13K epch:7.82 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
+.506 data_s:0.003
+INFO 2025-09-08 18:41:51 celerate.py:281 step:34K smpl:2M ep:13K epch:7.86 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
+.509 data_s:0.006
+INFO 2025-09-08 18:43:36 celerate.py:281 step:34K smpl:2M ep:13K epch:7.91 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0
+.521 data_s:0.003
+INFO 2025-09-08 18:45:19 celerate.py:281 step:34K smpl:2M ep:13K epch:7.96 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.511 data_s:0.003
+INFO 2025-09-08 18:47:05 celerate.py:281 step:34K smpl:2M ep:14K epch:8.00 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
+.495 data_s:0.035
+INFO 2025-09-08 18:48:51 celerate.py:281 step:34K smpl:2M ep:14K epch:8.05 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0
+.413 data_s:0.112
+INFO 2025-09-08 18:50:34 celerate.py:281 step:35K smpl:2M ep:14K epch:8.10 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 18:52:19 celerate.py:281 step:35K smpl:2M ep:14K epch:8.14 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
+.520 data_s:0.003
+INFO 2025-09-08 18:54:03 celerate.py:281 step:35K smpl:2M ep:14K epch:8.19 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 18:55:48 celerate.py:281 step:35K smpl:2M ep:14K epch:8.24 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
+.420 data_s:0.101
+INFO 2025-09-08 18:57:33 celerate.py:281 step:35K smpl:2M ep:14K epch:8.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
+.506 data_s:0.022
+INFO 2025-09-08 18:59:19 celerate.py:281 step:36K smpl:2M ep:14K epch:8.33 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
+.525 data_s:0.003
+INFO 2025-09-08 19:01:03 celerate.py:281 step:36K smpl:2M ep:14K epch:8.38 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 19:02:48 celerate.py:281 step:36K smpl:2M ep:14K epch:8.43 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 19:04:32 celerate.py:281 step:36K smpl:2M ep:14K epch:8.47 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
+.505 data_s:0.013
+INFO 2025-09-08 19:06:15 celerate.py:281 step:36K smpl:2M ep:14K epch:8.52 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
+.384 data_s:0.130
+INFO 2025-09-08 19:07:58 celerate.py:281 step:37K smpl:2M ep:15K epch:8.57 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
+.430 data_s:0.084
+INFO 2025-09-08 19:09:41 celerate.py:281 step:37K smpl:2M ep:15K epch:8.61 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
+.351 data_s:0.162
+INFO 2025-09-08 19:11:25 celerate.py:281 step:37K smpl:2M ep:15K epch:8.66 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0
+.337 data_s:0.181
+INFO 2025-09-08 19:13:08 celerate.py:281 step:37K smpl:2M ep:15K epch:8.71 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
+.336 data_s:0.177
+INFO 2025-09-08 19:14:52 celerate.py:281 step:37K smpl:2M ep:15K epch:8.75 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
+.344 data_s:0.174
+INFO 2025-09-08 19:16:35 celerate.py:281 step:38K smpl:2M ep:15K epch:8.80 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
+.332 data_s:0.182
+INFO 2025-09-08 19:18:18 celerate.py:281 step:38K smpl:2M ep:15K epch:8.85 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.332 data_s:0.182
+INFO 2025-09-08 19:20:01 celerate.py:281 step:38K smpl:2M ep:15K epch:8.89 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0
+.337 data_s:0.177
+INFO 2025-09-08 19:21:45 celerate.py:281 step:38K smpl:2M ep:15K epch:8.94 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.495 data_s:0.022
+INFO 2025-09-08 19:23:29 celerate.py:281 step:38K smpl:2M ep:15K epch:8.99 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
+.513 data_s:0.003
+INFO 2025-09-08 19:25:15 celerate.py:281 step:39K smpl:2M ep:15K epch:9.03 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.436 data_s:0.097
+INFO 2025-09-08 19:26:59 celerate.py:281 step:39K smpl:2M ep:15K epch:9.08 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
+.378 data_s:0.138
+INFO 2025-09-08 19:28:43 celerate.py:281 step:39K smpl:2M ep:15K epch:9.13 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:0
+.497 data_s:0.023
+INFO 2025-09-08 19:30:27 celerate.py:281 step:39K smpl:3M ep:16K epch:9.17 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 19:32:14 celerate.py:281 step:39K smpl:3M ep:16K epch:9.22 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.526 data_s:0.003
+INFO 2025-09-08 19:33:57 celerate.py:281 step:40K smpl:3M ep:16K epch:9.27 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
+.506 data_s:0.011
+INFO 2025-09-08 19:35:42 celerate.py:281 step:40K smpl:3M ep:16K epch:9.31 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0
+.455 data_s:0.066
+INFO 2025-09-08 19:37:26 celerate.py:281 step:40K smpl:3M ep:16K epch:9.36 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0
+.493 data_s:0.026
+INFO 2025-09-08 19:37:26 celerate.py:295 Checkpoint policy after step 40000
+INFO 2025-09-08 19:39:10 celerate.py:281 step:40K smpl:3M ep:16K epch:9.41 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
+.397 data_s:0.114
+INFO 2025-09-08 19:40:53 celerate.py:281 step:40K smpl:3M ep:16K epch:9.45 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0
+.344 data_s:0.168
+INFO 2025-09-08 19:42:37 celerate.py:281 step:41K smpl:3M ep:16K epch:9.50 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.480 data_s:0.036
+INFO 2025-09-08 19:44:21 celerate.py:281 step:41K smpl:3M ep:16K epch:9.55 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
+.517 data_s:0.003
+INFO 2025-09-08 19:46:05 celerate.py:281 step:41K smpl:3M ep:16K epch:9.60 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.517 data_s:0.003
+INFO 2025-09-08 19:47:49 celerate.py:281 step:41K smpl:3M ep:16K epch:9.64 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
+.513 data_s:0.003
+INFO 2025-09-08 19:49:33 celerate.py:281 step:41K smpl:3M ep:16K epch:9.69 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 19:51:17 celerate.py:281 step:42K smpl:3M ep:16K epch:9.74 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0
+.515 data_s:0.003
+INFO 2025-09-08 19:53:00 celerate.py:281 step:42K smpl:3M ep:17K epch:9.78 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
+.513 data_s:0.003
+INFO 2025-09-08 19:54:44 celerate.py:281 step:42K smpl:3M ep:17K epch:9.83 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.516 data_s:0.003
+INFO 2025-09-08 19:56:28 celerate.py:281 step:42K smpl:3M ep:17K epch:9.88 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
+.512 data_s:0.003
+INFO 2025-09-08 19:58:11 celerate.py:281 step:42K smpl:3M ep:17K epch:9.92 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
+.514 data_s:0.003
+INFO 2025-09-08 19:59:55 celerate.py:281 step:43K smpl:3M ep:17K epch:9.97 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0
+.514 data_s:0.003
+INFO 2025-09-08 20:01:42 celerate.py:281 step:43K smpl:3M ep:17K epch:10.02 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:
+0.476 data_s:0.057
+INFO 2025-09-08 20:03:25 celerate.py:281 step:43K smpl:3M ep:17K epch:10.06 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
+0.471 data_s:0.043
+INFO 2025-09-08 20:05:09 celerate.py:281 step:43K smpl:3M ep:17K epch:10.11 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.515 data_s:0.004
+INFO 2025-09-08 20:06:53 celerate.py:281 step:43K smpl:3M ep:17K epch:10.16 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
+0.505 data_s:0.013
+INFO 2025-09-08 20:08:36 celerate.py:281 step:44K smpl:3M ep:17K epch:10.20 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-08 20:10:20 celerate.py:281 step:44K smpl:3M ep:17K epch:10.25 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 20:12:04 celerate.py:281 step:44K smpl:3M ep:17K epch:10.30 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-08 20:13:47 celerate.py:281 step:44K smpl:3M ep:18K epch:10.34 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
+0.503 data_s:0.011
+INFO 2025-09-08 20:15:31 celerate.py:281 step:44K smpl:3M ep:18K epch:10.39 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.416 data_s:0.102
+INFO 2025-09-08 20:17:15 celerate.py:281 step:45K smpl:3M ep:18K epch:10.44 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
+0.502 data_s:0.017
+INFO 2025-09-08 20:18:58 celerate.py:281 step:45K smpl:3M ep:18K epch:10.48 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-08 20:20:41 celerate.py:281 step:45K smpl:3M ep:18K epch:10.53 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.496 data_s:0.017
+INFO 2025-09-08 20:22:24 celerate.py:281 step:45K smpl:3M ep:18K epch:10.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.493 data_s:0.022
+INFO 2025-09-08 20:24:08 celerate.py:281 step:45K smpl:3M ep:18K epch:10.63 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.485 data_s:0.031
+INFO 2025-09-08 20:25:52 celerate.py:281 step:46K smpl:3M ep:18K epch:10.67 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:
+0.518 data_s:0.003
+INFO 2025-09-08 20:27:36 celerate.py:281 step:46K smpl:3M ep:18K epch:10.72 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 20:29:19 celerate.py:281 step:46K smpl:3M ep:18K epch:10.77 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 20:31:04 celerate.py:281 step:46K smpl:3M ep:18K epch:10.81 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 20:32:47 celerate.py:281 step:46K smpl:3M ep:18K epch:10.86 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 20:34:32 celerate.py:281 step:47K smpl:3M ep:18K epch:10.91 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.518 data_s:0.003
+INFO 2025-09-08 20:36:15 celerate.py:281 step:47K smpl:3M ep:19K epch:10.95 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 20:38:01 celerate.py:281 step:47K smpl:3M ep:19K epch:11.00 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.416 data_s:0.114
+INFO 2025-09-08 20:39:45 celerate.py:281 step:47K smpl:3M ep:19K epch:11.05 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.429 data_s:0.086
+INFO 2025-09-08 20:41:28 celerate.py:281 step:47K smpl:3M ep:19K epch:11.09 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:
+0.409 data_s:0.106
+INFO 2025-09-08 20:43:12 celerate.py:281 step:48K smpl:3M ep:19K epch:11.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.470 data_s:0.050
+INFO 2025-09-08 20:44:56 celerate.py:281 step:48K smpl:3M ep:19K epch:11.19 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 20:46:41 celerate.py:281 step:48K smpl:3M ep:19K epch:11.23 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 20:48:25 celerate.py:281 step:48K smpl:3M ep:19K epch:11.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.518 data_s:0.003
+INFO 2025-09-08 20:50:08 celerate.py:281 step:48K smpl:3M ep:19K epch:11.33 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.505 data_s:0.009
+INFO 2025-09-08 20:51:52 celerate.py:281 step:49K smpl:3M ep:19K epch:11.37 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
+0.449 data_s:0.067
+INFO 2025-09-08 20:53:35 celerate.py:281 step:49K smpl:3M ep:19K epch:11.42 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.420 data_s:0.094
+INFO 2025-09-08 20:55:19 celerate.py:281 step:49K smpl:3M ep:19K epch:11.47 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 20:57:01 celerate.py:281 step:49K smpl:3M ep:19K epch:11.51 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
+0.505 data_s:0.003
+INFO 2025-09-08 20:58:44 celerate.py:281 step:49K smpl:3M ep:20K epch:11.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-08 21:00:28 celerate.py:281 step:50K smpl:3M ep:20K epch:11.61 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 21:02:11 celerate.py:281 step:50K smpl:3M ep:20K epch:11.65 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.402 data_s:0.110
+INFO 2025-09-08 21:03:54 celerate.py:281 step:50K smpl:3M ep:20K epch:11.70 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.332 data_s:0.184
+INFO 2025-09-08 21:05:37 celerate.py:281 step:50K smpl:3M ep:20K epch:11.75 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:
+0.332 data_s:0.182
+INFO 2025-09-08 21:07:21 celerate.py:281 step:50K smpl:3M ep:20K epch:11.80 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
+0.466 data_s:0.049
+INFO 2025-09-08 21:09:05 celerate.py:281 step:51K smpl:3M ep:20K epch:11.84 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 21:10:49 celerate.py:281 step:51K smpl:3M ep:20K epch:11.89 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
+0.512 data_s:0.004
+INFO 2025-09-08 21:12:32 celerate.py:281 step:51K smpl:3M ep:20K epch:11.94 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s:
+0.484 data_s:0.032
+INFO 2025-09-08 21:14:17 celerate.py:281 step:51K smpl:3M ep:20K epch:11.98 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
+0.517 data_s:0.004
+INFO 2025-09-08 21:16:03 celerate.py:281 step:51K smpl:3M ep:20K epch:12.03 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.424 data_s:0.105
+INFO 2025-09-08 21:17:46 celerate.py:281 step:52K smpl:3M ep:20K epch:12.08 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.442 data_s:0.073
+INFO 2025-09-08 21:19:30 celerate.py:281 step:52K smpl:3M ep:21K epch:12.12 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:
+0.511 data_s:0.007
+INFO 2025-09-08 21:21:15 celerate.py:281 step:52K smpl:3M ep:21K epch:12.17 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.520 data_s:0.003
+INFO 2025-09-08 21:22:59 celerate.py:281 step:52K smpl:3M ep:21K epch:12.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 21:24:43 celerate.py:281 step:52K smpl:3M ep:21K epch:12.26 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.518 data_s:0.003
+INFO 2025-09-08 21:26:27 celerate.py:281 step:53K smpl:3M ep:21K epch:12.31 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 21:28:11 celerate.py:281 step:53K smpl:3M ep:21K epch:12.36 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 21:29:55 celerate.py:281 step:53K smpl:3M ep:21K epch:12.40 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 21:31:39 celerate.py:281 step:53K smpl:3M ep:21K epch:12.45 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 21:33:23 celerate.py:281 step:53K smpl:3M ep:21K epch:12.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 21:35:08 celerate.py:281 step:54K smpl:3M ep:21K epch:12.54 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.519 data_s:0.003
+INFO 2025-09-08 21:36:51 celerate.py:281 step:54K smpl:3M ep:21K epch:12.59 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-08 21:38:36 celerate.py:281 step:54K smpl:3M ep:21K epch:12.64 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 21:40:18 celerate.py:281 step:54K smpl:3M ep:21K epch:12.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.473 data_s:0.038
+INFO 2025-09-08 21:42:02 celerate.py:281 step:54K smpl:3M ep:22K epch:12.73 loss:0.085 grdn:0.232 lr:2.5e-06 updt_s:
+0.408 data_s:0.109
+INFO 2025-09-08 21:43:45 celerate.py:281 step:55K smpl:3M ep:22K epch:12.78 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.377 data_s:0.136
+INFO 2025-09-08 21:45:29 celerate.py:281 step:55K smpl:4M ep:22K epch:12.83 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:
+0.497 data_s:0.022
+INFO 2025-09-08 21:47:12 celerate.py:281 step:55K smpl:4M ep:22K epch:12.87 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-08 21:48:56 celerate.py:281 step:55K smpl:4M ep:22K epch:12.92 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.429 data_s:0.086
+INFO 2025-09-08 21:50:39 celerate.py:281 step:55K smpl:4M ep:22K epch:12.97 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
+0.454 data_s:0.059
+INFO 2025-09-08 21:52:25 celerate.py:281 step:56K smpl:4M ep:22K epch:13.01 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.459 data_s:0.072
+INFO 2025-09-08 21:54:08 celerate.py:281 step:56K smpl:4M ep:22K epch:13.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
+0.382 data_s:0.132
+INFO 2025-09-08 21:55:51 celerate.py:281 step:56K smpl:4M ep:22K epch:13.11 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.500 data_s:0.016
+INFO 2025-09-08 21:57:36 celerate.py:281 step:56K smpl:4M ep:22K epch:13.15 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 21:59:20 celerate.py:281 step:56K smpl:4M ep:22K epch:13.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s:
+0.518 data_s:0.003
+INFO 2025-09-08 22:01:03 celerate.py:281 step:57K smpl:4M ep:22K epch:13.25 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-08 22:02:48 celerate.py:281 step:57K smpl:4M ep:23K epch:13.29 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 22:04:32 celerate.py:281 step:57K smpl:4M ep:23K epch:13.34 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 22:06:14 celerate.py:281 step:57K smpl:4M ep:23K epch:13.39 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s:
+0.505 data_s:0.005
+bINFO 2025-09-08 22:07:59 celerate.py:281 step:57K smpl:4M ep:23K epch:13.43 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s
+:0.496 data_s:0.026
+INFO 2025-09-08 22:09:43 celerate.py:281 step:58K smpl:4M ep:23K epch:13.48 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
+0.438 data_s:0.080
+INFO 2025-09-08 22:11:27 celerate.py:281 step:58K smpl:4M ep:23K epch:13.53 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
+0.444 data_s:0.073
+INFO 2025-09-08 22:13:11 celerate.py:281 step:58K smpl:4M ep:23K epch:13.57 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 22:14:55 celerate.py:281 step:58K smpl:4M ep:23K epch:13.62 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.518 data_s:0.003
+INFO 2025-09-08 22:16:39 celerate.py:281 step:58K smpl:4M ep:23K epch:13.67 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 22:18:22 celerate.py:281 step:59K smpl:4M ep:23K epch:13.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 22:20:05 celerate.py:281 step:59K smpl:4M ep:23K epch:13.76 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
+0.508 data_s:0.003
+INFO 2025-09-08 22:21:48 celerate.py:281 step:59K smpl:4M ep:23K epch:13.81 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:
+0.505 data_s:0.008
+INFO 2025-09-08 22:23:31 celerate.py:281 step:59K smpl:4M ep:23K epch:13.85 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.505 data_s:0.008
+INFO 2025-09-08 22:25:15 celerate.py:281 step:59K smpl:4M ep:24K epch:13.90 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 22:26:58 celerate.py:281 step:60K smpl:4M ep:24K epch:13.95 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
+0.493 data_s:0.022
+INFO 2025-09-08 22:28:42 celerate.py:281 step:60K smpl:4M ep:24K epch:14.00 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 22:30:29 celerate.py:281 step:60K smpl:4M ep:24K epch:14.04 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.435 data_s:0.101
+INFO 2025-09-08 22:30:29 celerate.py:295 Checkpoint policy after step 60000
+INFO 2025-09-08 22:32:14 celerate.py:281 step:60K smpl:4M ep:24K epch:14.09 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.508 data_s:0.003
+INFO 2025-09-08 22:33:58 celerate.py:281 step:60K smpl:4M ep:24K epch:14.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 22:35:42 celerate.py:281 step:61K smpl:4M ep:24K epch:14.18 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 22:37:25 celerate.py:281 step:61K smpl:4M ep:24K epch:14.23 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 22:39:09 celerate.py:281 step:61K smpl:4M ep:24K epch:14.28 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 22:40:52 celerate.py:281 step:61K smpl:4M ep:24K epch:14.32 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.509 data_s:0.003
+INFO 2025-09-08 22:42:35 celerate.py:281 step:61K smpl:4M ep:24K epch:14.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 22:44:18 celerate.py:281 step:62K smpl:4M ep:24K epch:14.42 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-08 22:46:02 celerate.py:281 step:62K smpl:4M ep:24K epch:14.46 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 22:47:47 celerate.py:281 step:62K smpl:4M ep:25K epch:14.51 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 22:49:30 celerate.py:281 step:62K smpl:4M ep:25K epch:14.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-08 22:51:14 celerate.py:281 step:62K smpl:4M ep:25K epch:14.60 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 22:52:58 celerate.py:281 step:63K smpl:4M ep:25K epch:14.65 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
+0.484 data_s:0.033
+INFO 2025-09-08 22:54:41 celerate.py:281 step:63K smpl:4M ep:25K epch:14.70 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.501 data_s:0.016
+INFO 2025-09-08 22:56:25 celerate.py:281 step:63K smpl:4M ep:25K epch:14.74 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.436 data_s:0.081
+INFO 2025-09-08 22:58:08 celerate.py:281 step:63K smpl:4M ep:25K epch:14.79 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
+0.436 data_s:0.080
+INFO 2025-09-08 22:59:51 celerate.py:281 step:63K smpl:4M ep:25K epch:14.84 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.344 data_s:0.168
+INFO 2025-09-08 23:01:34 celerate.py:281 step:64K smpl:4M ep:25K epch:14.88 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.479 data_s:0.035
+INFO 2025-09-08 23:03:18 celerate.py:281 step:64K smpl:4M ep:25K epch:14.93 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-08 23:05:02 celerate.py:281 step:64K smpl:4M ep:25K epch:14.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 23:06:47 celerate.py:281 step:64K smpl:4M ep:25K epch:15.02 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.405 data_s:0.119
+INFO 2025-09-08 23:08:30 celerate.py:281 step:64K smpl:4M ep:26K epch:15.07 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s:
+0.393 data_s:0.121
+INFO 2025-09-08 23:10:13 celerate.py:281 step:65K smpl:4M ep:26K epch:15.12 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
+0.369 data_s:0.142
+INFO 2025-09-08 23:11:56 celerate.py:281 step:65K smpl:4M ep:26K epch:15.17 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s:
+0.360 data_s:0.156
+INFO 2025-09-08 23:13:40 celerate.py:281 step:65K smpl:4M ep:26K epch:15.21 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.333 data_s:0.182
+INFO 2025-09-08 23:15:23 celerate.py:281 step:65K smpl:4M ep:26K epch:15.26 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
+0.376 data_s:0.136
+INFO 2025-09-08 23:17:05 celerate.py:281 step:65K smpl:4M ep:26K epch:15.31 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
+0.439 data_s:0.069
+INFO 2025-09-08 23:18:49 celerate.py:281 step:66K smpl:4M ep:26K epch:15.35 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.512 data_s:0.006
+INFO 2025-09-08 23:20:32 celerate.py:281 step:66K smpl:4M ep:26K epch:15.40 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-08 23:22:15 celerate.py:281 step:66K smpl:4M ep:26K epch:15.45 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.508 data_s:0.004
+INFO 2025-09-08 23:23:59 celerate.py:281 step:66K smpl:4M ep:26K epch:15.49 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.474 data_s:0.046
+INFO 2025-09-08 23:25:42 celerate.py:281 step:66K smpl:4M ep:26K epch:15.54 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-08 23:27:25 celerate.py:281 step:67K smpl:4M ep:26K epch:15.59 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.487 data_s:0.025
+INFO 2025-09-08 23:29:08 celerate.py:281 step:67K smpl:4M ep:26K epch:15.63 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.510 data_s:0.005
+INFO 2025-09-08 23:30:50 celerate.py:281 step:67K smpl:4M ep:27K epch:15.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.506 data_s:0.003
+INFO 2025-09-08 23:32:33 celerate.py:281 step:67K smpl:4M ep:27K epch:15.73 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s:
+0.509 data_s:0.003
+INFO 2025-09-08 23:34:16 celerate.py:281 step:67K smpl:4M ep:27K epch:15.77 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s:
+0.503 data_s:0.014
+INFO 2025-09-08 23:35:59 celerate.py:281 step:68K smpl:4M ep:27K epch:15.82 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-08 23:37:42 celerate.py:281 step:68K smpl:4M ep:27K epch:15.87 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.509 data_s:0.003
+INFO 2025-09-08 23:39:26 celerate.py:281 step:68K smpl:4M ep:27K epch:15.91 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-08 23:41:10 celerate.py:281 step:68K smpl:4M ep:27K epch:15.96 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-08 23:42:56 celerate.py:281 step:68K smpl:4M ep:27K epch:16.01 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.469 data_s:0.061
+INFO 2025-09-08 23:44:40 celerate.py:281 step:69K smpl:4M ep:27K epch:16.05 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.339 data_s:0.179
+INFO 2025-09-08 23:46:22 celerate.py:281 step:69K smpl:4M ep:27K epch:16.10 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.386 data_s:0.124
+INFO 2025-09-08 23:48:06 celerate.py:281 step:69K smpl:4M ep:27K epch:16.15 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-08 23:49:50 celerate.py:281 step:69K smpl:4M ep:27K epch:16.20 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 23:51:33 celerate.py:281 step:69K smpl:4M ep:27K epch:16.24 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s:
+0.501 data_s:0.014
+INFO 2025-09-08 23:53:17 celerate.py:281 step:70K smpl:4M ep:28K epch:16.29 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-08 23:54:59 celerate.py:281 step:70K smpl:4M ep:28K epch:16.34 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
+0.507 data_s:0.005
+INFO 2025-09-08 23:56:43 celerate.py:281 step:70K smpl:4M ep:28K epch:16.38 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-08 23:58:27 celerate.py:281 step:70K smpl:4M ep:28K epch:16.43 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-09 00:00:12 celerate.py:281 step:70K smpl:5M ep:28K epch:16.48 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.518 data_s:0.003
+INFO 2025-09-09 00:01:55 celerate.py:281 step:71K smpl:5M ep:28K epch:16.52 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-09 00:03:37 celerate.py:281 step:71K smpl:5M ep:28K epch:16.57 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
+0.508 data_s:0.003
+INFO 2025-09-09 00:05:20 celerate.py:281 step:71K smpl:5M ep:28K epch:16.62 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.509 data_s:0.003
+INFO 2025-09-09 00:07:04 celerate.py:281 step:71K smpl:5M ep:28K epch:16.66 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.519 data_s:0.003
+INFO 2025-09-09 00:08:47 celerate.py:281 step:71K smpl:5M ep:28K epch:16.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
+0.509 data_s:0.003
+INFO 2025-09-09 00:10:30 celerate.py:281 step:72K smpl:5M ep:28K epch:16.76 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 00:12:13 celerate.py:281 step:72K smpl:5M ep:28K epch:16.80 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-09 00:13:58 celerate.py:281 step:72K smpl:5M ep:29K epch:16.85 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.522 data_s:0.003
+INFO 2025-09-09 00:15:40 celerate.py:281 step:72K smpl:5M ep:29K epch:16.90 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.506 data_s:0.003
+INFO 2025-09-09 00:17:23 celerate.py:281 step:72K smpl:5M ep:29K epch:16.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.510 data_s:0.004
+INFO 2025-09-09 00:19:05 celerate.py:281 step:73K smpl:5M ep:29K epch:16.99 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.505 data_s:0.003
+INFO 2025-09-09 00:20:51 celerate.py:281 step:73K smpl:5M ep:29K epch:17.04 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.437 data_s:0.092
+INFO 2025-09-09 00:22:34 celerate.py:281 step:73K smpl:5M ep:29K epch:17.08 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
+0.489 data_s:0.025
+INFO 2025-09-09 00:24:17 celerate.py:281 step:73K smpl:5M ep:29K epch:17.13 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 00:26:01 celerate.py:281 step:73K smpl:5M ep:29K epch:17.18 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 00:27:45 celerate.py:281 step:74K smpl:5M ep:29K epch:17.22 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-09 00:29:28 celerate.py:281 step:74K smpl:5M ep:29K epch:17.27 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.509 data_s:0.003
+INFO 2025-09-09 00:31:11 celerate.py:281 step:74K smpl:5M ep:29K epch:17.32 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-09 00:32:54 celerate.py:281 step:74K smpl:5M ep:29K epch:17.37 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-09 00:34:38 celerate.py:281 step:74K smpl:5M ep:29K epch:17.41 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-09 00:36:21 celerate.py:281 step:75K smpl:5M ep:30K epch:17.46 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 00:38:05 celerate.py:281 step:75K smpl:5M ep:30K epch:17.51 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-09 00:39:49 celerate.py:281 step:75K smpl:5M ep:30K epch:17.55 loss:0.086 grdn:0.251 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-09 00:41:32 celerate.py:281 step:75K smpl:5M ep:30K epch:17.60 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.506 data_s:0.005
+INFO 2025-09-09 00:43:14 celerate.py:281 step:75K smpl:5M ep:30K epch:17.65 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.390 data_s:0.122
+INFO 2025-09-09 00:44:58 celerate.py:281 step:76K smpl:5M ep:30K epch:17.69 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
+0.410 data_s:0.107
+INFO 2025-09-09 00:46:41 celerate.py:281 step:76K smpl:5M ep:30K epch:17.74 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s:
+0.427 data_s:0.085
+INFO 2025-09-09 00:48:24 celerate.py:281 step:76K smpl:5M ep:30K epch:17.79 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s:
+0.492 data_s:0.025
+INFO 2025-09-09 00:50:08 celerate.py:281 step:76K smpl:5M ep:30K epch:17.83 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-09 00:51:51 celerate.py:281 step:76K smpl:5M ep:30K epch:17.88 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-09 00:53:33 celerate.py:281 step:77K smpl:5M ep:30K epch:17.93 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-09 00:55:18 celerate.py:281 step:77K smpl:5M ep:30K epch:17.97 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.519 data_s:0.003
+INFO 2025-09-09 00:57:05 celerate.py:281 step:77K smpl:5M ep:31K epch:18.02 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.486 data_s:0.046
+INFO 2025-09-09 00:58:47 celerate.py:281 step:77K smpl:5M ep:31K epch:18.07 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.509 data_s:0.003
+INFO 2025-09-09 01:00:31 celerate.py:281 step:77K smpl:5M ep:31K epch:18.11 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-09 01:02:15 celerate.py:281 step:78K smpl:5M ep:31K epch:18.16 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-09 01:03:56 celerate.py:281 step:78K smpl:5M ep:31K epch:18.21 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.498 data_s:0.008
+INFO 2025-09-09 01:05:40 celerate.py:281 step:78K smpl:5M ep:31K epch:18.25 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.508 data_s:0.010
+INFO 2025-09-09 01:07:24 celerate.py:281 step:78K smpl:5M ep:31K epch:18.30 loss:0.085 grdn:0.240 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-09 01:09:07 celerate.py:281 step:78K smpl:5M ep:31K epch:18.35 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 01:10:51 celerate.py:281 step:79K smpl:5M ep:31K epch:18.40 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-09 01:12:35 celerate.py:281 step:79K smpl:5M ep:31K epch:18.44 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-09 01:14:17 celerate.py:281 step:79K smpl:5M ep:31K epch:18.49 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s:
+0.508 data_s:0.003
+INFO 2025-09-09 01:16:01 celerate.py:281 step:79K smpl:5M ep:31K epch:18.54 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 01:17:43 celerate.py:281 step:79K smpl:5M ep:31K epch:18.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
+0.507 data_s:0.003
+INFO 2025-09-09 01:19:26 celerate.py:281 step:80K smpl:5M ep:32K epch:18.63 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 01:21:09 celerate.py:281 step:80K smpl:5M ep:32K epch:18.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 01:22:52 celerate.py:281 step:80K smpl:5M ep:32K epch:18.72 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.501 data_s:0.011
+INFO 2025-09-09 01:22:52 celerate.py:295 Checkpoint policy after step 80000
+INFO 2025-09-09 01:24:37 celerate.py:281 step:80K smpl:5M ep:32K epch:18.77 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-09 01:26:20 celerate.py:281 step:80K smpl:5M ep:32K epch:18.82 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-09 01:28:05 celerate.py:281 step:81K smpl:5M ep:32K epch:18.86 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-09 01:29:48 celerate.py:281 step:81K smpl:5M ep:32K epch:18.91 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 01:31:32 celerate.py:281 step:81K smpl:5M ep:32K epch:18.96 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.517 data_s:0.003
+INFO 2025-09-09 01:33:18 celerate.py:281 step:81K smpl:5M ep:32K epch:19.00 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s:
+0.487 data_s:0.042
+INFO 2025-09-09 01:35:01 celerate.py:281 step:81K smpl:5M ep:32K epch:19.05 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.461 data_s:0.054
+INFO 2025-09-09 01:36:46 celerate.py:281 step:82K smpl:5M ep:32K epch:19.10 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.450 data_s:0.071
+INFO 2025-09-09 01:38:29 celerate.py:281 step:82K smpl:5M ep:32K epch:19.14 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.451 data_s:0.064
+INFO 2025-09-09 01:40:12 celerate.py:281 step:82K smpl:5M ep:32K epch:19.19 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-09 01:41:54 celerate.py:281 step:82K smpl:5M ep:33K epch:19.24 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s:
+0.480 data_s:0.030
+INFO 2025-09-09 01:43:37 celerate.py:281 step:82K smpl:5M ep:33K epch:19.28 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.434 data_s:0.080
+INFO 2025-09-09 01:45:20 celerate.py:281 step:83K smpl:5M ep:33K epch:19.33 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 01:47:04 celerate.py:281 step:83K smpl:5M ep:33K epch:19.38 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 01:48:49 celerate.py:281 step:83K smpl:5M ep:33K epch:19.42 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.520 data_s:0.003
+INFO 2025-09-09 01:50:32 celerate.py:281 step:83K smpl:5M ep:33K epch:19.47 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 01:52:15 celerate.py:281 step:83K smpl:5M ep:33K epch:19.52 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 01:53:59 celerate.py:281 step:84K smpl:5M ep:33K epch:19.57 loss:0.085 grdn:0.236 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 01:55:41 celerate.py:281 step:84K smpl:5M ep:33K epch:19.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.508 data_s:0.003
+INFO 2025-09-09 01:57:24 celerate.py:281 step:84K smpl:5M ep:33K epch:19.66 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
+0.500 data_s:0.013
+INFO 2025-09-09 01:59:07 celerate.py:281 step:84K smpl:5M ep:33K epch:19.71 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-09 02:00:50 celerate.py:281 step:84K smpl:5M ep:33K epch:19.75 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-09 02:02:34 celerate.py:281 step:85K smpl:5M ep:34K epch:19.80 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-09 02:04:17 celerate.py:281 step:85K smpl:5M ep:34K epch:19.85 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.511 data_s:0.003
+INFO 2025-09-09 02:05:59 celerate.py:281 step:85K smpl:5M ep:34K epch:19.89 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.507 data_s:0.004
+INFO 2025-09-09 02:07:43 celerate.py:281 step:85K smpl:5M ep:34K epch:19.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.505 data_s:0.011
+INFO 2025-09-09 02:09:26 celerate.py:281 step:85K smpl:5M ep:34K epch:19.99 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.392 data_s:0.124
+INFO 2025-09-09 02:11:14 celerate.py:281 step:86K smpl:5M ep:34K epch:20.03 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.435 data_s:0.100
+INFO 2025-09-09 02:12:57 celerate.py:281 step:86K smpl:5M ep:34K epch:20.08 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-09 02:14:41 celerate.py:281 step:86K smpl:6M ep:34K epch:20.13 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s:
+0.516 data_s:0.003
+INFO 2025-09-09 02:16:25 celerate.py:281 step:86K smpl:6M ep:34K epch:20.17 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s:
+0.514 data_s:0.003
+INFO 2025-09-09 02:18:09 celerate.py:281 step:86K smpl:6M ep:34K epch:20.22 loss:0.085 grdn:0.238 lr:2.5e-06 updt_s:
+0.506 data_s:0.010
+INFO 2025-09-09 02:19:53 celerate.py:281 step:87K smpl:6M ep:34K epch:20.27 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.515 data_s:0.003
+INFO 2025-09-09 02:21:35 celerate.py:281 step:87K smpl:6M ep:34K epch:20.31 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.501 data_s:0.007
+INFO 2025-09-09 02:23:17 celerate.py:281 step:87K smpl:6M ep:34K epch:20.36 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s:
+0.432 data_s:0.080
+INFO 2025-09-09 02:25:01 celerate.py:281 step:87K smpl:6M ep:35K epch:20.41 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.445 data_s:0.073
+INFO 2025-09-09 02:26:43 celerate.py:281 step:87K smpl:6M ep:35K epch:20.45 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
+0.401 data_s:0.107
+INFO 2025-09-09 02:28:26 celerate.py:281 step:88K smpl:6M ep:35K epch:20.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.502 data_s:0.009
+INFO 2025-09-09 02:30:09 celerate.py:281 step:88K smpl:6M ep:35K epch:20.55 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s:
+0.491 data_s:0.021
+INFO 2025-09-09 02:31:52 celerate.py:281 step:88K smpl:6M ep:35K epch:20.59 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.391 data_s:0.124
+INFO 2025-09-09 02:33:35 celerate.py:281 step:88K smpl:6M ep:35K epch:20.64 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.332 data_s:0.180
+INFO 2025-09-09 02:35:18 celerate.py:281 step:88K smpl:6M ep:35K epch:20.69 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:
+0.333 data_s:0.181
+INFO 2025-09-09 02:37:02 celerate.py:281 step:89K smpl:6M ep:35K epch:20.74 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s:
+0.332 data_s:0.185
+INFO 2025-09-09 02:38:47 celerate.py:281 step:89K smpl:6M ep:35K epch:20.78 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.332 data_s:0.190
+INFO 2025-09-09 02:40:30 celerate.py:281 step:89K smpl:6M ep:35K epch:20.83 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.330 data_s:0.187
+INFO 2025-09-09 02:42:12 celerate.py:281 step:89K smpl:6M ep:35K epch:20.88 loss:0.085 grdn:0.243 lr:2.5e-06 updt_s:
+0.331 data_s:0.177
+INFO 2025-09-09 02:43:56 celerate.py:281 step:89K smpl:6M ep:35K epch:20.92 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.330 data_s:0.185
+INFO 2025-09-09 02:45:39 celerate.py:281 step:90K smpl:6M ep:36K epch:20.97 loss:0.087 grdn:0.252 lr:2.5e-06 updt_s:
+0.335 data_s:0.182
+INFO 2025-09-09 02:47:25 celerate.py:281 step:90K smpl:6M ep:36K epch:21.02 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s:
+0.330 data_s:0.197
+INFO 2025-09-09 02:49:08 celerate.py:281 step:90K smpl:6M ep:36K epch:21.06 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.330 data_s:0.182
+INFO 2025-09-09 02:50:51 celerate.py:281 step:90K smpl:6M ep:36K epch:21.11 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.334 data_s:0.181
+INFO 2025-09-09 02:52:34 celerate.py:281 step:90K smpl:6M ep:36K epch:21.16 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.330 data_s:0.183
+INFO 2025-09-09 02:54:17 celerate.py:281 step:91K smpl:6M ep:36K epch:21.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s:
+0.342 data_s:0.173
+INFO 2025-09-09 02:56:01 celerate.py:281 step:91K smpl:6M ep:36K epch:21.25 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s:
+0.331 data_s:0.189
+INFO 2025-09-09 02:57:44 celerate.py:281 step:91K smpl:6M ep:36K epch:21.30 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.329 data_s:0.182
+INFO 2025-09-09 02:59:27 celerate.py:281 step:91K smpl:6M ep:36K epch:21.34 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s:
+0.341 data_s:0.175
+INFO 2025-09-09 03:01:11 celerate.py:281 step:91K smpl:6M ep:36K epch:21.39 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.330 data_s:0.188
+INFO 2025-09-09 03:02:54 celerate.py:281 step:92K smpl:6M ep:36K epch:21.44 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.328 data_s:0.186
+INFO 2025-09-09 03:04:39 celerate.py:281 step:92K smpl:6M ep:36K epch:21.48 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.329 data_s:0.195
+INFO 2025-09-09 03:06:22 celerate.py:281 step:92K smpl:6M ep:36K epch:21.53 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.330 data_s:0.183
+INFO 2025-09-09 03:08:04 celerate.py:281 step:92K smpl:6M ep:37K epch:21.58 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.462 data_s:0.051
+INFO 2025-09-09 03:09:48 celerate.py:281 step:92K smpl:6M ep:37K epch:21.62 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s:
+0.407 data_s:0.108
+INFO 2025-09-09 03:11:32 celerate.py:281 step:93K smpl:6M ep:37K epch:21.67 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
+0.333 data_s:0.185
+INFO 2025-09-09 03:13:15 celerate.py:281 step:93K smpl:6M ep:37K epch:21.72 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s:
+0.329 data_s:0.187
+INFO 2025-09-09 03:14:58 celerate.py:281 step:93K smpl:6M ep:37K epch:21.77 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.357 data_s:0.156
+INFO 2025-09-09 03:16:41 celerate.py:281 step:93K smpl:6M ep:37K epch:21.81 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.487 data_s:0.027
+INFO 2025-09-09 03:18:25 celerate.py:281 step:93K smpl:6M ep:37K epch:21.86 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-09 03:20:08 celerate.py:281 step:94K smpl:6M ep:37K epch:21.91 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s:
+0.512 data_s:0.003
+INFO 2025-09-09 03:21:51 celerate.py:281 step:94K smpl:6M ep:37K epch:21.95 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.508 data_s:0.004
+INFO 2025-09-09 03:23:38 celerate.py:281 step:94K smpl:6M ep:37K epch:22.00 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
+0.429 data_s:0.104
+INFO 2025-09-09 03:25:20 celerate.py:281 step:94K smpl:6M ep:37K epch:22.05 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.328 data_s:0.183
+INFO 2025-09-09 03:27:04 celerate.py:281 step:94K smpl:6M ep:37K epch:22.09 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s:
+0.329 data_s:0.191
+INFO 2025-09-09 03:28:47 celerate.py:281 step:95K smpl:6M ep:37K epch:22.14 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s:
+0.329 data_s:0.186
+INFO 2025-09-09 03:30:30 celerate.py:281 step:95K smpl:6M ep:38K epch:22.19 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.346 data_s:0.166
+INFO 2025-09-09 03:32:13 celerate.py:281 step:95K smpl:6M ep:38K epch:22.23 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s:
+0.334 data_s:0.181
+INFO 2025-09-09 03:33:56 celerate.py:281 step:95K smpl:6M ep:38K epch:22.28 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.402 data_s:0.110
+INFO 2025-09-09 03:35:39 celerate.py:281 step:95K smpl:6M ep:38K epch:22.33 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s:
+0.353 data_s:0.161
+INFO 2025-09-09 03:37:22 celerate.py:281 step:96K smpl:6M ep:38K epch:22.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
+0.356 data_s:0.158
+INFO 2025-09-09 03:39:04 celerate.py:281 step:96K smpl:6M ep:38K epch:22.42 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.379 data_s:0.131
+INFO 2025-09-09 03:40:49 celerate.py:281 step:96K smpl:6M ep:38K epch:22.47 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
+0.344 data_s:0.175
+INFO 2025-09-09 03:42:32 celerate.py:281 step:96K smpl:6M ep:38K epch:22.51 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
+0.331 data_s:0.185
+INFO 2025-09-09 03:44:15 celerate.py:281 step:96K smpl:6M ep:38K epch:22.56 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
+0.331 data_s:0.183
+INFO 2025-09-09 03:45:58 celerate.py:281 step:97K smpl:6M ep:38K epch:22.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
+0.330 data_s:0.184
+INFO 2025-09-09 03:47:43 celerate.py:281 step:97K smpl:6M ep:38K epch:22.65 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s:
+0.331 data_s:0.188
+INFO 2025-09-09 03:49:27 celerate.py:281 step:97K smpl:6M ep:38K epch:22.70 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.333 data_s:0.185
+INFO 2025-09-09 03:51:10 celerate.py:281 step:97K smpl:6M ep:39K epch:22.75 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s:
+0.330 data_s:0.185
+INFO 2025-09-09 03:52:54 celerate.py:281 step:97K smpl:6M ep:39K epch:22.79 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s:
+0.330 data_s:0.192
+INFO 2025-09-09 03:54:37 celerate.py:281 step:98K smpl:6M ep:39K epch:22.84 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.329 data_s:0.185
+INFO 2025-09-09 03:56:21 celerate.py:281 step:98K smpl:6M ep:39K epch:22.89 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
+0.329 data_s:0.187
+INFO 2025-09-09 03:58:04 celerate.py:281 step:98K smpl:6M ep:39K epch:22.94 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.329 data_s:0.185
+INFO 2025-09-09 03:59:46 celerate.py:281 step:98K smpl:6M ep:39K epch:22.98 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
+0.329 data_s:0.183
+INFO 2025-09-09 04:01:32 celerate.py:281 step:98K smpl:6M ep:39K epch:23.03 loss:0.087 grdn:0.250 lr:2.5e-06 updt_s:
+0.376 data_s:0.151
+INFO 2025-09-09 04:03:16 celerate.py:281 step:99K smpl:6M ep:39K epch:23.08 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
+0.329 data_s:0.187
+INFO 2025-09-09 04:04:59 celerate.py:281 step:99K smpl:6M ep:39K epch:23.12 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
+0.379 data_s:0.136
+INFO 2025-09-09 04:06:42 celerate.py:281 step:99K smpl:6M ep:39K epch:23.17 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
+0.513 data_s:0.003
+INFO 2025-09-09 04:08:25 celerate.py:281 step:99K smpl:6M ep:39K epch:23.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
+0.510 data_s:0.003
+INFO 2025-09-09 04:10:07 celerate.py:281 step:99K smpl:6M ep:39K epch:23.26 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
+0.470 data_s:0.039
+INFO 2025-09-09 04:11:50 celerate.py:281 step:100K smpl:6M ep:39K epch:23.31 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s
+:0.347 data_s:0.169
+INFO 2025-09-09 04:13:34 celerate.py:281 step:100K smpl:6M ep:40K epch:23.36 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s
+:0.330 data_s:0.185
+INFO 2025-09-09 04:15:17 celerate.py:281 step:100K smpl:6M ep:40K epch:23.40 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s
+:0.356 data_s:0.156
+INFO 2025-09-09 04:15:17 celerate.py:295 Checkpoint policy after step 100000
+INFO 2025-09-09 04:15:18 celerate.py:359 End of training
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
+(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ tmux capture-pane -pS - > tmux_log.txt
+
+
+
+
+
+
+
+
+
+

From aa40c8c813ee35c8c4289675d00cb75ea7609733 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Wed, 10 Sep 2025 23:24:18 +0200
Subject: [PATCH 03/12] More things

---
 examples/6_evaluate_libero.sh                 |    4 +-
 examples/9_evaluate_must.sh                   | 2811 +++++++++++++++++
 src/lerobot/envs/configs.py                   |    2 -
 src/lerobot/envs/factory.py                   |   47 +-
 src/lerobot/envs/libero copy.py               |  326 ++
 src/lerobot/envs/libero.py                    |   15 +-
 src/lerobot/envs/libero2.py                   |  308 ++
 src/lerobot/envs/utils.py                     |   49 +
 src/lerobot/policies/factory.py               |    2 +-
 .../policies/smolpi0/modeling_smolpi0.py      |  110 +-
 .../policies/smolvla/modeling_smolvla.py      |    3 +-
 src/lerobot/scripts/eval.py                   |   41 +-
 12 files changed, 3670 insertions(+), 48 deletions(-)
 create mode 100644 examples/9_evaluate_must.sh
 create mode 100644 src/lerobot/envs/libero copy.py
 create mode 100644 src/lerobot/envs/libero2.py

diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh
index ad6ca0f13..c15d71c95 100644
--- a/examples/6_evaluate_libero.sh
+++ b/examples/6_evaluate_libero.sh
@@ -14,11 +14,11 @@ export HF_DATASETS_OFFLINE=1
 export HF_HUB_OFFLINE=1
 export TOKENIZERS_PARALLELISM=false
 export MUJOCO_GL=egl
-export CUDA_VISIBLE_DEVICES=3
+export CUDA_VISIBLE_DEVICES=2
 
 # CONFIGURATION
 POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model"
-POLICY_PATH="/raid/jade/models/smolvlamust"
+POLICY_PATH="/raid/jade/logs/lerobot/lerobot_new_HuggingfaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model"
 TASK=libero_spatial
 ENV_TYPE="libero"
 BATCH_SIZE=10
diff --git a/examples/9_evaluate_must.sh b/examples/9_evaluate_must.sh
new file mode 100644
index 000000000..534153330
--- /dev/null
+++ b/examples/9_evaluate_must.sh
@@ -0,0 +1,2811 @@
+#!/bin/bash
+
+#SBATCH --job-name=lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --gpus-per-node=1
+#SBATCH --mail-type=END,FAIL
+#SBATCH --output=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/slurm/lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5.out
+###SBATCH --nodelist=jean-zay-a101
+#SBATCH --cpus-per-task=45
+###SBATCH --exclusive
+#SBATCH --time=15:00:00
+#SBATCH --mail-user=mustafa.shukor@isir.upmc.fr
+
+##SBATCH --partition=gpu_p2
+##SBATCH --qos=qos_gpu-t3
+###SBATCH -C v100-32g
+##SBATCH -A dyf@v100
+
+##SBATCH --partition=gpu_p5
+##SBATCH -C a100
+###SBATCH -A dyf@a100
+##SBATCH -A lqm@a100
+##SBATCH --qos=qos_gpu_a100-dev 
+##SBATCH --qos=qos_gpu_a100-t3 
+
+#SBATCH --partition=gpu_p6
+#SBATCH -C h100
+#SBATCH -A lqm@h100
+###SBATCH --qos=qos_gpu_h100-dev 
+#SBATCH --qos=qos_gpu_h100-t3 
+
+###SBATCH --begin=now+2hour
+
+# cd ~/lerobot_pi
+# source ~/.bashrc
+# source activate lerobot
+# export LC_ALL=C
+
+# rm core-*
+export CUDA_VISIBLE_DEVICES=3
+# storage / caches
+RAID=/raid/jade
+export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
+export HF_HOME=$RAID/.cache/huggingface
+export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
+export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
+export WANDB_CACHE_DIR=$RAID/.cache/wandb
+export TMPDIR=$RAID/.cache/tmp
+mkdir -p $TMPDIR
+export WANDB_MODE=offline
+export HF_DATASETS_OFFLINE=1
+# export HF_HUB_OFFLINE=1
+export TOKENIZERS_PARALLELISM=false
+export MUJOCO_GL=egl
+export CUDA_VISIBLE_DEVICES=3
+
+PORT=29512
+
+## then later
+## wandb sync wandb/offline-run-*
+
+
+ENV=libero
+
+# TASK=libero_10
+TASK=libero_spatial
+# TASK=libero_spatial
+# TASK=libero_10
+# TASK=libero_spatial
+
+
+POLICY_NAME=smolpi0
+
+POLICY=smolpi0
+ENV=libero
+
+
+
+
+
+
+CKPT_KEYS_MAPPING=model._orig_mod.//model.
+LOAD_VLM_WEIGHTS=true
+PEFT_METHOD=freeze
+SELF_ATTN_ONLY_ACTIONS=false
+CAUSAL_ATTENTION_ON_HISTORY=false
+
+PREDICT_RELATIVE_ACTIONS=false
+RELATIVE_ACTIONS_MODE=first
+SHUFFLE_CAMERA_POSITIONS=false
+
+VLM_IMG_SIZE=-1
+REGRESSION_LOSS=false
+
+
+# ## Baseline for ablation study
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=max_length
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=max_length
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=false
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=max_length
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=self_attn
+# STATE_TO_PREFIX=false
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=self_attn
+# STATE_TO_PREFIX=false
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2250/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm22b/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest_explay0_vlml0_causalacttrue/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=self_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreffalse/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=false
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0_smolvlm2500/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=self_attn
+# STATE_TO_PREFIX=false
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=self_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=8
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml8_causalactfalse_sa0/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=24
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml24_causalactfalse_sa0/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=100
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk100/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=30
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk30/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=10
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk10/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=1
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=16
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay16_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=2
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs2/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=3
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs3_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest/checkpoints/last/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="observation.state"
+# N_OBS_STEPS=3
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststates/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="observation.state,image"
+# N_OBS_STEPS=3
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststatesimgs/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=1
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.75/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.25
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.25/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="observation.state,image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="observation.state,image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="observation.state,image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1250_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-256M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm12b_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa2_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# PEFT_METHOD=lora
+# PEFT_TARGET_MODEL=text
+# LORA_TARGET_MODULES=q_proj,v_proj,k_proj
+# LORA_R=32
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# PEFT_METHOD=lora
+# PEFT_TARGET_MODEL=text
+# LORA_TARGET_MODULES=q_proj,v_proj,k_proj
+# LORA_R=32
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5_loraqkv/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# PEFT_METHOD=lora
+# PEFT_TARGET_MODEL=text
+# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj
+# LORA_R=32
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# PEFT_METHOD=lora
+# PEFT_TARGET_MODEL=text
+# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj
+# LORA_R=32
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# PEFT_METHOD=lora
+# PEFT_TARGET_MODEL=text
+# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj
+# LORA_R=32
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=false
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=true
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saacttrue/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=max_length
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_droptrue_max_length/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=max_length
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=max_length
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptdroidfull/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv3freeze/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2freeze/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans3true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=self_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=self_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=false
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze25_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze50_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze75_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze100_trans1false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans6true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans4true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans7true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans5true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans2true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans8true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans9true/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.25
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=1
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.25
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=1
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="observation.state"
+# N_OBS_STEPS=3
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image,observation.state"
+# N_OBS_STEPS=3
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image,observation.state"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5100000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image,observation.state"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image,observation.state"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0true_decaylr2.5e-630000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5200000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6200000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6200000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base 
+
+
+
+# PREDICT_RELATIVE_ACTIONS=true
+# RELATIVE_ACTIONS_MODE=relative
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruerelative/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# PREDICT_RELATIVE_ACTIONS=true
+# RELATIVE_ACTIONS_MODE=first
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruefirst/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base 
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv1v2freezebs64transv0_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# REGRESSION_LOSS=true
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# REGRESSION_LOSS=true
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# REGRESSION_LOSS=true
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+
+# REGRESSION_LOSS=true
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.5
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=0
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=0
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+
+CAUSAL_ATTENTION_ON_HISTORY=true
+SELF_ATTN_ONLY_ACTIONS=false
+EXPERT_WIDTH_MULTIPLIER=0.75
+PAST_OBS_KEYS="image"
+N_OBS_STEPS=1
+NUM_EXPERT_LAYERS=0
+CHUNK_SIZE=50
+NUM_VLM_LAYERS=16
+PAD_LANG_TO=longest
+EVAL_CKPT=/raid/jade/models/smolvlamust
+ADD_IMAGE_TOKENS=true
+ATTN_MODE=cross_attn
+STATE_TO_PREFIX=true
+CAUSAL_ACTION_ATTENTION_MASK=true
+SELF_ATTN_EVERY_N_LAYERS=2
+VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
+
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+# CAUSAL_ATTENTION_ON_HISTORY=true
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
+
+
+# # TASK=libero_spatial
+# MULTITASK_EVAL=false
+# N_EPISODES=50
+
+# TASK=libero_spatial,libero_object,libero_goal,libero_10
+MULTITASK_EVAL=true
+# N_EPISODES=5
+N_EPISODES=1
+
+# MAX_PARRALLEL_TASKS=5
+# MAX_PARRALLEL_TASKS=2
+MAX_PARRALLEL_TASKS=1
+
+# NUM_EVALS=2
+# SEEDS=(1000 5000)
+SEEDS=(5000)
+# ACTION_STEPS_LIST=(1 10 30 50)
+ACTION_STEPS_LIST=(1)
+# ACTION_STEPS_LIST=(50)
+TASK_LIST=(libero_spatial libero_object libero_goal libero_10)
+TASK_LIST=(libero_spatial)
+for SEED in "${SEEDS[@]}"; do
+    for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do
+        for TASK in "${TASK_LIST[@]}"; do
+            echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS | EVAL SEED=$SEED"
+            python src/lerobot/scripts/eval.py \
+                --output_dir=/raid/jade/logs/lerobot/tmp \
+                --env.type=$ENV \
+                --env.task=$TASK \
+                --eval.batch_size=$N_EPISODES \
+                --eval.n_episodes=$N_EPISODES \
+                --seed=$SEED \
+                --policy.use_amp=false \
+                --policy.path=$EVAL_CKPT \
+                --policy.n_action_steps=$N_ACTION_STEPS \
+                --policy.checkpoint_path=$EVAL_CKPT \
+                --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \
+                --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
+                --policy.attention_mode=$ATTN_MODE \
+                --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \
+                --policy.state_to_prefix=$STATE_TO_PREFIX \
+                --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
+                --policy.pad_language_to=$PAD_LANG_TO \
+                --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
+                --policy.vlm_model_name=$VLM_NAME \
+                --policy.num_vlm_layers=$NUM_VLM_LAYERS \
+                --policy.chunk_size=$CHUNK_SIZE \
+                --policy.n_obs_steps=$N_OBS_STEPS \
+                --policy.past_obs_keys=$PAST_OBS_KEYS \
+                --policy.num_expert_layers=$NUM_EXPERT_LAYERS \
+                --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
+                --policy.peft_method=$PEFT_METHOD \
+                --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \
+                --policy.causal_attention_on_history=$CAUSAL_ATTENTION_ON_HISTORY \
+                --policy.predict_relative_actions=$PREDICT_RELATIVE_ACTIONS --policy.relative_actions_mode=$RELATIVE_ACTIONS_MODE --policy.shuffle_camera_positions=$SHUFFLE_CAMERA_POSITIONS \
+                --policy.vlm_img_size=$VLM_IMG_SIZE \
+                --policy.regression_loss=$REGRESSION_LOSS 
+                # --policy.peft_config.r=$LORA_R --policy.peft_config.target_modules=$LORA_TARGET_MODULES  --policy.peft_method=$PEFT_METHOD --policy.peft_target_model=$PEFT_TARGET_MODEL 
+
+            echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS | EVAL SEED=$SEED"
+            echo "------------------------------------------------------"
+        done
+    done
+done
+
+
+# ############################################################################################################################################
+# ############################################################################################################################################
+# ############################################################################################################################################
+# ########### Offline eval
+
+
+# # ############################
+# # # Community datasets V1
+# # # REPO_ID=pranavsaroha/so100_legos4,pranavsaroha/so100_onelego2,jpata/so100_pick_place_tangerine,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,HITHY/so100_strawberry,vladfatu/so100_above,koenvanwijk/orange50-1,koenvanwijk/orange50-variation-2,FeiYjf/new_GtoR,CSCSXX/pick_place_cube_1.18,vladfatu/so100_office,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,Ityl/so100_recording2,dragon-95/so100_sorting_2,dragon-95/so100_sorting_3,aractingi/push_cube_offline_data,HITHY/so100_peach3,HITHY/so100_peach4,shreyasgite/so100_legocube_50,shreyasgite/so100_base_env,triton7777/so100_dataset_mix,Deason11/Open_the_drawer_to_place_items,Deason11/PLACE_TAPE_PUSH_DRAWER,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th
+# # # Inconsistent actions dim: Deason11/Open_the_drawer_to_place_items, Deason11/PLACE_TAPE_PUSH_DRAWER
+# # # Filtered datasets
+# # REPO_ID=pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th
+# # SAMPLING_WEIGHTS=
+# # DATASET_NAME=so100_community_v1
+
+
+# # # Community datasets V2
+# # # Inconsistent actions: 1g0rrr/sam_openpi_solder1, 1g0rrr/sam_openpi03, 1g0rrr/sam_openpi_solder2
+# # # Other issues: pierfabre/rabbit bensprenger/right_arm_p_brick_in_box_with_y_noise_v0 pierfabre/horse pierfabre/pig2 pierfabre/pig3 pierfabre/cow2,pierfabre/sheep
+# # # REPO_ID=Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7
+# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7
+# # SAMPLING_WEIGHTS=
+# # DATASET_NAME=so100_community_v2
+
+# # Community datasets V1+V2
+# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7,pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th
+# REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2
+# SAMPLING_WEIGHTS=
+
+# # # Community V3
+# # # issues, yskim2025/unitylerobot (version), cranberrysoft/so100 (don't exist),29  datasets different actions: nguyen-v/so100_rotate_red_button satvikahuja/mixer_on_off_new_1 ...
+# # REPO_ID=satvikahuja/mixer_on_off_new_1,aergogo/so100_pick_place,andy309/so100_0314_fold_cloths,jchun/so100_pickplace_small_20250323_120056,astroyat/cube,Ofiroz91/so_100_cube2bowl,HappyPablo/dec3_data2,ZCM5115/so100_1210,francescocrivelli/orange_feeding,francescocrivelli/carrot_eating,0x00raghu/toffee_red,0x00raghu/toffee_red_2,0x00raghu/toffee_red_3__,0x00raghu/toffee_blue,0x00raghu/toffee_blue_2,0x00raghu/toffee_to_hand_1,0x00raghu/toffee_to_hand_2,liyitenga/so100_bi_hello,liyitenga/so100_bi_giveme5,ZCM5115/so100_2Arm3cameras_movebox,pranavsaroha/so100_carrot_1,pranavsaroha/so100_carrot_3,pranavsaroha/so100_carrot_4,maximilienroberti/so100_lego_red_box,pranavsaroha/so100_squishy,rabhishek100/so100_train_dataset,pranavsaroha/so100_squishy100,swarajgosavi/kikobot_pusht_real_v2,pandaRQ/pickmed,swarajgosavi/act_kikobot_pusht_real,pranavsaroha/so100_squishy2colors,pranavsaroha/so100_squishy2colors_1,Chojins/chess_game_001_white,jmrog/so100_sweet_pick,Chojins/chess_game_002_white,pranavsaroha/so100_squishy2colors_2_new,Chojins/chess_game_003_white,aractingi/pick_place_lego_cube,Chojins/chess_game_004_white,Chojins/chess_game_005_white,Chojins/chess_game_006_white,Chojins/chess_game_007_white,koenvanwijk/blue2,jlitch/so100multicam3,koenvanwijk/blue52,jlitch/so100multicam6,aractingi/pick_place_lego_cube_1,jlitch/so100multicam7,vladfatu/so100_ds,Chojins/chess_game_000_white,HITHY/so100-kiwi,HITHY/so100_peach1,HITHY/so100_redstrawberry,satvikahuja/orange_mixer_1,satvikahuja/mixer_on_off,satvikahuja/orange_pick_place_new1,satvikahuja/mixer_on_off_new,danmac1/real_real332,FeiYjf/Makalu_push,liyitenga/so100_pick_taffy1,chmadran/so100_dataset04,FeiYjf/Maklu_dataset,FeiYjf/new_Dataset,liyitenga/so100_pick_taffy2,satvikahuja/mixer_on_off_new_4,CSCSXX/pick_place_cube_1.17,liyitenga/so100_pick_taffy3,liyitenga/so100_pick_taffy4,yuz1wan/so100_pick_pink,yuz1wan/so100_pick_wahaha,yuz1wan/so100_pp_pink,yuz1wan/so100_pour_cup,liyitenga/so100_pick_taffy5,liyitenga/so100_pick_taffy6,yuz1wan/so100_button,yuz1wan/so100_pickplace,liyitenga/so100_pick_taffy7,FeiYjf/push_gg,FeiYjf/push_0094,swarajgosavi/act_kikobot_block_real,liyitenga/so100_pick_taffy8,phospho-ai/OrangeBrick3Cameras,vaishanthr/toy_pick_place,SeanLMH/so100_picknplace_v2,pepijn223/yellow_lego_in_box1,DimiSch/so100_50ep_2,DimiSch/so100_50ep_3,SeanLMH/so100_picknplace,nbaron99/so100_pick_and_place2,chmadran/so100_dataset08,vaishanthr/toy_pickplace_50ep,Beegbrain/pick_place_green_block_lr,Ityl/so100_recording1,vaishanthr/toy_pickplace,ad330/so100_box_pickPlace,Beegbrain/so100_put_cube_cup,aractingi/push_green_cube_hf,aractingi/push_green_cube_hf_cropped_resized,carpit680/giraffe_task,carpit680/giraffe_sock_demo_1,DimiSch/so100_terra_50_2,carpit680/giraffe_sock_demo_2,aractingi/push_cube_to_face_reward,aractingi/push_cube_to_face_reward_cropped_resized,aractingi/push_cube_reward_data,aractingi/push_cube_reward_data_cropped_resized,aractingi/push_cube_offline_data_cropped_resized,aractingi/push_cube_front_side_reward,aractingi/push_cube_front_side_reward_cropped_resized,aractingi/push_cube_front_side_reward_long,aractingi/push_cube_front_side_reward_long_cropped_resized,aractingi/push_cube_reward,aractingi/push_cube_reward_cropped_resized,aractingi/push_cube_square_reward_cropped_resized,aractingi/push_cube_square_reward_1,aractingi/push_cube_square_reward_1_cropped_resized,aractingi/push_cube_square_light_reward,aractingi/push_cube_square_light_offline_demo,aractingi/push_cube_square_light_offline_demo_cropped_resized,denghj/dataset_red_tape01,aractingi/push_cube_square_offline_demo,aractingi/push_cube_square_offline_demo_cropped_resized,Beegbrain/stack_two_cubes,FeiYjf/Test_NNNN,LegrandFrederic/Orange-brick-lower-resolution,aractingi/pick_place_lego_cube_cropped_resized,aractingi/push_cube_overfit,aractingi/push_cube_overfit_cropped_resized,HITHY/so100_peach,zaringleb/so100_cube_2,andreasBihlmaier/dual_arm_transfer_2025_02_16,zaringleb/so100_cube_4_binary,1g0rrr/reward_pickplace1,1g0rrr/reward_pickplace1_cropped_resized,FeiYjf/Hold_Pieces,FeiYjf/Grab_Pieces,hegdearyandev/so100_eraser_cup_v1,jbraumann/so100_1902,liyitenga/so100_pick_taffy10,mikechambers/block_cup_5,zaringleb/so100_cube_5_linear,yuz1wan/so100_pickplace_0223_2,yuz1wan/so100_pickplace_0223_3,samsam0510/mj_data_temp,samsam0510/tape_insert_1,samsam0510/tape_insert_2,pengjunkun/so100_push_to_hole,Deason11/Random_Kitchen,1g0rrr/reward_dataset_name2,1g0rrr/reward_dataset_name2_cropped_resized,1g0rrr/offline_dataset_name2,1g0rrr/offline_dataset_name2_cropped_resized,aractingi/push_cube_simp_cropped_resized,danielkr452/so100_work6,Loki0929/so100_100,yuz1wan/so100_fold_0227_1,yuz1wan/so100_fold_0227_2,speedyyoshi/so100_grasp_pink_block,lirislab/stack_two_red_cubes,lirislab/red_cube_into_mug,lirislab/green_lego_block_into_mug,lirislab/green_lego_block_into_mug_easy,kevin510/lerobot-cat-toy-placement,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA_BOX,wangjl1512/pour_water,airthebear/so100_GL,zijian2022/noticehuman1,zijian2022/noticehuman2,kantine/so100_kapla_tower6,zijian2022/noticehuman5,zijian2022/llm40,Ashton3/lerobot-aloha,zijian2022/noticehuman50,AaronNewman/screwdriver_task_batch1,AaronNewman/screwdriver_task_batch2,AaronNewman/screwdriver_task_batch3,zijian2022/noticehuman60,zijian2022/noticehuman70,Bartm3/tape_to_bin,liuhuanjim013/so100_th_1,Pi-robot/barbecue_flip,Pi-robot/barbecue_put,wangjl1512/doll,sshh11/so100_orange_50ep_1,sshh11/so100_orange_50ep_2,DorayakiLin/so100_pick_cube_in_box,Bartm3/tape_to_bin2,luke250305/play_dice_250311.1,andy309/so100_0311_1152,sihyun77/suho_so100,sihyun77/si_so100,shreyasgite/so100_base_left,sihyun77/suho_red,liuhuanjim013/so100_block,andy309/so100_0313_no_wrist_camera,zijian2022/l9,zijian2022/n1_2,DorayakiLin/so100_stack_cube,andy309/so100_0313_no_wrist_camera_with_two_arms_cloths,joaoocruz00/so100_makeitD1,zijian2022/l10_1,zijian2022/l10_5,sihyun77/suho_red2,sihyun77/suho_angel,sihyun77/sihyun_king,acrampette/third_arm_01,Winster/so100_cube,1g0rrr/sam_openpi03,thedevansh/mar16_1336,hkphoooey/throw_stuffie,doujiangwang/task1_10epi_100000step,sihyun77/sihyun_3_17_1,acrampette/third_arm_02,imsyed00/so100_yellowbowl_pickplace_1,kumarhans/so100_tape_task,sihyun77/sihyun_main,doujiangwang/task2_10epi_100000step,kantine/industrial_robothon_buttons_expert,kantine/industrial_robothon_buttons_anomaly,kantine/industrial_robothon_hatchAndProbe_expert,kantine/industrial_robothon_hatchAndProbe_anomaly,Odog16/so100_tea_towel_folding_v1,zijian2022/so100_318,zijian2022/so100_318_1,Congying1112/so100_place_blue_bottle_with_two_cameras,Congying1112/so100_place_blue_bottle_with_two_cameras2,Congying1112/so100_place_blue_bottle_with_single_camera,pietroom/first_task_short,kantine/industrial_screws_sorting_expert,kantine/industrial_screws_sorting_anomaly,pietroom/second_task,zijian2022/c0,doujiangwang/task4_10epi_100000step,Congying1112/so100_switch_with_onhand_camera,HYAIYN/so100_get_orange_10epi,doujiangwang/task5_10epi_100000step,1g0rrr/sam_openpi_cube_low10,1g0rrr/sam_openpi_cube_top10,1g0rrr/sam_openpi_wire10,1g0rrr/sam_openpi_solder1,1g0rrr/sam_openpi_solder2,wcode/so100_put_pen_50,jchun/so100_pickplace_small_20250322_193929,bnarin/so100_tic_tac_toe_we_do_it_live,dc2ac/so100-t5,chmadran/so100_home_dataset,baladhurgesh97/so100_final_picking_3,bnarin/so100_tic_tac_toe_move_0_0,bnarin/so100_tic_tac_toe_move_1_0,bnarin/so100_tic_tac_toe_move_2_1,bnarin/so100_tic_tac_toe_move_4_0,zaringleb/so100_cube_6_2d,andlyu/so100_indoor_0,andlyu/so100_indoor_2,Winster/so100_sim,badwolf256/so100_twin_cam_duck,Congying1112/so100_simplepick_with_2_cameras_from_top,andlyu/so100_indoor_4,Zak-Y/so100_grap_dataset,kantine/domotic_pouringCoffee_expert,kantine/domotic_pouringCoffee_anomaly,lucasngoo/so100_strawberry_grape,kantine/domotic_makingCoffee_expert,kantine/domotic_makingCoffee_anomaly,ZGGZZG/so100_drop1,kantine/industrial_soldering_expert,kantine/industrial_soldering_anomaly,Yotofu/so100_sweeper_shoes,kantine/domotic_dishTidyUp_expert,kantine/domotic_dishTidyUp_anomaly,kantine/domotic_groceriesSorting_expert,kantine/domotic_groceriesSorting_anomaly,badwolf256/so100_twin_cam_duck_v2,kantine/domotic_vegetagblesAndFruitsSorting_expert,kantine/domotic_vegetagblesAndFruitsSorting_anomaly,kantine/domotic_setTheTable_expert,kantine/domotic_setTheTable_anomaly,therarelab/so100_pick_place,abhisb/so100_51_ep,andlyu/so100_indoor_val_0,allenchienxxx/so100Test,lizi178119985/so100_jia,badwolf256/so100_twin_cam_duck_v3,andrewcole712/so100_tape_bin_place,Gano007/so100_lolo,Zak-Y/so100_three_cameras_dataset,Gano007/so100_doliprane,XXRRSSRR/so100_v3_num_episodes_50,zijian2022/assemblyarm2,ganker5/so100_action_20250403,andlyu/so100_indoor_val2,Gano007/so100_gano,paszea/so100_whale_grab,paszea/so100_whale,Clementppr/lerobot_pick_and_place_dataset_world_model,andlyu/so100_indoor_10,RasmusP/so100_dataset50ep_a,RasmusP/so100_dataset50ep,Gano007/so100_second,zaringleb/so100_cude_linear_and_2d_comb,dsfsg/grasp_pens,zijian2022/digitalfix,zijian2022/digitalfix2,zijian2022/digitalfix3,T1g3rGE/so100_pickplace_small_20250407_171912,sihyun77/mond_13,abokinala/sputnik_100_11_pick_place_container,dsfsg/bring_bottle,duthvik/sputnik_100_13_pick_place_container,abokinala/sputnik_100_12_pick_place_container,Mwuqiu/so100_0408,AK51/4090_01,356c/so100_rope_reposition_1,paszea/so100_lego_mix,abokinala/sputnik_100_14_pick_place_container,abokinala/sputnik_100_23_pick_place_surface,jiajun001/eraser00_2,jlesein/TestBoulon2,duthvik/sputnik_100_31_pour_liquid,duthvik/sputnik_100_24_pick_place_surface,duthvik/sputnik_100_25_pick_place_surface,duthvik/sputnik_100_17_pick_place_container,duthvik/sputnik_100_26_pick_place_surface,VoicAndrei/so100_banana_to_plate_rebel_full,isadev/bougies1,danaaubakirova/so100_task_1,danaaubakirova/so100_task_2,danaaubakirova/so100_task_3,danaaubakirova/so100_task_4,sixpigs1/so100_pick_cube_in_box_error,sixpigs1/so100_push_cube_error,sixpigs1/so100_pull_cube_error,isadev/bougies2,therarelab/med_dis_rare_6,duthvik/sputnik_100_27_pick_place_surface,zijian2022/closer3,duthvik/sputnik_100_41_custom_tasks,duthvik/sputnik_100_42_custom_tasks,duthvik/sputnik_100_43_custom_tasks,duthvik/sputnik_100_44_custom_tasks,duthvik/sputnik_100_51_kitchen_tasks,duthvik/sputnik_100_52_kitchen_tasks,duthvik/sputnik_100_53_kitchen_tasks,duthvik/sputnik_100_45_custom_tasks,duthvik/sputnik_100_32_pour_liquid,duthvik/sputnik_100_29_pick_place_surface,duthvik/sputnik_100_18_pick_place_container,sixpigs1/so100_pull_cube_by_tool_error,sixpigs1/so100_insert_cylinder_error,abokinala/sputnik_100_54_kitchen_tasks,abokinala/sputnik_100_55_kitchen_tasks,m1b/so100_bluelego,abokinala/sputnik_100_46_custom_tasks,m1b/so100_bluelego_updt,kantine/flip_A0,kantine/flip_A1,kantine/flip_A2,kantine/flip_A3,lirislab/guess_who_no_cond,kantine/flip_A4,kantine/flip_A5,lirislab/guess_who_lighting,nguyen-v/so100_press_red_button,nguyen-v/so100_bimanual_grab_lemon_put_in_box2,pierfabre/cow,nguyen-v/press_red_button_new,nguyen-v/so100_rotate_red_button,raghav-katta-1/lerobot2,Cidoyi/so100_all_notes,roboticshack/team10-red-block,Cidoyi/so100_all_notes_1,roboticshack/team_5-QuiEstCe_everyBox,roboticshack/team11_pianobot,roboticshack/team2-guess_who_so100,roboticshack/team2-guess_who_so100_light,roboticshack/team2-guess_who_so100_edge_case,roboticshack/team2-guess_who_less_ligth,Cidoyi/so100_all_notes_3,dsfsg/grasp_pen_and_bottle,abokinala/sputnik_100_60_kitchen_tasks,abokinala/sputnik_100_58_kitchen_tasks,danaaubakirova/so100_v2_task_1,danaaubakirova/so100_v2_task_2,danaaubakirova/so100_v2_task_3,danaaubakirova/so100_v2_task_4,zijian2022/force1,zijian2022/force2,zijian2022/force3,jiajun001/eraser00_3,zijian2022/bi2,zijian2022/bi1,zijian2022/hand1,Setchii/so100_grab_ball,MossProphet/so100_square-1-2-3.2
+# # SAMPLING_WEIGHTS=
+# # DATASET_NAME=so100_community_v3
+
+# ##########################
+
+# ROBOT=so100
+# export TOKENIZERS_PARALLELISM=false
+# export MUJOCO_GL=egl
+
+
+
+# SAMPLING_WEIGHTS=
+# FEATURES_VERSION=2
+# NUM_IMAGE_TRANSFORMS=10
+# TRAIN_ON_ALL_FEATURES=true
+# NORM_PER_ROBOT=true
+# USE_IMAGENET_STATS=false
+
+# MAX_STATE_DIM=6
+# MAX_ACTION_DIM=6
+# MAX_NUM_IMAGES=3
+# MAX_IMAGE_DIM=256
+
+
+# SEED=5000
+# BATCH_SIZE=32
+# # EVAL_STEPS=1000
+# EVAL_STEPS=100
+
+
+
+
+
+# SELF_ATTN_ONLY_ACTIONS=false
+# EXPERT_WIDTH_MULTIPLIER=0.75
+# PAST_OBS_KEYS="image"
+# N_OBS_STEPS=1
+# NUM_EXPERT_LAYERS=0
+# CHUNK_SIZE=50
+# NUM_VLM_LAYERS=16
+# PAD_LANG_TO=longest
+# EVAL_CKPT=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/lerobot/lerobot_so100_community_v1_v2_smolpi0_lr1e-4bs64steps200000gpus4freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_feat2_lrvlm1e-4_droptrue_max_length/checkpoints/080000/pretrained_model/
+# ADD_IMAGE_TOKENS=true
+# ATTN_MODE=cross_attn
+# STATE_TO_PREFIX=true
+# CAUSAL_ACTION_ATTENTION_MASK=true
+# SELF_ATTN_EVERY_N_LAYERS=2
+# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
+
+
+# python lerobot/scripts/offline_inference.py \
+#     --output_dir=$WORK/logs/lerobot/tmp \
+#     --batch_size=$BATCH_SIZE \
+#     --seed=$SEED \
+#     --eval_steps=$EVAL_STEPS \
+#     --use_amp=false \
+#     --device=cuda \
+#     --dataset.repo_id=$REPO_ID --dataset.local_files_only=true --dataset.sampling_weights=$SAMPLING_WEIGHTS --dataset.use_imagenet_stats=$USE_IMAGENET_STATS --policy.normalize_per_robot_type=$NORM_PER_ROBOT \
+#     --dataset.image_transforms.max_num_transforms=$NUM_IMAGE_TRANSFORMS --dataset.image_transforms.enable=true --dataset.train_on_all_features=$TRAIN_ON_ALL_FEATURES \
+#     --dataset.max_action_dim=$MAX_ACTION_DIM --dataset.max_state_dim=$MAX_STATE_DIM --dataset.max_num_images=$MAX_NUM_IMAGES --dataset.max_image_dim=$MAX_IMAGE_DIM --dataset.features_version=$FEATURES_VERSION \
+#     --policy.type=$POLICY \
+#     --policy.checkpoint_path=$EVAL_CKPT \
+#     --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \
+#     --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
+#     --policy.attention_mode=$ATTN_MODE \
+#     --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \
+#     --policy.state_to_prefix=$STATE_TO_PREFIX \
+#     --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
+#     --policy.vlm_model_name=$VLM_NAME \
+#     --policy.pad_language_to=$PAD_LANG_TO \
+#     --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
+#     --policy.num_vlm_layers=$NUM_VLM_LAYERS \
+#     --policy.chunk_size=$CHUNK_SIZE \
+#     --policy.n_obs_steps=$N_OBS_STEPS \
+#     --policy.past_obs_keys=$PAST_OBS_KEYS \
+#     --policy.num_expert_layers=$NUM_EXPERT_LAYERS \
+#     --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
+#     --policy.peft_method=$PEFT_METHOD \
+#     --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \
+#     --policy.robot_type=$ROBOT 
+
+
+
+
+# MULTITASK_EVAL=true
+# N_EPISODES=5
+# MAX_PARRALLEL_TASKS=1
+# ACTION_STEPS_LIST=(1)
+# TASK_LIST=(libero_10)
+# for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do
+#     for TASK in "${TASK_LIST[@]}"; do
+#         echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS"
+#         python lerobot/scripts/eval.py \
+#             --output_dir=$WORK/logs/lerobot/tmp \
+#             --env.type=$ENV \
+#             --env.task=$TASK \
+#             --eval.batch_size=$N_EPISODES \
+#             --eval.n_episodes=$N_EPISODES \
+#             --use_amp=false \
+#             --device=cuda \
+#             --policy.n_action_steps=$N_ACTION_STEPS \
+#             --policy.type=$POLICY \
+#             --policy.checkpoint_path=$EVAL_CKPT \
+#             --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \
+#             --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \
+#             --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
+#             --policy.attention_mode=$ATTN_MODE \
+#             --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \
+#             --policy.state_to_prefix=$STATE_TO_PREFIX \
+#             --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
+#             --policy.vlm_model_name=$VLM_NAME \
+#             --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
+# --policy.num_vlm_layers=$NUM_VLM_LAYERS \
+# --policy.chunk_size=$CHUNK_SIZE 
+
+#         echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS"
+#         echo "------------------------------------------------------"
+#     done
+# done
+
diff --git a/src/lerobot/envs/configs.py b/src/lerobot/envs/configs.py
index 5c648de70..e965446db 100644
--- a/src/lerobot/envs/configs.py
+++ b/src/lerobot/envs/configs.py
@@ -320,8 +320,6 @@ class LiberoEnv(EnvConfig):
     @property
     def gym_kwargs(self) -> dict:
         return {
-            # "task": self.task,
             "obs_type": self.obs_type,
             "render_mode": self.render_mode,
-            # "max_episode_steps": self.episode_length,
         }
diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py
index be49a9990..211b41714 100644
--- a/src/lerobot/envs/factory.py
+++ b/src/lerobot/envs/factory.py
@@ -56,37 +56,36 @@ def make_env(
             names to indexed vectorized environments (when multitask eval is used).
 
     """
-    if n_envs < 1:
-        raise ValueError("`n_envs must be at least 1")
+        if n_envs < 1:
+            raise ValueError("`n_envs` must be at least 1")
 
-    # batched version of the env that returns an observation of shape (b, c)
-    env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv
+        env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv
 
-    if "libero" in cfg.type:
-        from lerobot.envs.libero import create_libero_envs
+    
+        if "libero" in cfg.type:
+            from lerobot.envs.libero import create_libero_envs
+            return create_libero_envs(
+                task=cfg.task,
+                n_envs=n_envs,
+                camera_name=cfg.camera_name,
+                init_states=cfg.init_states,
+                gym_kwargs=cfg.gym_kwargs,
+                env_cls=env_cls,
+                multitask_eval=cfg.multitask_eval,
+            )
 
-        env = create_libero_envs(
-            task=cfg.task,
-            n_envs=n_envs,
-            camera_name=cfg.camera_name,
-            init_states=cfg.init_states,
-            gym_kwargs=cfg.gym_kwargs,
-            env_cls=env_cls,
-            multitask_eval=cfg.multitask_eval,
-        )
-    else:
+        
         package_name = f"gym_{cfg.type}"
         try:
             importlib.import_module(package_name)
         except ModuleNotFoundError as e:
-            print(
-                f"{package_name} is not installed. Please install it with `pip install 'lerobot[{cfg.type}]'`"
-            )
-            raise e
+            raise ModuleNotFoundError(
+                f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\""
+            ) from e
 
         gym_handle = f"{package_name}/{cfg.task}"
-        env = env_cls(
-            [lambda: gym.make(gym_handle, disable_env_checker=True, **cfg.gym_kwargs) for _ in range(n_envs)]
-        )
+        
+        def _make_one():
+            return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {}))
 
-    return env
+        return env_cls([_make_one for _ in range(n_envs)])
diff --git a/src/lerobot/envs/libero copy.py b/src/lerobot/envs/libero copy.py
new file mode 100644
index 000000000..83ccd2fb9
--- /dev/null
+++ b/src/lerobot/envs/libero copy.py	
@@ -0,0 +1,326 @@
+import math
+import os
+from collections import defaultdict
+from collections.abc import Callable
+from itertools import chain
+from typing import Any
+
+import gymnasium as gym
+import numpy as np
+import torch
+from gymnasium import spaces
+from libero.libero import benchmark, get_libero_path
+from libero.libero.envs import OffScreenRenderEnv
+
+
+def create_libero_envs(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] = None,
+    camera_name: str = "agentview_image,robot0_eye_in_hand_image",
+    init_states: bool = True,
+    env_cls: Callable = None,
+    multitask_eval: bool = True,
+) -> dict[str, dict[str, Any]]:
+    """
+    Here n_envs is per task and equal to the number of rollouts.
+    Returns:
+        dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs.
+    """
+    print("num envs", n_envs)
+    print("multitask_eval", multitask_eval)
+    print("gym_kwargs", gym_kwargs)
+    if gym_kwargs is None:
+        gym_kwargs = {}
+
+    if not multitask_eval:
+        benchmark_dict = benchmark.get_benchmark_dict()
+        task_suite = benchmark_dict[task]()  # can also choose libero_spatial, libero_object, libero_10 etc.
+        tasks_id = list(range(len(task_suite.tasks)))
+        episode_indices = [0 for i in range(len(tasks_id))]
+        if len(tasks_id) == 1:
+            tasks_id = [tasks_id[0] for _ in range(n_envs)]
+            episode_indices = list(range(n_envs))
+        elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0:
+            n_repeat = n_envs // len(tasks_id)
+            print("n_repeat", n_repeat)
+            episode_indices = []
+            for _ in range(len(tasks_id)):
+                episode_indices.extend(list(range(n_repeat)))
+            tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id]))
+        elif n_envs < len(tasks_id):
+            tasks_id = tasks_id[:n_envs]
+            episode_indices = list(range(n_envs))[:n_envs]
+            print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}")
+        print(f"Creating Libero envs with task ids {tasks_id} from suite {task}")
+        assert n_envs == len(tasks_id), (
+            f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}"
+        )
+        return env_cls(
+            [
+                lambda i=i: LiberoEnv(
+                    task_suite=task_suite,
+                    task_id=tasks_id[i],
+                    task_suite_name=task,
+                    camera_name=camera_name,
+                    init_states=init_states,
+                    episode_index=episode_indices[i],
+                    **gym_kwargs,
+                )
+                for i in range(n_envs)
+            ]
+        )
+    else:
+        envs = defaultdict(dict)
+        benchmark_dict = benchmark.get_benchmark_dict()
+        task = task.split(",")
+        for _task in task:
+            task_suite = benchmark_dict[
+                _task
+            ]()  # can also choose libero_spatial, libero_object, libero_10 etc.
+            tasks_ids = list(range(len(task_suite.tasks)))
+            for tasks_id in tasks_ids:
+                episode_indices = list(range(n_envs))
+                print(
+                    f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}"
+                )
+                envs_list = [
+                    (
+                        lambda i=i,
+                        task_suite=task_suite,
+                        tasks_id=tasks_id,
+                        _task=_task,
+                        episode_indices=episode_indices: LiberoEnv(
+                            task_suite=task_suite,
+                            task_id=tasks_id,
+                            task_suite_name=_task,
+                            camera_name=camera_name,
+                            init_states=init_states,
+                            episode_index=episode_indices[i],
+                            **gym_kwargs,
+                        )
+                    )
+                    for i in range(n_envs)
+                ]
+                envs[_task][tasks_id] = env_cls(envs_list)
+        return envs
+
+
+def quat2axisangle(quat):
+    """
+    Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
+
+    Converts quaternion to axis-angle format.
+    Returns a unit vector direction scaled by its angle in radians.
+
+    Args:
+        quat (np.array): (x,y,z,w) vec4 float angles
+
+    Returns:
+        np.array: (ax,ay,az) axis-angle exponential coordinates
+    """
+    # clip quaternion
+    if quat[3] > 1.0:
+        quat[3] = 1.0
+    elif quat[3] < -1.0:
+        quat[3] = -1.0
+
+    den = np.sqrt(1.0 - quat[3] * quat[3])
+    if math.isclose(den, 0.0):
+        # This is (close to) a zero degree rotation, immediately return
+        return np.zeros(3)
+
+    return (quat[:3] * 2.0 * math.acos(quat[3])) / den
+
+
+def get_task_init_states(task_suite, i):
+    init_states_path = os.path.join(
+        get_libero_path("init_states"),
+        task_suite.tasks[i].problem_folder,
+        task_suite.tasks[i].init_states_file,
+    )
+    init_states = torch.load(init_states_path, weights_only=False)  # nosec B614
+    return init_states
+
+
+def get_libero_dummy_action():
+    """Get dummy/no-op action, used to roll out the simulation while the robot does nothing."""
+    return [0, 0, 0, 0, 0, 0, -1]
+
+
+OBS_STATE_DIM = 8
+ACTION_DIM = 7
+
+
+class LiberoEnv(gym.Env):
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 80}
+
+    def __init__(
+        self,
+        task_suite,
+        task_id,
+        task_suite_name,
+        camera_name="agentview_image,robot0_eye_in_hand_image",
+        obs_type="pixels",
+        render_mode="rgb_array",
+        observation_width=256,
+        observation_height=256,
+        visualization_width=640,
+        visualization_height=480,
+        init_states=True,
+        episode_index=0,
+    ):
+        super().__init__()
+        self.task_id = task_id
+        self.obs_type = obs_type
+        self.render_mode = render_mode
+        self.observation_width = observation_width
+        self.observation_height = observation_height
+        self.visualization_width = visualization_width
+        self.visualization_height = visualization_height
+        self.init_states = init_states
+        self.camera_name = camera_name.split(
+            ","
+        )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)
+
+        # Map raw camera names to "image1" and "image2".
+        # The preprocessing step `preprocess_observation` will then prefix these with `.images.*`,
+        # following the LeRobot convention (e.g., `observation.images.image`, `observation.images.image2`).
+        # This ensures the policy consistently receives observations in the
+        # expected format regardless of the original camera naming.
+        self.camera_name_mapping = {
+            "agentview_image": "image",
+            "robot0_eye_in_hand_image": "image2",
+        }
+
+        self.num_steps_wait = (
+            10  # Do nothing for the first few timesteps to wait for the simulator drops objects
+        )
+        self.episode_index = episode_index
+
+        self._env = self._make_envs_task(task_suite, self.task_id)
+        if task_suite_name == "libero_spatial":
+            max_steps = 220  # longest training demo has 193 steps
+        elif task_suite_name == "libero_object":
+            max_steps = 280  # longest training demo has 254 steps
+        elif task_suite_name == "libero_goal":
+            max_steps = 300  # longest training demo has 270 steps
+        elif task_suite_name == "libero_10":
+            max_steps = 520  # longest training demo has 505 steps
+        elif task_suite_name == "libero_90":
+            max_steps = 400  # longest training demo has 373 steps
+        self._max_episode_steps = max_steps
+
+        images = {}
+        for cam in self.camera_name:
+            images[self.camera_name_mapping[cam]] = spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+
+        if self.obs_type == "state":
+            raise NotImplementedError()
+        elif self.obs_type == "pixels":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                }
+            )
+        elif self.obs_type == "pixels_agent_pos":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                    "agent_pos": spaces.Box(
+                        low=-1000.0,
+                        high=1000.0,
+                        shape=(OBS_STATE_DIM,),
+                        dtype=np.float64,
+                    ),
+                }
+            )
+
+        self.action_space = spaces.Box(low=-1, high=1, shape=(ACTION_DIM,), dtype=np.float32)
+
+    def render(self):
+        raw_obs = self._env.env._get_observations()
+        image = self._format_raw_obs(raw_obs)["pixels"]["image"]
+        return image
+
+    def _make_envs_task(self, task_suite, task_id: int = 0):
+        task = task_suite.get_task(task_id)
+        self.task = task.name
+        self.task_description = task.language
+        task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
+
+        env_args = {
+            "bddl_file_name": task_bddl_file,
+            "camera_heights": self.observation_height,
+            "camera_widths": self.observation_width,
+        }
+        env = OffScreenRenderEnv(**env_args)
+        env.reset()
+        if self.init_states:
+            init_states = get_task_init_states(
+                task_suite, task_id
+            )  # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()?
+            init_state_id = self.episode_index  # episode index
+            env.set_init_state(init_states[init_state_id])
+
+        return env
+
+    def _format_raw_obs(self, raw_obs):
+        images = {}
+        for camera_name in self.camera_name:
+            image = raw_obs[camera_name]
+            image = image[::-1, ::-1]  # rotate 180 degrees
+            images[self.camera_name_mapping[camera_name]] = image
+        state = np.concatenate(
+            (
+                raw_obs["robot0_eef_pos"],
+                quat2axisangle(raw_obs["robot0_eef_quat"]),
+                raw_obs["robot0_gripper_qpos"],
+            )
+        )
+        agent_pos = state
+        if self.obs_type == "state":
+            raise NotImplementedError()
+        elif self.obs_type == "pixels":
+            obs = {"pixels": images.copy()}
+        elif self.obs_type == "pixels_agent_pos":
+            obs = {
+                "pixels": images.copy(),
+                "agent_pos": agent_pos,
+            }
+        return obs
+
+    def reset(self, seed=None, **kwargs):
+        super().reset(seed=seed)
+
+        self._env.seed(seed)
+        raw_obs = self._env.reset()
+        # Do nothing for the first few timesteps to wait for the simulator drops objects
+        for _ in range(self.num_steps_wait):
+            raw_obs, _, _, _ = self._env.step(get_libero_dummy_action())
+        observation = self._format_raw_obs(raw_obs)
+        info = {"is_success": False}
+        return observation, info
+
+    def step(self, action):
+        assert action.ndim == 1
+        raw_obs, reward, done, info = self._env.step(action)
+
+        is_success = self._env.check_success()
+        terminated = done or is_success
+        info["is_success"] = done  # is_success
+
+        observation = self._format_raw_obs(raw_obs)
+        if done:
+            self.reset()
+            print(self.task, self.task_id, done, is_success)
+        truncated = False
+        return observation, reward, terminated, truncated, info
+
+    def close(self):
+        self._env.close()
diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py
index f815228e7..83ccd2fb9 100644
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -245,9 +245,8 @@ class LiberoEnv(gym.Env):
 
     def render(self):
         raw_obs = self._env.env._get_observations()
-        formatted = self._format_raw_obs(raw_obs)
-        # grab the "main" camera
-        return formatted["pixels"]["image"]
+        image = self._format_raw_obs(raw_obs)["pixels"]["image"]
+        return image
 
     def _make_envs_task(self, task_suite, task_id: int = 0):
         task = task_suite.get_task(task_id)
@@ -277,7 +276,6 @@ class LiberoEnv(gym.Env):
             image = raw_obs[camera_name]
             image = image[::-1, ::-1]  # rotate 180 degrees
             images[self.camera_name_mapping[camera_name]] = image
-        # images = image if len(images) == 1 else images
         state = np.concatenate(
             (
                 raw_obs["robot0_eef_pos"],
@@ -311,14 +309,17 @@ class LiberoEnv(gym.Env):
 
     def step(self, action):
         assert action.ndim == 1
-        action[-1] = 1.0 - action[-1]
         raw_obs, reward, done, info = self._env.step(action)
+
         is_success = self._env.check_success()
         terminated = done or is_success
-        info["is_success"] = is_success
+        info["is_success"] = done  # is_success
+
         observation = self._format_raw_obs(raw_obs)
+        if done:
+            self.reset()
+            print(self.task, self.task_id, done, is_success)
         truncated = False
-        # note if it is unable to complete get libero error after many steps
         return observation, reward, terminated, truncated, info
 
     def close(self):
diff --git a/src/lerobot/envs/libero2.py b/src/lerobot/envs/libero2.py
new file mode 100644
index 000000000..1e794072c
--- /dev/null
+++ b/src/lerobot/envs/libero2.py
@@ -0,0 +1,308 @@
+import math
+import os
+from collections import defaultdict
+from itertools import chain
+from typing import Any, Callable
+
+import gymnasium as gym
+import numpy as np
+import torch
+from gymnasium import spaces
+from libero.libero import benchmark, get_libero_path
+from libero.libero.envs import OffScreenRenderEnv
+
+
+OBS_IMAGE = "observation.image"
+OBS_IMAGE_2 = "observation.image2"
+def create_libero_envs(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] = None,
+    camera_name: str = "agentview_image,robot0_eye_in_hand_image",
+    init_states: bool = True,
+    env_cls: Callable = None,
+    multitask_eval: bool = True,
+) -> dict[str, dict[str, Any]]:
+    """
+    Here n_envs is per task and equal to the number of rollouts.
+    Returns:
+        dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs.
+    """
+    if gym_kwargs is None:
+        gym_kwargs = {}
+
+    if not multitask_eval:
+        benchmark_dict = benchmark.get_benchmark_dict()
+        task_suite = benchmark_dict[task]()  # can also choose libero_spatial, libero_object, libero_10 etc.
+        tasks_id = list(range(len(task_suite.tasks)))
+        episode_indices = [0 for i in range(len(tasks_id))]
+        if len(tasks_id) == 1:
+            tasks_id = [tasks_id[0] for _ in range(n_envs)]
+            episode_indices = list(range(n_envs))
+        elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0:
+            n_repeat = n_envs // len(tasks_id)
+            episode_indices = []
+            for i in range(len(tasks_id)):
+                episode_indices.extend(list(range(n_repeat)))
+            tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id]))
+        elif n_envs < len(tasks_id):
+            tasks_id = tasks_id[:n_envs]
+            episode_indices = list(range(n_envs))[:n_envs]
+            print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}")
+        print(f"Creating Libero envs with task ids {tasks_id} from suite {task}")
+        assert n_envs == len(
+            tasks_id
+        ), f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}"
+        return env_cls(
+            [
+                lambda i=i: LiberoEnv(
+                    task_suite=task_suite,
+                    task_id=tasks_id[i],
+                    task_suite_name=task,
+                    camera_name=camera_name,
+                    init_states=init_states,
+                    episode_index=episode_indices[i],
+                    **gym_kwargs,
+                )
+                for i in range(n_envs)
+            ]
+        )
+    else:
+        envs = defaultdict(dict)
+        benchmark_dict = benchmark.get_benchmark_dict()
+        task = task.split(",")
+        for _task in task:
+            task_suite = benchmark_dict[
+                _task
+            ]()  # can also choose libero_spatial, libero_object, libero_10 etc.
+            tasks_ids = list(range(len(task_suite.tasks)))
+            # tasks_ids = [0] # FIXME(mshukor): debug
+            for tasks_id in tasks_ids:
+                episode_indices = list(range(n_envs))
+                print(
+                    f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}"
+                )
+                envs_list = [
+                    lambda i=i: LiberoEnv(
+                        task_suite=task_suite,
+                        task_id=tasks_id,
+                        task_suite_name=_task,
+                        camera_name=camera_name,
+                        init_states=init_states,
+                        episode_index=episode_indices[i],
+                        **gym_kwargs,
+                    )
+                    for i in range(n_envs)
+                ]
+                envs[_task][tasks_id] = env_cls(envs_list)
+        return envs
+
+
+def quat2axisangle(quat):
+    """
+    Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
+
+    Converts quaternion to axis-angle format.
+    Returns a unit vector direction scaled by its angle in radians.
+
+    Args:
+        quat (np.array): (x,y,z,w) vec4 float angles
+
+    Returns:
+        np.array: (ax,ay,az) axis-angle exponential coordinates
+    """
+    # clip quaternion
+    if quat[3] > 1.0:
+        quat[3] = 1.0
+    elif quat[3] < -1.0:
+        quat[3] = -1.0
+
+    den = np.sqrt(1.0 - quat[3] * quat[3])
+    if math.isclose(den, 0.0):
+        # This is (close to) a zero degree rotation, immediately return
+        return np.zeros(3)
+
+    return (quat[:3] * 2.0 * math.acos(quat[3])) / den
+
+
+def get_task_init_states(task_suite, i):
+    init_states_path = os.path.join(
+        get_libero_path("init_states"),
+        task_suite.tasks[i].problem_folder,
+        task_suite.tasks[i].init_states_file,
+    )
+    init_states = torch.load(init_states_path, weights_only=False)
+    return init_states
+
+
+def get_libero_dummy_action():
+    """Get dummy/no-op action, used to roll out the simulation while the robot does nothing."""
+    return [0, 0, 0, 0, 0, 0, -1]
+
+
+class LiberoEnv(gym.Env):
+    metadata = {"render_modes": ["rgb_array"], "render_fps": 80}
+
+    def __init__(
+        self,
+        task_suite,
+        task_id,
+        task_suite_name,
+        camera_name="agentview_image,robot0_eye_in_hand_image",
+        obs_type="pixels",
+        render_mode="rgb_array",
+        observation_width=256,
+        observation_height=256,
+        visualization_width=640,
+        visualization_height=480,
+        init_states=True,
+        episode_index=0,
+    ):
+        super().__init__()
+        self.task_id = task_id
+        self.obs_type = obs_type
+        self.render_mode = render_mode
+        self.observation_width = observation_width
+        self.observation_height = observation_height
+        self.visualization_width = visualization_width
+        self.visualization_height = visualization_height
+        self.init_states = init_states
+        self.camera_name = camera_name.split(
+            ","
+        )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)
+        self.camera_name_mapping = {
+            "agentview_image": OBS_IMAGE,
+            "robot0_eye_in_hand_image": OBS_IMAGE_2,
+        }
+        self.num_steps_wait = (
+            10  # Do nothing for the first few timesteps to wait for the simulator drops objects
+        )
+        self.episode_index = episode_index
+
+        self._env = self._make_envs_task(task_suite, self.task_id)
+        if task_suite_name == "libero_spatial":
+            max_steps = 220  # longest training demo has 193 steps
+        elif task_suite_name == "libero_object":
+            max_steps = 280  # longest training demo has 254 steps
+        elif task_suite_name == "libero_goal":
+            max_steps = 300  # longest training demo has 270 steps
+        elif task_suite_name == "libero_10":
+            max_steps = 520  # longest training demo has 505 steps
+        elif task_suite_name == "libero_90":
+            max_steps = 400  # longest training demo has 373 steps
+        self._max_episode_steps = max_steps
+
+        images = {}
+        for cam in self.camera_name:
+            images[self.camera_name_mapping[cam]] = spaces.Box(
+                low=0,
+                high=255,
+                shape=(self.observation_height, self.observation_width, 3),
+                dtype=np.uint8,
+            )
+
+        if self.obs_type == "state":
+            raise NotImplementedError()
+        elif self.obs_type == "pixels":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                }
+            )
+        elif self.obs_type == "pixels_agent_pos":
+            self.observation_space = spaces.Dict(
+                {
+                    "pixels": spaces.Dict(images),
+                    "agent_pos": spaces.Box(
+                        low=-1000.0,
+                        high=1000.0,
+                        shape=(8,),
+                        dtype=np.float64,
+                    ),
+                }
+            )
+
+        self.action_space = spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32)
+
+    def render(self):
+        raw_obs = self._env.env._get_observations()
+        image = self._format_raw_obs(raw_obs)["pixels"][OBS_IMAGE]
+        return image
+
+    def _make_envs_task(self, task_suite, task_id: int = 0):
+        task = task_suite.get_task(task_id)
+        self.task = task.name
+        self.task_description = task.language
+        task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
+
+        env_args = {
+            "bddl_file_name": task_bddl_file,
+            "camera_heights": self.observation_height,
+            "camera_widths": self.observation_width,
+        }
+        env = OffScreenRenderEnv(**env_args)
+        env.reset()
+        if self.init_states:
+            init_states = get_task_init_states(
+                task_suite, task_id
+            )  # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()?
+            init_state_id = self.episode_index  # episode index
+            env.set_init_state(init_states[init_state_id])
+
+        return env
+
+    def _format_raw_obs(self, raw_obs):
+        images = {}
+        for camera_name in self.camera_name:
+            image = raw_obs[camera_name]
+            image = image[::-1, ::-1]  # rotate 180 degrees
+            images[self.camera_name_mapping[camera_name]] = image
+        # images = image if len(images) == 1 else images
+        state = np.concatenate(
+            (
+                raw_obs["robot0_eef_pos"],
+                quat2axisangle(raw_obs["robot0_eef_quat"]),
+                raw_obs["robot0_gripper_qpos"],
+            )
+        )
+        agent_pos = state
+        if self.obs_type == "state":
+            raise NotImplementedError()
+        elif self.obs_type == "pixels":
+            obs = {"pixels": images.copy()}
+        elif self.obs_type == "pixels_agent_pos":
+            obs = {
+                "pixels": images.copy(),
+                "agent_pos": agent_pos,
+            }
+        return obs
+
+    def reset(self, seed=None, **kwargs):
+        super().reset(seed=seed)
+
+        self._env.seed(seed)
+        raw_obs = self._env.reset()
+        # Do nothing for the first few timesteps to wait for the simulator drops objects
+        for _ in range(self.num_steps_wait):
+            raw_obs, _, _, _ = self._env.step(get_libero_dummy_action())
+        observation = self._format_raw_obs(raw_obs)
+        info = {"is_success": False}
+        return observation, info
+
+    def step(self, action):
+        assert action.ndim == 1
+        raw_obs, reward, done, info = self._env.step(action)
+
+        is_success = self._env.check_success()
+        terminated = done or is_success
+        info["is_success"] = done  # is_success
+
+        observation = self._format_raw_obs(raw_obs)
+        if done:
+            self.reset()
+            print(self.task, self.task_id, done, is_success)
+        truncated = False
+        return observation, reward, terminated, truncated, info
+
+    def close(self):
+        self._env.close()
diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py
index 2cf9efcfe..5ae252dbe 100644
--- a/src/lerobot/envs/utils.py
+++ b/src/lerobot/envs/utils.py
@@ -80,7 +80,56 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten
 
     return return_observations
 
+def preprocess_observation1(
+    observations: dict[str, np.ndarray], cfg: dict[str, Any] = None
+) -> dict[str, Tensor]:
+    # TODO(aliberts, rcadene): refactor this to use features from the environment (no hardcoding)
+    """Convert environment observation to LeRobot format observation.
+    Args:
+        observation: Dictionary of observation batches from a Gym vector environment.
+    Returns:
+        Dictionary of observation batches with keys renamed to LeRobot format and values as tensors.
+    """
+    # map to expected inputs for the policy
+    return_observations = {}
+    image_key = list(cfg.image_features.keys())[0] if cfg else "observation.image"
+    state_key = cfg.robot_state_feature_key if cfg else "observation.state"
+    if "pixels" in observations:
+        if isinstance(observations["pixels"], dict):
+            # imgs = {f"{image_key}.{key}": img for key, img in observations["pixels"].items()}
+            imgs = observations["pixels"]  # keys should be OBS_IMAGE, OBS_IMAGE_2, OBS_IMAGE_3
+        else:
+            imgs = {f"{image_key}": observations["pixels"]}
 
+        for imgkey, img in imgs.items():
+            # TODO(aliberts, rcadene): use transforms.ToTensor()?
+            img = torch.from_numpy(img)
+
+            # sanity check that images are channel last
+            _, h, w, c = img.shape
+            assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}"
+
+            # sanity check that images are uint8
+            assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}"
+
+            # convert to channel first of type float32 in range [0,1]
+            img = einops.rearrange(img, "b h w c -> b c h w").contiguous()
+            img = img.type(torch.float32)
+            img /= 255
+
+            return_observations[imgkey] = img
+
+    if "environment_state" in observations:
+        return_observations["observation.environment_state"] = torch.from_numpy(
+            observations["environment_state"]
+        ).float()
+
+    # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing
+    # requirement for "agent_pos"
+    return_observations[state_key] = torch.from_numpy(observations["agent_pos"]).float()
+    if "task" in observations:
+        return_observations["task"] = observations["task"]
+    return return_observations
 def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
     # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is
     # (need to also refactor preprocess_observation and externalize normalization from policies)
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index 79461d3a9..b7d92d988 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -177,6 +177,6 @@ def make_policy(
         policy = policy_cls(**kwargs)
     policy.to(cfg.device)
     assert isinstance(policy, nn.Module)
+    breakpoint()
     # policy = torch.compile(policy, mode="reduce-overhead")
-
     return policy
diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py
index fa2d3d5a7..9a128f7b6 100644
--- a/src/lerobot/policies/smolpi0/modeling_smolpi0.py
+++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py
@@ -51,7 +51,9 @@ policy = Pi0Policy.from_pretrained("lerobot/pi0")
 
 import math
 from collections import deque
-
+import os
+import re
+import safetensors
 import torch
 import torch.nn.functional as F  # noqa: N812
 from torch import Tensor, nn
@@ -169,7 +171,72 @@ def resize_with_pad(img, width, height, pad_value=-1):
     padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
     return padded_img
 
+_VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
+def canonicalise(k: str) -> str:
+    """
+    Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a
+    normalisation-buffer key.
+    """
+    return _VARIANT_RE.sub(".buffer_", k)
 
+def standardise_state_dict(
+    checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True
+) -> tuple[dict[str, torch.Tensor], list[str]]:
+    """
+    • Re-keys `checkpoint ` so that every entry matches the *reference* key set.
+    • If several variant keys collapse to the same canonical name we keep the
+      first one and log the collision.
+    • Returns the new dict + a list of entries that could not be matched.
+    """
+    out, collisions, unmatched = {}, {}, []
+
+    for k, v in checkpoint.items():
+        canon = canonicalise(k)
+        if canon in ref_keys:
+            if canon in out:  # duplicate after collapsing
+                collisions.setdefault(canon, []).append(k)
+            else:
+                out[canon] = v
+        else:
+            unmatched.append(k)
+
+    if verbose:
+        for canon, variants in collisions.items():
+            print(f"[standardise_state_dict] '{canon}'  ←  {variants}")
+        if unmatched:
+            print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys")
+
+    out.update({k: checkpoint[k] for k in unmatched})
+    return out, unmatched
+
+def load_smolvla(
+    model: torch.nn.Module,
+    filename: str | os.PathLike,
+    *,
+    device: str = "cpu",
+    checkpoint_keys_mapping: str = "",
+) -> torch.nn.Module:
+    state_dict = safetensors.torch.load_file(filename, device=device)
+
+    # Optional user-supplied renames (e.g. "model._orig_mod.//model.")
+    if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping:
+        state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping)
+
+    state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys()))
+
+    # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset
+    norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs")
+    state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)}
+
+    missing, unexpected = model.load_state_dict(state_dict, strict=False)
+    if not all(key.startswith(norm_keys) for key in missing) or unexpected:
+        raise RuntimeError(
+            "SmolVLA %d missing / %d unexpected keys",
+            len(missing),
+            len(unexpected),
+        )
+
+    return model
 def pad_vector(vector, new_dim):
     """Can be (batch_size x sequence_length x features_dimension)
     or (batch_size x features_dimension)
@@ -219,7 +286,27 @@ def aloha_gripper_to_angular(value):
     # The values 0.4 and 1.5 were measured on an actual Trossen robot.
     return normalize(value, min_val=0.4, max_val=1.5)
 
+def rename_checkpoint_keys(checkpoint: dict, rename_str: str):
+    """
+    Renames keys in a checkpoint dictionary based on the given rename string.
 
+    Args:
+        checkpoint (dict): The checkpoint dictionary.
+        rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2".
+
+    Returns:
+        dict: The modified checkpoint with renamed keys.
+    """
+
+    rename_dict = dict(pair.split("//") for pair in rename_str.split(","))
+
+    new_checkpoint = {}
+    for k, v in checkpoint.items():
+        for old_key, new_key in rename_dict.items():
+            if old_key in k:
+                k = k.replace(old_key, new_key)
+        new_checkpoint[k] = v
+    return new_checkpoint
 def aloha_gripper_from_angular(value):
     # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha.
     # Note that the units are still angular but the range is different.
@@ -333,7 +420,7 @@ class SMOLPI0Policy(PreTrainedPolicy):
             self.model.vlm_with_expert.merge_lora_weights()
 
     @torch.no_grad
-    def select_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
+    def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
         """Select a single action given environment observations.
 
         This method wraps `select_actions` in order to return one action at a time for execution in the
@@ -364,7 +451,24 @@ class SMOLPI0Policy(PreTrainedPolicy):
             actions = self._pi_aloha_encode_actions(actions)
 
         return actions
-    
+
+    # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues
+    @classmethod
+    def _load_as_safetensor(
+        cls,
+        model: "SmolVLAPolicy",
+        model_file: str,
+        map_location: str,
+        strict: bool,
+        **kwargs,
+    ):
+        safetensors.torch.load_model(model, model_file, strict=strict, device=map_location)
+        return load_smolvla(
+            model,
+            model_file,
+            device=map_location,
+            checkpoint_keys_mapping="model._orig_mod.//model.",
+        )
     @torch.no_grad
     def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
         """Select a single action given environment observations.
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py
index 9b7e3520a..1c07d98e5 100644
--- a/src/lerobot/policies/smolvla/modeling_smolvla.py
+++ b/src/lerobot/policies/smolvla/modeling_smolvla.py
@@ -1027,7 +1027,7 @@ from lerobot.policies.utils import (
     populate_queues,
 )
 from lerobot.utils.utils import get_safe_dtype
-
+# OBS_STATE = 'state'
 # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
 _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
 
@@ -1347,6 +1347,7 @@ class SmolVLAPolicy(PreTrainedPolicy):
 
         # Unpad actions
         original_action_dim = self.config.action_feature.shape[0]
+        original_action_dim = 7
         actions = actions[:, :, :original_action_dim]
 
         actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 92a3bf833..6fbae645b 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -69,7 +69,7 @@ from tqdm import trange
 from lerobot.configs import parser
 from lerobot.configs.eval import EvalPipelineConfig
 from lerobot.envs.factory import make_env
-from lerobot.envs.utils import add_envs_task, check_env_attributes_and_types, preprocess_observation
+from lerobot.envs.utils import add_envs_task, check_env_attributes_and_types, preprocess_observation, preprocess_observation1
 from lerobot.policies.factory import make_policy
 from lerobot.policies.pretrained import PreTrainedPolicy
 from lerobot.policies.utils import get_device_from_parameters
@@ -125,6 +125,10 @@ def rollout(
 
     # Reset the policy and environments.
     policy.reset()
+    # added by jade
+    # for k in list(policy.config.input_features.keys()):
+    #         if k.startswith("observation.image"):
+    #             policy.config.input_features["observation.images." + k.split("observation.", 1)[1]] = policy.config.input_features.pop(k)
     observation, info = env.reset(seed=seeds)
     if render_callback is not None:
         render_callback(env)
@@ -149,6 +153,7 @@ def rollout(
     while not np.all(done) and step < max_steps:
         # Numpy array to tensor and changing dictionary keys to LeRobot policy format.
         observation = preprocess_observation(observation)
+        # observation = preprocess_observation1(observation)
         if return_observations:
             all_observations.append(deepcopy(observation))
 
@@ -159,6 +164,26 @@ def rollout(
         # Infer "task" from attributes of environments.
         # TODO: works with SyncVectorEnv but not AsyncVectorEnv
         observation = add_envs_task(env, observation)
+        # breakpoint()
+        # observation = {
+        #     k.replace("observation.images.", "observation.") if k.startswith("observation.images.") else k: v
+        #     for k, v in observation.items()
+        # # }
+        # if "observation.image" in observation:
+        #     observation["image"] = observation.pop("observation.image").to(
+        #         device, non_blocking=device.type == "cuda"
+        #     )
+
+        # if "observation.image2" in observation:
+        #     observation["wrist_image"] = observation.pop("observation.image2").to(
+        #         device, non_blocking=device.type == "cuda"
+        #     )
+
+        # if "observation.state" in observation:
+        #     observation["state"] = observation.pop("observation.state").to(
+        #         device, non_blocking=device.type == "cuda"
+        #     )
+
         with torch.inference_mode():
             action = policy.select_action(observation)
         # Convert to CPU / numpy.
@@ -489,12 +514,11 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData
     print("Normalization layers recreated with dataset stats.")
 
 
-def load_smolvla(cfg, dataset_repo: str):
+def load_smolvla(cfg, dataset_repo: str, policy):
     from lerobot.datasets.lerobot_dataset import LeRobotDataset
     dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/')
-    policy = make_policy(cfg=cfg, ds_meta=dataset.meta)
     _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta)  # only needed if stats are missing
-    return policy, dataset
+    return policy.to("cuda"), dataset
 
 
 @parser.wrap()
@@ -505,7 +529,7 @@ def eval_main(cfg: EvalPipelineConfig):
     device = get_safe_torch_device(cfg.policy.device, log=True)
     #login to hf
     from huggingface_hub import login
-    login()
+    # login()
     torch.backends.cudnn.benchmark = True
     torch.backends.cuda.matmul.allow_tf32 = True
     set_seed(cfg.seed)
@@ -520,9 +544,10 @@ def eval_main(cfg: EvalPipelineConfig):
         cfg=cfg.policy,
         env_cfg=cfg.env,
     )
-    # breakpoint()
-    load_smolvla(cfg.policy, "physical-intelligence/libero")
-    # breakpoint()
+    breakpoint()
+    # policy, _ = load_smolvla(cfg.policy, "physical-intelligence/libero", policy)
+    # rename "image" -> "observation.image"
+    
     policy.eval()
     with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext():
         if cfg.env.multitask_eval:

From 96cc634a661283f780f27326c406361c72da7655 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Thu, 11 Sep 2025 12:21:21 +0200
Subject: [PATCH 04/12] add new changes

---
 examples/6_evaluate_libero.sh    |  10 +-
 examples/8_train_smolvla_must.sh |  10 +-
 src/lerobot/envs/factory.py      |  59 +++---
 src/lerobot/envs/libero.py       | 351 ++++++++++++++++++++++---------
 src/lerobot/envs/utils.py        |  44 ++++
 src/lerobot/policies/factory.py  |   1 -
 src/lerobot/scripts/eval.py      | 261 ++++++++++++-----------
 src/lerobot/scripts/train.py     |  73 +++----
 8 files changed, 505 insertions(+), 304 deletions(-)

diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh
index c15d71c95..46355dfa1 100644
--- a/examples/6_evaluate_libero.sh
+++ b/examples/6_evaluate_libero.sh
@@ -18,11 +18,11 @@ export CUDA_VISIBLE_DEVICES=2
 
 # CONFIGURATION
 POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model"
-POLICY_PATH="/raid/jade/logs/lerobot/lerobot_new_HuggingfaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model"
-TASK=libero_spatial
+POLICY_PATH="/raid/jade/models/smolvlamust"
+TASK=libero_spatial,libero_object
 ENV_TYPE="libero"
-BATCH_SIZE=10
-N_EPISODES=10
+BATCH_SIZE=1
+N_EPISODES=1
 # storage / caches
 RAID=/raid/jade
 N_ACTION_STEPS=1
@@ -46,7 +46,7 @@ python src/lerobot/scripts/eval.py \
     --env.type="$ENV_TYPE" \
     --eval.batch_size="$BATCH_SIZE" \
     --eval.n_episodes="$N_EPISODES" \
-    --env.multitask_eval=False \
+    --env.multitask_eval=True \
     --env.task=$TASK \
 # python examples/evaluate_libero.py \
 #     --policy_path "$POLICY_PATH" \
diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh
index 98029f3ad..828627d85 100644
--- a/examples/8_train_smolvla_must.sh
+++ b/examples/8_train_smolvla_must.sh
@@ -56,14 +56,16 @@ EXPERT_WIDTH_MULTIPLIER=0.5
 # number of gpus to use
 NUM_PROCESSES=2
 NUM_VLM_LAYERS=0
-SELF_ATTN_EVERY_N_LAYERS=2
+SELF_ATTN_EVERY_N_LAYERS=0
 CHUNK_SIZE=50
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=1
 PORT=29522
 PREFIX_LENGTH=0
 LOAD_VLM_WEIGHTS=true
+MAX_ACTION_DIM=32
+MAX_STATE_DIM=32
 # naming/output dir
-TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
+TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_sep11_v2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
 echo "Training dir: $TRAIN_DIR"
 
 rm -rf "$TRAIN_DIR"
@@ -137,5 +139,7 @@ python src/lerobot/scripts/train.py \
     --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
     --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
     --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
+    --policy.max_action_dim=$MAX_ACTION_DIM \
+    --policy.max_state_dim=$MAX_STATE_DIM \
     --seed=$SEED \
     --wandb.enable=false
diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py
index 211b41714..d38b2eed3 100644
--- a/src/lerobot/envs/factory.py
+++ b/src/lerobot/envs/factory.py
@@ -56,36 +56,41 @@ def make_env(
             names to indexed vectorized environments (when multitask eval is used).
 
     """
-        if n_envs < 1:
-            raise ValueError("`n_envs` must be at least 1")
+    if n_envs < 1:
+        raise ValueError("`n_envs` must be at least 1")
 
-        env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv
+    env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv
+
+
+    if "libero" in cfg.type:
+        from lerobot.envs.libero import create_libero_envs
+        return create_libero_envs(
+            task=cfg.task,
+            n_envs=n_envs,
+            camera_name=cfg.camera_name,
+            init_states=cfg.init_states,
+            gym_kwargs=cfg.gym_kwargs,
+            env_cls=env_cls,
+            multitask_eval=cfg.multitask_eval,
+        )
 
     
-        if "libero" in cfg.type:
-            from lerobot.envs.libero import create_libero_envs
-            return create_libero_envs(
-                task=cfg.task,
-                n_envs=n_envs,
-                camera_name=cfg.camera_name,
-                init_states=cfg.init_states,
-                gym_kwargs=cfg.gym_kwargs,
-                env_cls=env_cls,
-                multitask_eval=cfg.multitask_eval,
-            )
+    package_name = f"gym_{cfg.type}"
+    try:
+        importlib.import_module(package_name)
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError(
+            f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\""
+        ) from e
 
-        
-        package_name = f"gym_{cfg.type}"
-        try:
-            importlib.import_module(package_name)
-        except ModuleNotFoundError as e:
-            raise ModuleNotFoundError(
-                f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\""
-            ) from e
+    gym_handle = f"{package_name}/{cfg.task}"
+    
+    def _make_one():
+        return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {}))
 
-        gym_handle = f"{package_name}/{cfg.task}"
-        
-        def _make_one():
-            return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {}))
+    vec = env_cls([_make_one for _ in range(n_envs)])
+
+    # normalize to {suite: {task_id: vec_env}} for consistency
+    suite_name = cfg.type  # e.g., "pusht", "aloha"
+    return {suite_name: {0: vec}}
 
-        return env_cls([_make_one for _ in range(n_envs)])
diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py
index 83ccd2fb9..ff1574416 100644
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 import os
 from collections import defaultdict
@@ -12,100 +14,153 @@ from gymnasium import spaces
 from libero.libero import benchmark, get_libero_path
 from libero.libero.envs import OffScreenRenderEnv
 
+import logging
+from collections import defaultdict
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Sequence
+
+
+logger = logging.getLogger(__name__)
+
+# ---- Helpers -----------------------------------------------------------------
+
+def _parse_camera_names(camera_name: str | Sequence[str]) -> List[str]:
+    """Normalize camera_name into a non-empty list of strings."""
+    if isinstance(camera_name, str):
+        cams = [c.strip() for c in camera_name.split(",") if c.strip()]
+    elif isinstance(camera_name, (list, tuple)):
+        cams = [str(c).strip() for c in camera_name if str(c).strip()]
+    else:
+        raise TypeError(f"camera_name must be str or sequence[str], got {type(camera_name).__name__}")
+    if not cams:
+        raise ValueError("camera_name resolved to an empty list.")
+    return cams
+
+
+def _get_suite(name: str):
+    """Instantiate a LIBERO suite by name with clear validation."""
+    bench = benchmark.get_benchmark_dict()
+    if name not in bench:
+        raise ValueError(f"Unknown LIBERO suite '{name}'. Available: {', '.join(sorted(bench.keys()))}")
+    suite = bench[name]()
+    if not getattr(suite, "tasks", None):
+        raise ValueError(f"Suite '{name}' has no tasks.")
+    return suite
+
+
+def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> List[int]:
+    """Validate/normalize task ids. If None → all tasks."""
+    if task_ids is None:
+        return list(range(total_tasks))
+    ids = sorted(set(int(t) for t in task_ids))
+    for t in ids:
+        if t < 0 or t >= total_tasks:
+            raise ValueError(f"task_id {t} out of range [0, {total_tasks-1}].")
+    return ids
+
+
+def _make_env_fns(
+    *,
+    suite,
+    suite_name: str,
+    task_id: int,
+    n_envs: int,
+    camera_names: List[str],
+    init_states: bool,
+    gym_kwargs: Mapping[str, Any],
+    LiberoEnv: type,  # injected to avoid forward ref issues if needed
+) -> List[Callable[[], "LiberoEnv"]]:
+    """Build n_envs factory callables for a single (suite, task_id)."""
+    joined_cams = ",".join(camera_names)  # keep backward-compat: downstream expects a string
+    fns: List[Callable[[], "LiberoEnv"]] = []
+    for i in range(n_envs):
+        def _mk(i=i, suite=suite, task_id=task_id, suite_name=suite_name, joined_cams=joined_cams, init_states=init_states, gym_kwargs=dict(gym_kwargs)):
+            return LiberoEnv(
+                task_suite=suite,
+                task_id=task_id,
+                task_suite_name=suite_name,
+                camera_name=joined_cams,
+                init_states=init_states,
+                episode_index=i,
+                **gym_kwargs,
+            )
+        fns.append(_mk)
+    return fns
+
+# ---- Main API ----------------------------------------------------------------
 
 def create_libero_envs(
     task: str,
     n_envs: int,
-    gym_kwargs: dict[str, Any] = None,
-    camera_name: str = "agentview_image,robot0_eye_in_hand_image",
+    gym_kwargs: dict[str, Any] | None = None,
+    camera_name: str | Sequence[str] = "agentview_image,robot0_eye_in_hand_image",
     init_states: bool = True,
-    env_cls: Callable = None,
-    multitask_eval: bool = True,
-) -> dict[str, dict[str, Any]]:
+    env_cls: Callable[[Sequence[Callable[[], Any]]], Any] | None = None,
+    multitask_eval: bool = True,  # kept for signature compatibility; return type is consistent regardless
+) -> dict[str, dict[int, Any]]:
     """
-    Here n_envs is per task and equal to the number of rollouts.
+    Create vectorized LIBERO environments with a consistent return shape.
+
     Returns:
-        dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs.
+        dict[suite_name][task_id] -> vec_env (env_cls([...]) with exactly n_envs factories)
+    Notes:
+        - n_envs is the number of rollouts *per task* (episode_index = 0..n_envs-1).
+        - `task` can be a single suite or a comma-separated list of suites.
+        - You may pass `task_ids` (list[int]) inside `gym_kwargs` to restrict tasks per suite.
     """
-    print("num envs", n_envs)
-    print("multitask_eval", multitask_eval)
-    print("gym_kwargs", gym_kwargs)
-    if gym_kwargs is None:
-        gym_kwargs = {}
+    if env_cls is None or not callable(env_cls):
+        raise ValueError("env_cls must be a callable that wraps a list of environment factory callables.")
+    if not isinstance(n_envs, int) or n_envs <= 0:
+        raise ValueError(f"n_envs must be a positive int; got {n_envs}.")
 
-    if not multitask_eval:
-        benchmark_dict = benchmark.get_benchmark_dict()
-        task_suite = benchmark_dict[task]()  # can also choose libero_spatial, libero_object, libero_10 etc.
-        tasks_id = list(range(len(task_suite.tasks)))
-        episode_indices = [0 for i in range(len(tasks_id))]
-        if len(tasks_id) == 1:
-            tasks_id = [tasks_id[0] for _ in range(n_envs)]
-            episode_indices = list(range(n_envs))
-        elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0:
-            n_repeat = n_envs // len(tasks_id)
-            print("n_repeat", n_repeat)
-            episode_indices = []
-            for _ in range(len(tasks_id)):
-                episode_indices.extend(list(range(n_repeat)))
-            tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id]))
-        elif n_envs < len(tasks_id):
-            tasks_id = tasks_id[:n_envs]
-            episode_indices = list(range(n_envs))[:n_envs]
-            print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}")
-        print(f"Creating Libero envs with task ids {tasks_id} from suite {task}")
-        assert n_envs == len(tasks_id), (
-            f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}"
-        )
-        return env_cls(
-            [
-                lambda i=i: LiberoEnv(
-                    task_suite=task_suite,
-                    task_id=tasks_id[i],
-                    task_suite_name=task,
-                    camera_name=camera_name,
-                    init_states=init_states,
-                    episode_index=episode_indices[i],
-                    **gym_kwargs,
-                )
-                for i in range(n_envs)
-            ]
-        )
-    else:
-        envs = defaultdict(dict)
-        benchmark_dict = benchmark.get_benchmark_dict()
-        task = task.split(",")
-        for _task in task:
-            task_suite = benchmark_dict[
-                _task
-            ]()  # can also choose libero_spatial, libero_object, libero_10 etc.
-            tasks_ids = list(range(len(task_suite.tasks)))
-            for tasks_id in tasks_ids:
-                episode_indices = list(range(n_envs))
-                print(
-                    f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}"
-                )
-                envs_list = [
-                    (
-                        lambda i=i,
-                        task_suite=task_suite,
-                        tasks_id=tasks_id,
-                        _task=_task,
-                        episode_indices=episode_indices: LiberoEnv(
-                            task_suite=task_suite,
-                            task_id=tasks_id,
-                            task_suite_name=_task,
-                            camera_name=camera_name,
-                            init_states=init_states,
-                            episode_index=episode_indices[i],
-                            **gym_kwargs,
-                        )
-                    )
-                    for i in range(n_envs)
-                ]
-                envs[_task][tasks_id] = env_cls(envs_list)
-        return envs
+    gym_kwargs = dict(gym_kwargs or {})
+    task_ids_filter = gym_kwargs.pop("task_ids", None)  # optional: limit to specific tasks
 
+    # Avoid circular import/type issues: assume LiberoEnv is defined in this module
+    try:
+        LiberoEnv  # type: ignore[name-defined]
+    except NameError:
+        # If LiberoEnv is in the same file, this won't run. If it's elsewhere, import here.
+        exit()
+        # from .libero_env import LiberoEnv  # adjust if your class lives in another module
 
+    camera_names = _parse_camera_names(camera_name)
+    suite_names = [s.strip() for s in str(task).split(",") if s.strip()]
+    if not suite_names:
+        raise ValueError("`task` must contain at least one LIBERO suite name.")
+
+    logger.info(
+        "Creating LIBERO envs | suites=%s | n_envs(per task)=%d | init_states=%s | multitask_eval=%s",
+        suite_names, n_envs, init_states, bool(multitask_eval)
+    )
+    if task_ids_filter is not None:
+        logger.info("Restricting to task_ids=%s", task_ids_filter)
+
+    out: Dict[str, Dict[int, Any]] = defaultdict(dict)
+
+    for suite_name in suite_names:
+        suite = _get_suite(suite_name)
+        total = len(suite.tasks)
+        selected = _select_task_ids(total, task_ids_filter)
+
+        if not selected:
+            raise ValueError(f"No tasks selected for suite '{suite_name}' (available: {total}).")
+
+        for tid in selected:
+            fns = _make_env_fns(
+                suite=suite,
+                suite_name=suite_name,
+                task_id=tid,
+                n_envs=n_envs,
+                camera_names=camera_names,
+                init_states=init_states,
+                gym_kwargs=gym_kwargs,
+                LiberoEnv=LiberoEnv,
+            )
+            out[suite_name][tid] = env_cls(fns)
+            logger.debug("Built vec env | suite=%s | task_id=%d | n_envs=%d", suite_name, tid, n_envs)
+
+    # return plain dicts for predictability
+    return {suite: dict(task_map) for suite, task_map in out.items()}
 def quat2axisangle(quat):
     """
     Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
@@ -199,17 +254,15 @@ class LiberoEnv(gym.Env):
         self.episode_index = episode_index
 
         self._env = self._make_envs_task(task_suite, self.task_id)
-        if task_suite_name == "libero_spatial":
-            max_steps = 220  # longest training demo has 193 steps
-        elif task_suite_name == "libero_object":
-            max_steps = 280  # longest training demo has 254 steps
-        elif task_suite_name == "libero_goal":
-            max_steps = 300  # longest training demo has 270 steps
-        elif task_suite_name == "libero_10":
-            max_steps = 520  # longest training demo has 505 steps
-        elif task_suite_name == "libero_90":
-            max_steps = 400  # longest training demo has 373 steps
-        self._max_episode_steps = max_steps
+        TASK_SUITE_MAX_STEPS: dict[str, int] = {
+            "libero_spatial": 220,  # longest training demo has 193 steps
+            "libero_object": 280,   # longest training demo has 254 steps
+            "libero_goal": 300,     # longest training demo has 270 steps
+            "libero_10": 520,       # longest training demo has 505 steps
+            "libero_90": 400,       # longest training demo has 373 steps
+        }
+        default_steps = 500
+        self._max_episode_steps = TASK_SUITE_MAX_STEPS.get(task_suite_name, default_steps)
 
         images = {}
         for cam in self.camera_name:
@@ -221,7 +274,11 @@ class LiberoEnv(gym.Env):
             )
 
         if self.obs_type == "state":
-            raise NotImplementedError()
+            raise NotImplementedError(
+                "The 'state' observation type is not supported in LiberoEnv. "
+                "Please switch to an image-based obs_type (e.g. 'pixels', 'pixels_agent_pos')."
+            )
+
         elif self.obs_type == "pixels":
             self.observation_space = spaces.Dict(
                 {
@@ -285,7 +342,10 @@ class LiberoEnv(gym.Env):
         )
         agent_pos = state
         if self.obs_type == "state":
-            raise NotImplementedError()
+            raise NotImplementedError(
+                "The 'state' observation type is not supported in LiberoEnv. "
+                "Please switch to an image-based obs_type (e.g. 'pixels', 'pixels_agent_pos')."
+            )
         elif self.obs_type == "pixels":
             obs = {"pixels": images.copy()}
         elif self.obs_type == "pixels_agent_pos":
@@ -308,7 +368,11 @@ class LiberoEnv(gym.Env):
         return observation, info
 
     def step(self, action):
-        assert action.ndim == 1
+        if action.ndim != 1:
+            raise ValueError(
+                f"Expected action to be 1-D (shape (action_dim,)), "
+                f"but got shape {action.shape} with ndim={action.ndim}"
+            )
         raw_obs, reward, done, info = self._env.step(action)
 
         is_success = self._env.check_success()
@@ -324,3 +388,96 @@ class LiberoEnv(gym.Env):
 
     def close(self):
         self._env.close()
+
+
+def create_libero_envs1(
+    task: str,
+    n_envs: int,
+    gym_kwargs: dict[str, Any] = None,
+    camera_name: str = "agentview_image,robot0_eye_in_hand_image",
+    init_states: bool = True,
+    env_cls: Callable = None,
+    multitask_eval: bool = True,
+) -> dict[str, dict[str, Any]]:
+    """
+    Here n_envs is per task and equal to the number of rollouts.
+    Returns:
+        dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs.
+    """
+    print("num envs", n_envs)
+    print("multitask_eval", multitask_eval)
+    print("gym_kwargs", gym_kwargs)
+    if gym_kwargs is None:
+        gym_kwargs = {}
+
+    if not multitask_eval:
+        benchmark_dict = benchmark.get_benchmark_dict()
+        task_suite = benchmark_dict[task]()  # can also choose libero_spatial, libero_object, libero_10 etc.
+        tasks_id = list(range(len(task_suite.tasks)))
+        episode_indices = [0 for i in range(len(tasks_id))]
+        if len(tasks_id) == 1:
+            tasks_id = [tasks_id[0] for _ in range(n_envs)]
+            episode_indices = list(range(n_envs))
+        elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0:
+            n_repeat = n_envs // len(tasks_id)
+            print("n_repeat", n_repeat)
+            episode_indices = []
+            for _ in range(len(tasks_id)):
+                episode_indices.extend(list(range(n_repeat)))
+            tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id]))
+        elif n_envs < len(tasks_id):
+            tasks_id = tasks_id[:n_envs]
+            episode_indices = list(range(n_envs))[:n_envs]
+            print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}")
+        print(f"Creating Libero envs with task ids {tasks_id} from suite {task}")
+        assert n_envs == len(tasks_id), (
+            f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}"
+        )
+        return env_cls(
+            [
+                lambda i=i: LiberoEnv(
+                    task_suite=task_suite,
+                    task_id=tasks_id[i],
+                    task_suite_name=task,
+                    camera_name=camera_name,
+                    init_states=init_states,
+                    episode_index=episode_indices[i],
+                    **gym_kwargs,
+                )
+                for i in range(n_envs)
+            ]
+        )
+    else:
+        envs = defaultdict(dict)
+        benchmark_dict = benchmark.get_benchmark_dict()
+        task = task.split(",")
+        for _task in task:
+            task_suite = benchmark_dict[
+                _task
+            ]()  # can also choose libero_spatial, libero_object, libero_10 etc.
+            tasks_ids = list(range(len(task_suite.tasks)))
+            for tasks_id in tasks_ids:
+                episode_indices = list(range(n_envs))
+                print(
+                    f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}"
+                )
+                envs_list = [
+                    (
+                        lambda i=i,
+                        task_suite=task_suite,
+                        tasks_id=tasks_id,
+                        _task=_task,
+                        episode_indices=episode_indices: LiberoEnv(
+                            task_suite=task_suite,
+                            task_id=tasks_id,
+                            task_suite_name=_task,
+                            camera_name=camera_name,
+                            init_states=init_states,
+                            episode_index=episode_indices[i],
+                            **gym_kwargs,
+                        )
+                    )
+                    for i in range(n_envs)
+                ]
+                envs[_task][tasks_id] = env_cls(envs_list)
+        return envs
diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py
index 5ae252dbe..9490f670e 100644
--- a/src/lerobot/envs/utils.py
+++ b/src/lerobot/envs/utils.py
@@ -182,3 +182,47 @@ def add_envs_task(env: gym.vector.VectorEnv, observation: dict[str, Any]) -> dic
         num_envs = observation[list(observation.keys())[0]].shape[0]
         observation["task"] = ["" for _ in range(num_envs)]
     return observation
+
+def _close_single_env(env: Any) -> None:
+    """Try to close a single env object if it exposes .close()."""
+    try:
+        close_fn = getattr(env, "close", None)
+        if callable(close_fn):
+            close_fn()
+    except Exception as exc:
+        # Best-effort close: log but don't raise
+        LOG.debug("Exception while closing env %s: %s", env, exc)
+
+def close_envs(env_or_collection: Any) -> None:
+    """
+    Close a single env or any nested structure of envs.
+
+    Accepts:
+      - a single env with .close()
+      - a Mapping of things (e.g. dict)
+      - a Sequence of things (list/tuple) but NOT str/bytes
+      - nested combinations of the above
+
+    This is intentionally permissive and best-effort: it will swallow exceptions
+    encountered while closing individual envs and continue.
+    """
+    # Guard: single object with close()
+    if hasattr(env_or_collection, "close") and not isinstance(env_or_collection, (Mapping, Sequence)):
+        _close_single_env(env_or_collection)
+        return
+
+    # Mapping (e.g., {suite: {task_id: vec_env}})
+    if isinstance(env_or_collection, Mapping):
+        for v in env_or_collection.values():
+            close_envs(v)
+        return
+
+    # Sequence (list/tuple) but skip str/bytes
+    if isinstance(env_or_collection, Sequence) and not isinstance(env_or_collection, (str, bytes)):
+        for v in env_or_collection:
+            close_envs(v)
+        return
+
+    # Fallback: try to close if possible
+    if hasattr(env_or_collection, "close"):
+        _close_single_env(env_or_collection)
\ No newline at end of file
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index b7d92d988..03fa44a2a 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -177,6 +177,5 @@ def make_policy(
         policy = policy_cls(**kwargs)
     policy.to(cfg.device)
     assert isinstance(policy, nn.Module)
-    breakpoint()
     # policy = torch.compile(policy, mode="reduce-overhead")
     return policy
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 6fbae645b..795ed2b3c 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -45,7 +45,6 @@ Note that in both examples, the repo/folder should contain at least `config.json
 
 You can learn about the CLI options for this script in the `EvalPipelineConfig` in lerobot/configs/eval.py
 """
-
 import concurrent
 import json
 import logging
@@ -80,6 +79,9 @@ from lerobot.utils.utils import (
     init_logging,
     inside_slurm,
 )
+from typing import TypedDict, Dict, List, Tuple, Iterator
+from collections import defaultdict
+import concurrent.futures as cf
 
 
 def rollout(
@@ -537,8 +539,8 @@ def eval_main(cfg: EvalPipelineConfig):
     logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
 
     logging.info("Making environment.")
-    env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
-
+    envs = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
+    breakpoint()
     logging.info("Making policy.")
     policy = make_policy(
         cfg=cfg.policy,
@@ -550,41 +552,29 @@ def eval_main(cfg: EvalPipelineConfig):
     
     policy.eval()
     with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext():
-        if cfg.env.multitask_eval:
-            info = eval_policy_multitask(
-                env,
-                policy,
-                cfg.eval.n_episodes,
-                max_episodes_rendered=10,
-                videos_dir=Path(cfg.output_dir) / "videos",
-                start_seed=cfg.seed,
-                max_parallel_tasks=cfg.env.max_parallel_tasks,
-                verbose=False,
-            )
-            print("Overall Aggregated Metrics:")
-            print(info["overall"]["aggregated"])
-
-            # Print per-suite stats
-            for task_group, task_group_info in info.items():
-                if task_group == "overall":
-                    continue  # Skip the overall stats since we already printed it
-                print(f"\nAggregated Metrics for {task_group}:")
-                print(task_group_info["aggregated"])
-            for _task_group, v in env.items():
-                for _env in v.values():
-                    _env.close()
-        else:
-            info = eval_policy(
-                env,
-                policy,
-                cfg.eval.n_episodes,
-                max_episodes_rendered=10,
-                videos_dir=Path(cfg.output_dir) / "videos",
-                start_seed=cfg.seed,
-            )
-            print(info["aggregated"])
-            env.close()
+        info = eval_policy_all(
+            envs,
+            policy,
+            cfg.eval.n_episodes,
+            max_episodes_rendered=10,
+            videos_dir=Path(cfg.output_dir) / "videos",
+            start_seed=cfg.seed,
+            max_parallel_tasks=cfg.env.max_parallel_tasks,
+            verbose=False,
+        )
+        print("Overall Aggregated Metrics:")
+        print(info["overall"]["aggregated"])
 
+        # Print per-suite stats
+        for task_group, task_group_info in info.items():
+            if task_group == "overall":
+                continue  # Skip the overall stats since we already printed it
+            print(f"\nAggregated Metrics for {task_group}:")
+            print(task_group_info["aggregated"])
+    # Close all vec envs
+    for _suite, task_map in envs.items():
+        for _vec in task_map.values():
+            _vec.close()
     # Save info
     with open(Path(cfg.output_dir) / "eval_info.json", "w") as f:
         json.dump(info, f, indent=2)
@@ -592,9 +582,19 @@ def eval_main(cfg: EvalPipelineConfig):
     logging.info("End of eval")
 
 
-def eval_policy_multitask(
-    envs: dict[str, dict[str, gym.vector.VectorEnv]],
-    policy,
+# ---- typed payload returned by one task eval ----
+class TaskMetrics(TypedDict):
+    sum_rewards: List[float]
+    max_rewards: List[float]
+    successes: List[bool]
+    video_paths: List[str]
+
+ACC_KEYS = ("sum_rewards", "max_rewards", "successes", "video_paths")
+
+
+def eval_policy_all(
+    envs: dict[str, dict[int, gym.vector.VectorEnv]],
+    policy: PreTrainedPolicy,
     n_episodes: int,
     max_episodes_rendered: int = 0,
     videos_dir: Path | None = None,
@@ -603,129 +603,136 @@ def eval_policy_multitask(
     max_parallel_tasks: int = 5,
     verbose: bool = True,
 ) -> dict:
+    """
+    Evaluate a policy over a dict-of-dicts of vectorized envs:
+    envs[suite_name][task_id] -> gym.vector.VectorEnv
+    Returns a dict with per-suite aggregates and an 'overall' block.
+    """
     global_start = time.time()
-    results = {}
 
-    overall_rewards, overall_max_rewards, overall_successes = [], [], []
-    overall_video_paths = []
-    overall_episode_data = None
+    # inner: evaluate a single (suite, task) 
+    def eval_one(
+        task_group: str,
+        task_id: int,
+        env: gym.vector.VectorEnv,
+        *,
+        policy: PreTrainedPolicy,
+        n_episodes: int,
+        max_episodes_rendered: int,
+        videos_dir: Path | None,
+        return_episode_data: bool,
+        start_seed: int | None,
+    ) -> TaskMetrics:
+        """Evaluates one task_id of one suite using the provided vec env."""
+        if verbose:
+            print(f"Evaluating: task_group={task_group}, task_id={task_id} ...")
 
-    def eval_task(task_group, task_id, env):
-        """Evaluates a single task in parallel."""
-        print(f"Evaluating: task_group: {task_group}, task_id: {task_id} ...")
+        task_videos_dir = None
         if videos_dir is not None:
             task_videos_dir = videos_dir / f"{task_group}_{task_id}"
             task_videos_dir.mkdir(parents=True, exist_ok=True)
+
         task_result = eval_policy(
-            env,
-            policy,
-            n_episodes,
-            max_episodes_rendered,
-            task_videos_dir,
-            return_episode_data,
-            start_seed,
+            env=env,
+            policy=policy,
+            n_episodes=n_episodes,
+            max_episodes_rendered=max_episodes_rendered,
+            videos_dir=task_videos_dir,
+            return_episode_data=return_episode_data,
+            start_seed=start_seed,
         )
 
         per_episode = task_result["per_episode"]
-        return {
-            "task_group": task_group,
-            "task_id": task_id,
-            "sum_rewards": [ep["sum_reward"] for ep in per_episode],
-            "max_rewards": [ep["max_reward"] for ep in per_episode],
-            "successes": [ep["success"] for ep in per_episode],
-            "video_paths": task_result.get("video_paths", []),
-        }
+        return TaskMetrics(
+            sum_rewards=[ep["sum_reward"] for ep in per_episode],
+            max_rewards=[ep["max_reward"] for ep in per_episode],
+            successes=[ep["success"] for ep in per_episode],
+            video_paths=task_result.get("video_paths", []),
+        )
 
-    task_group_results = {}
-    if max_parallel_tasks == 1:
-        # sequential mode (safe for colab / EGL)
-        for task_group, tasks in envs.items():
-            for task_id, env in tasks.items():
-                task_result = eval_task(task_group, task_id, env)
-                if task_group not in task_group_results:
-                    task_group_results[task_group] = {
-                        "sum_rewards": [],
-                        "max_rewards": [],
-                        "successes": [],
-                        "video_paths": [],
-                    }
-                task_group_results[task_group]["sum_rewards"].extend(task_result["sum_rewards"])
-                task_group_results[task_group]["max_rewards"].extend(task_result["max_rewards"])
-                task_group_results[task_group]["successes"].extend(task_result["successes"])
-                task_group_results[task_group]["video_paths"].extend(task_result["video_paths"])
-    else:
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor:
-            future_to_task = {
-                executor.submit(eval_task, task_group, task_id, env): (task_group, task_id)
-                for task_group, tasks in envs.items()
-                for task_id, env in tasks.items()
-            }
+    # result producer: sequential or threaded, same consumer 
+    def iter_task_results() -> Iterator[Tuple[str, int, TaskMetrics]]:
+        if max_parallel_tasks == 1:
+            for task_group, tasks in envs.items():
+                for task_id, vec in tasks.items():
+                    yield task_group, task_id, eval_one(
+                        task_group, task_id, vec,
+                        policy=policy,
+                        n_episodes=n_episodes,
+                        max_episodes_rendered=max_episodes_rendered,
+                        videos_dir=videos_dir,
+                        return_episode_data=return_episode_data,
+                        start_seed=start_seed,
+                    )
+        else:
+            with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor:
+                fut2key: Dict[cf.Future, Tuple[str, int]] = {}
+                for task_group, tasks in envs.items():
+                    for task_id, vec in tasks.items():
+                        fut = executor.submit(
+                            eval_one, task_group, task_id, vec,
+                            policy=policy,
+                            n_episodes=n_episodes,
+                            max_episodes_rendered=max_episodes_rendered,
+                            videos_dir=videos_dir,
+                            return_episode_data=return_episode_data,
+                            start_seed=start_seed,
+                        )
+                        fut2key[fut] = (task_group, task_id)
+                for fut in cf.as_completed(fut2key):
+                    task_group, task_id = fut2key[fut]
+                    yield task_group, task_id, fut.result()
 
-            task_group_results = {}
+    # single accumulator path on the main thread 
+    group_acc: Dict[str, Dict[str, List]] = defaultdict(lambda: {k: [] for k in ACC_KEYS})
+    overall: Dict[str, List] = {k: [] for k in ACC_KEYS}
 
-            for future in concurrent.futures.as_completed(future_to_task):
-                task_result = future.result()
-                task_group = task_result["task_group"]
+    for task_group, task_id, metrics in iter_task_results():
+        acc = group_acc[task_group]
+        for k in ACC_KEYS:
+            acc[k].extend(metrics[k])
+            overall[k].extend(metrics[k])
 
-                if task_group not in task_group_results:
-                    task_group_results[task_group] = {
-                        "sum_rewards": [],
-                        "max_rewards": [],
-                        "successes": [],
-                        "video_paths": [],
-                    }
-
-                task_group_results[task_group]["sum_rewards"].extend(task_result["sum_rewards"])
-                task_group_results[task_group]["max_rewards"].extend(task_result["max_rewards"])
-                task_group_results[task_group]["successes"].extend(task_result["successes"])
-                task_group_results[task_group]["video_paths"].extend(task_result["video_paths"])
-
-    # Process results per task group
-    for task_group, data in task_group_results.items():
+    # build outputs
+    results: Dict[str, dict] = {}
+    for task_group, data in group_acc.items():
         suite_rewards = data["sum_rewards"]
-        suite_max_rewards = data["max_rewards"]
-        suite_successes = data["successes"]
-        suite_video_paths = data["video_paths"]
+        suite_max = data["max_rewards"]
+        suite_succ = data["successes"]
+        suite_vids = data["video_paths"]
 
         suite_eval_s = time.time() - global_start
         suite_eval_ep_s = suite_eval_s / max(1, len(suite_rewards))
 
         results[task_group] = {
             "aggregated": {
-                "avg_sum_reward": float(np.nanmean(suite_rewards)),
-                "avg_max_reward": float(np.nanmean(suite_max_rewards)),
-                "pc_success": float(np.nanmean(suite_successes) * 100),
+                "avg_sum_reward": float(np.nanmean(suite_rewards)) if suite_rewards else float("nan"),
+                "avg_max_reward": float(np.nanmean(suite_max)) if suite_max else float("nan"),
+                "pc_success": float(np.nanmean(suite_succ) * 100) if suite_succ else float("nan"),
                 "eval_s": suite_eval_s,
                 "eval_ep_s": suite_eval_ep_s,
             },
-            "video_paths": suite_video_paths,
-            "episodes": None,  # Modify if episode data is needed
+            "video_paths": suite_vids,
+            "episodes": None,
         }
 
-        overall_rewards.extend(suite_rewards)
-        overall_max_rewards.extend(suite_max_rewards)
-        overall_successes.extend(suite_successes)
-        overall_video_paths.extend(suite_video_paths)
-
-    # Global metrics
     global_eval_s = time.time() - global_start
-    global_eval_ep_s = global_eval_s / max(1, len(overall_rewards))
-
+    global_eval_ep_s = global_eval_s / max(1, len(overall["sum_rewards"]))
     results["overall"] = {
         "aggregated": {
-            "avg_sum_reward": float(np.nanmean(overall_rewards)),
-            "avg_max_reward": float(np.nanmean(overall_max_rewards)),
-            "pc_success": float(np.nanmean(overall_successes) * 100),
+            "avg_sum_reward": float(np.nanmean(overall["sum_rewards"])) if overall["sum_rewards"] else float("nan"),
+            "avg_max_reward": float(np.nanmean(overall["max_rewards"])) if overall["max_rewards"] else float("nan"),
+            "pc_success": float(np.nanmean(overall["successes"]) * 100) if overall["successes"] else float("nan"),
             "eval_s": global_eval_s,
             "eval_ep_s": global_eval_ep_s,
         },
-        "video_paths": overall_video_paths,
-        "episodes": overall_episode_data,
+        "video_paths": overall["video_paths"],
+        "episodes": None,
     }
-
     return results
 
 
+
 if __name__ == "__main__":
     init_logging()
     eval_main()
diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py
index 74219fc38..3feeb0512 100644
--- a/src/lerobot/scripts/train.py
+++ b/src/lerobot/scripts/train.py
@@ -30,11 +30,12 @@ from lerobot.datasets.factory import make_dataset
 from lerobot.datasets.sampler import EpisodeAwareSampler
 from lerobot.datasets.utils import cycle
 from lerobot.envs.factory import make_env
+from lerobot.envs.utils import close_envs
 from lerobot.optim.factory import make_optimizer_and_scheduler
 from lerobot.policies.factory import make_policy
 from lerobot.policies.pretrained import PreTrainedPolicy
 from lerobot.policies.utils import get_device_from_parameters
-from lerobot.scripts.eval import eval_policy, eval_policy_multitask
+from lerobot.scripts.eval import eval_policy_all
 from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
 from lerobot.utils.random_utils import set_seed
 from lerobot.utils.train_utils import (
@@ -270,62 +271,46 @@ def train(cfg: TrainPipelineConfig):
         if cfg.env and is_eval_step:
             step_id = get_step_identifier(step, cfg.steps)
             logging.info(f"Eval policy at step {step}")
-            with (
-                torch.no_grad(),
-                torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(),
-            ):
-                if cfg.env.multitask_eval:
-                    eval_info = eval_policy_multitask(
-                        eval_env,
-                        policy,
-                        cfg.eval.n_episodes,
-                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
-                        max_episodes_rendered=4,
-                        start_seed=cfg.seed,
-                        max_parallel_tasks=cfg.env.max_parallel_tasks,
-                    )
-                    aggregated = eval_info["overall"]["aggregated"]
-                    # Print per-suite stats, log?
-                    for task_group, task_group_info in eval_info.items():
-                        if task_group == "overall":
-                            continue  # Skip the overall stats since we already printed it
-                        print(f"\nAggregated Metrics for {task_group}:")
-                        print(task_group_info["aggregated"])
-                else:
-                    eval_info = eval_policy(
-                        eval_env,
-                        policy,
-                        cfg.eval.n_episodes,
-                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
-                        max_episodes_rendered=4,
-                        start_seed=cfg.seed,
-                    )
-                    aggregated = eval_info["aggregated"]
+            with torch.no_grad(), (torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext()):
+                eval_info = eval_policy_all(
+                    eval_env,                         # dict[suite][task_id] -> vec_env
+                    policy,
+                    cfg.eval.n_episodes,
+                    videos_dir=videos_dir,
+                    max_episodes_rendered=4,
+                    start_seed=cfg.seed,
+                    max_parallel_tasks=cfg.env.max_parallel_tasks,
+                    verbose=False,
+                )
 
+            # overall metrics (suite-agnostic)
+            aggregated = eval_info["overall"]["aggregated"]
+
+            # optional: per-suite logging
+            for suite, suite_info in eval_info.items():
+                if suite == "overall":
+                    continue
+                logging.info("Suite %s aggregated: %s", suite, suite_info["aggregated"])
+
+            # meters/tracker
             eval_metrics = {
                 "avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),
-                "pc_success": AverageMeter("success", ":.1f"),
-                "eval_s": AverageMeter("eval_s", ":.3f"),
+                "pc_success":     AverageMeter("success", ":.1f"),
+                "eval_s":         AverageMeter("eval_s", ":.3f"),
             }
             eval_tracker = MetricsTracker(
                 cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step
             )
-            eval_tracker.eval_s = aggregated.pop("eval_s")
-            eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward")
-            eval_tracker.pc_success = aggregated.pop("pc_success")
-            logging.info(eval_tracker)
+            eval_tracker.eval_s         = aggregated.get("eval_s", 0.0)
+            eval_tracker.avg_sum_reward = aggregated.get("avg_sum_reward", float("nan"))
+            eval_tracker.pc_success     = aggregated.get("pc_success", float("nan"))
             if wandb_logger:
                 wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
                 wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
                 wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval")
 
     if eval_env:
-        if cfg.env.multitask_eval:
-            for _task_group, envs_dict in eval_env.items():
-                for _idx, env in envs_dict.items():
-                    env.close()
-        else:
-            eval_env.close()
+        close_envs(eval_env)
     logging.info("End of training")
 
     if cfg.policy.push_to_hub:

From 565c992589ef2bf94a7fda75e857b3852702ee06 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Thu, 11 Sep 2025 13:47:58 +0200
Subject: [PATCH 05/12] iterate on review

---
 examples/6_evaluate_libero_2.sh    |   76 -
 examples/7_train_acc.sh            |   93 -
 examples/7_train_libero_smolvla.sh |   89 -
 examples/8_train_smolvla_must.sh   |  145 --
 examples/9_evaluate_must.sh        | 2811 ----------------------------
 examples/checker.py                |   27 -
 examples/checker2.py               |   35 -
 examples/convert_data.py           |  253 ---
 examples/convert_libero.py         |  126 --
 examples/evaluate_libero.py        |  255 ---
 examples/requirements.in           |    8 -
 examples/script2.py                |   70 -
 examples/script3.py                |   64 -
 examples/script4.py                |    3 -
 log_text.txt                       | 1765 -----------------
 src/lerobot/envs/libero copy.py    |  326 ----
 src/lerobot/envs/libero2.py        |  308 ---
 tmux_log.txt                       | 2008 --------------------
 18 files changed, 8462 deletions(-)
 delete mode 100644 examples/6_evaluate_libero_2.sh
 delete mode 100644 examples/7_train_acc.sh
 delete mode 100644 examples/7_train_libero_smolvla.sh
 delete mode 100644 examples/8_train_smolvla_must.sh
 delete mode 100644 examples/9_evaluate_must.sh
 delete mode 100644 examples/checker.py
 delete mode 100644 examples/checker2.py
 delete mode 100644 examples/convert_data.py
 delete mode 100644 examples/convert_libero.py
 delete mode 100644 examples/evaluate_libero.py
 delete mode 100644 examples/requirements.in
 delete mode 100644 examples/script2.py
 delete mode 100644 examples/script3.py
 delete mode 100644 examples/script4.py
 delete mode 100644 log_text.txt
 delete mode 100644 src/lerobot/envs/libero copy.py
 delete mode 100644 src/lerobot/envs/libero2.py
 delete mode 100644 tmux_log.txt

diff --git a/examples/6_evaluate_libero_2.sh b/examples/6_evaluate_libero_2.sh
deleted file mode 100644
index 9d05c0330..000000000
--- a/examples/6_evaluate_libero_2.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-
-# storage / caches
-RAID=/raid/jade
-export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
-export HF_HOME=$RAID/.cache/huggingface
-export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
-export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
-export WANDB_CACHE_DIR=$RAID/.cache/wandb
-export TMPDIR=$RAID/.cache/tmp
-mkdir -p $TMPDIR
-export WANDB_MODE=offline
-export HF_DATASETS_OFFLINE=1
-export HF_HUB_OFFLINE=1
-export TOKENIZERS_PARALLELISM=false
-export MUJOCO_GL=egl
-export CUDA_VISIBLE_DEVICES=3
-
-# CONFIGURATION
-POLICY_PATH="/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000/checkpoints/100000/pretrained_model"
-POLICY_PATH="AustineJohnBreaker/smolvla_stratch_libero_spatial"
-TASK=libero_spatial
-ENV_TYPE="libero"
-BATCH_SIZE=10
-N_EPISODES=10
-USE_AMP=false
-N_ACTION_STEPS=1
-SELF_ATTN_EVERY_N_LAYERS=2
-VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-PAD_LANG_TO=longest
-LOAD_VLM_WEIGHTS=true
-NUM_VLM_LAYERS=16
-CHUNK_SIZE=50
-N_OBS_STEPS=1
-NUM_EXPERT_LAYERS=0
-EXPERT_WIDTH_MULTIPLIER=0.5
-
-
-# storage / caches
-RAID=/raid/jade
-export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
-export HF_HOME=$RAID/.cache/huggingface
-export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
-export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
-export WANDB_CACHE_DIR=$RAID/.cache/wandb
-export TMPDIR=$RAID/.cache/tmp
-mkdir -p $TMPDIR
-export WANDB_MODE=offline
-# export HF_DATASETS_OFFLINE=1
-# export HF_HUB_OFFLINE=1
-export TOKENIZERS_PARALLELISM=false
-export MUJOCO_GL=egl
-export MUJOCO_GL=egl
-ADD_IMAGE_TOKENS=true
-unset HF_HUB_OFFLINE
-# RUN EVALUATION
-python src/lerobot/scripts/eval.py \
-    --policy.path="$POLICY_PATH" \
-    --env.type="$ENV_TYPE" \
-    --eval.batch_size="$BATCH_SIZE" \
-    --eval.n_episodes="$N_EPISODES" \
-    --env.multitask_eval=False \
-    --env.task=$TASK \
-    --policy.use_amp=$USE_AMP \
-    --policy.n_action_steps=$N_ACTION_STEPS \
-    # --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
-    --policy.attention_mode=$ATTN_MODE \
-    --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
-    --policy.vlm_model_name=$VLM_NAME \
-    --policy.pad_language_to=$PAD_LANG_TO \
-    --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
-    --policy.num_vlm_layers=$NUM_VLM_LAYERS \
-    --policy.chunk_size=$CHUNK_SIZE \
-    --policy.n_obs_steps=$N_OBS_STEPS \
-    --policy.num_expert_layers=$NUM_EXPERT_LAYERS \
-    --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
diff --git a/examples/7_train_acc.sh b/examples/7_train_acc.sh
deleted file mode 100644
index 27f445143..000000000
--- a/examples/7_train_acc.sh
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/bin/bash
-# smolvla training with accelerate
-
-set -euo pipefail
-
-# repo/env
-cd ~/lerobot || exit 1
-# conda activate lerobot
-export LC_ALL=C
-
-rm -f core-*
-
-# storage / caches
-RAID=/raid/jade
-export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
-export HF_HOME=$RAID/.cache/huggingface
-export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
-export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
-export WANDB_CACHE_DIR=$RAID/.cache/wandb
-export TMPDIR=$RAID/.cache/tmp
-mkdir -p $TMPDIR
-export WANDB_MODE=offline
-export HF_DATASETS_OFFLINE=1
-export HF_HUB_OFFLINE=1
-export TOKENIZERS_PARALLELISM=false
-export MUJOCO_GL=egl
-
-# CONFIG
-ENV=libero
-TASK=libero_spatial
-REPO_ID=physical-intelligence/libero
-
-POLICY=smolvla
-VLM=HuggingFaceTB/SmolVLM2-500M-Instruct
-
-# Optim / scheduling
-LR=1e-4
-DECAY_LR=2.5e-6
-DECAY_STEPS=30000
-USE_AMP=true   # set to true for mixed precision
-TRAIN_EXPERT_ONLY=true
-N_ACTION_STEPS=1
-SEED=1000
-
-# Training loop
-OFFLINE_STEPS=100000
-BATCH_SIZE=32
-EVAL_FREQ=0
-SAVE_FREQ=20000
-EVAL_BATCH_SIZE=1
-NUM_EPISODES=1
-
-# number of gpus to use
-NUM_PROCESSES=2
-export CUDA_VISIBLE_DEVICES=1,3
-PORT=29522
-
-# naming/output dir
-TRAIN_DIR=$RAID/logs/lerobot/lerobot_2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
-echo "Training dir: $TRAIN_DIR"
-
-rm -rf "$TRAIN_DIR"
-
-# RUN
-python -m accelerate.commands.launch \
-  --num_processes $NUM_PROCESSES \
-  --num_machines 1 \
-  --main_process_port $PORT \
-  --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \
-  src/lerobot/scripts/train_accelerate.py \
-    --policy.type=$POLICY \
-    --policy.use_amp=True \
-    --policy.vlm_model_name=$VLM \
-    --dataset.repo_id=$REPO_ID \
-    --dataset.root=$HF_DATASETS_CACHE \
-    --env.type=$ENV \
-    --env.task=$TASK \
-    --output_dir=$TRAIN_DIR \
-    --batch_size=$BATCH_SIZE \
-    --steps=$OFFLINE_STEPS \
-    --eval_freq=$EVAL_FREQ \
-    --save_freq=$SAVE_FREQ \
-    --eval.batch_size=$EVAL_BATCH_SIZE \
-    --eval.n_episodes=$NUM_EPISODES \
-    --policy.optimizer_lr=$LR \
-    --policy.repo_id=None \
-    --policy.scheduler_decay_lr=$DECAY_LR \
-    --policy.scheduler_decay_steps=$DECAY_STEPS \
-    --policy.n_action_steps=$N_ACTION_STEPS \
-    --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
-    --policy.vlm_model_name=$VLM \
-    --seed=$SEED \
-    --wandb.enable=false
diff --git a/examples/7_train_libero_smolvla.sh b/examples/7_train_libero_smolvla.sh
deleted file mode 100644
index f0b9de4e5..000000000
--- a/examples/7_train_libero_smolvla.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-# smolvla training
-
-set -euo pipefail
-
-# repo/env
-cd ~/lerobot || exit 1
-# conda activate lerobot
-export LC_ALL=C
-
-
-rm -f core-*
-
-# storage / caches (use RAID to avoid filling $HOME)
-RAID=/raid/jade
-export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
-export HF_HOME=$RAID/.cache/huggingface
-export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
-export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
-export WANDB_CACHE_DIR=$RAID/.cache/wandb
-export TMPDIR=$RAID/.cache/tmp
-mkdir -p $TMPDIR
-export WANDB_MODE=offline
-# export HF_DATASETS_OFFLINE=1
-# export HF_HUB_OFFLINE=1
-export TOKENIZERS_PARALLELISM=false
-export MUJOCO_GL=egl
-
-# will only use if accelerate is used
-PORT=29522
-
-# =================== CONFIG ===================
-ENV=libero
-TASK=libero_object
-REPO_ID=physical-intelligence/libero
-ROOT=$RAID
-POLICY=smolvla
-VLM=HuggingFaceTB/SmolVLM2-500M-Instruct
-
-# Optim / scheduling
-LR=1e-4
-DECAY_LR=2.5e-6
-DECAY_STEPS=30000
-USE_AMP=false
-TRAIN_EXPERT_ONLY=true
-N_ACTION_STEPS=1
-SEED=1000
-
-# Training loop
-OFFLINE_STEPS=100000
-BATCH_SIZE=32
-EVAL_FREQ=0
-SAVE_FREQ=300000
-EVAL_BATCH_SIZE=1
-NUM_EPISODES=1
-
-# GPU selection 0, 1, 2, 3
-export CUDA_VISIBLE_DEVICES=0
-
-# naming/output dir
-TRAIN_DIR=$RAID/logs/lerobot/lerobot_solo_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
-echo "Training dir: $TRAIN_DIR"
-
-# train
-rm -rf "$TRAIN_DIR"
-
-python src/lerobot/scripts/train.py \
-  --policy.type=$POLICY \
-  --policy.vlm_model_name=$VLM \
-  --dataset.repo_id=$REPO_ID \
-  --env.type=$ENV \
-  --env.task=$TASK \
-  --output_dir=$TRAIN_DIR \
-  --batch_size=$BATCH_SIZE \
-  --steps=$OFFLINE_STEPS \
-  --eval_freq=$EVAL_FREQ \
-  --save_freq=$SAVE_FREQ \
-  --eval.batch_size=$EVAL_BATCH_SIZE \
-  --eval.n_episodes=$NUM_EPISODES \
-  --policy.use_amp=$USE_AMP \
-  --policy.optimizer_lr=$LR \
-  --policy.repo_id=None \
-  --policy.scheduler_decay_lr=$DECAY_LR \
-  --policy.scheduler_decay_steps=$DECAY_STEPS \
-  --policy.n_action_steps=$N_ACTION_STEPS \
-  --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
-  --policy.vlm_model_name=$VLM \
-  --seed=$SEED \
-  --wandb.enable=false
diff --git a/examples/8_train_smolvla_must.sh b/examples/8_train_smolvla_must.sh
deleted file mode 100644
index 828627d85..000000000
--- a/examples/8_train_smolvla_must.sh
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/bin/bash
-# smolvla training with accelerate
-
-set -euo pipefail
-
-# repo/env
-cd ~/lerobot || exit 1
-# conda activate lerobot
-export LC_ALL=C
-
-rm -f core-*
-
-# storage / caches
-RAID=/raid/jade
-export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
-export HF_HOME=$RAID/.cache/huggingface
-export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
-export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
-export WANDB_CACHE_DIR=$RAID/.cache/wandb
-export TMPDIR=$RAID/.cache/tmp
-mkdir -p $TMPDIR
-export WANDB_MODE=offline
-# export HF_DATASETS_OFFLINE=1
-# export HF_HUB_OFFLINE=1
-export TOKENIZERS_PARALLELISM=false
-export MUJOCO_GL=egl
-
-# CONFIG
-ENV=libero
-TASK=libero_spatial
-REPO_ID=HuggingfaceVLA/libero
-
-POLICY=smolvla
-VLM=HuggingFaceTB/SmolVLM2-500M-Instruct
-
-# Optim / scheduling
-LR=1e-4
-DECAY_LR=2.5e-6
-DECAY_STEPS=30000
-USE_AMP=true   # set to true for mixed precision
-TRAIN_EXPERT_ONLY=true
-N_ACTION_STEPS=1
-SEED=1000
-LOAD_VLM_WEIGHTS=true
-# Training loop
-OFFLINE_STEPS=100000
-BATCH_SIZE=32
-EVAL_FREQ=0
-SAVE_FREQ=20000
-EVAL_BATCH_SIZE=1
-NUM_EPISODES=1
-ADD_IMAGE_TOKENS=tru
-N_OBS_STEPS=1
-ATTN_MODE=cross_attn
-EXPERT_WIDTH_MULTIPLIER=0.5
-# number of gpus to use
-NUM_PROCESSES=2
-NUM_VLM_LAYERS=0
-SELF_ATTN_EVERY_N_LAYERS=0
-CHUNK_SIZE=50
-export CUDA_VISIBLE_DEVICES=1
-PORT=29522
-PREFIX_LENGTH=0
-LOAD_VLM_WEIGHTS=true
-MAX_ACTION_DIM=32
-MAX_STATE_DIM=32
-# naming/output dir
-TRAIN_DIR=$RAID/logs/lerobot/lerobot_new_sep11_v2_${REPO_ID//\//_}_${POLICY}_lr${LR}bs${BATCH_SIZE}steps${OFFLINE_STEPS}
-echo "Training dir: $TRAIN_DIR"
-
-rm -rf "$TRAIN_DIR"
-
-# RUN
-# python -m accelerate.commands.launch \
-#   --num_processes $NUM_PROCESSES \
-#   --num_machines 1 \
-#   --main_process_port $PORT \
-#   --mixed_precision=$( [ "$USE_AMP" = true ] && echo "bf16" || echo "no" ) \
-#   src/lerobot/scripts/train_accelerate.py \
-#     --policy.type=$POLICY \
-#     --policy.use_amp=True \
-#     --policy.vlm_model_name=$VLM \
-#     --dataset.repo_id=$REPO_ID \
-#     --dataset.root=$HF_DATASETS_CACHE \
-#     --env.type=$ENV \
-#     --env.task=$TASK \
-#     --output_dir=$TRAIN_DIR \
-#     --batch_size=$BATCH_SIZE \
-#     --steps=$OFFLINE_STEPS \
-#     --eval_freq=$EVAL_FREQ \
-#     --save_freq=$SAVE_FREQ \
-#     --eval.batch_size=$EVAL_BATCH_SIZE \
-#     --eval.n_episodes=$NUM_EPISODES \
-#     --policy.optimizer_lr=$LR \
-#     --policy.repo_id=None \
-#     --policy.scheduler_decay_lr=$DECAY_LR \
-#     --policy.scheduler_decay_steps=$DECAY_STEPS \
-#     --policy.n_action_steps=$N_ACTION_STEPS \
-#     --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
-#     --policy.vlm_model_name=$VLM \
-#     --policy.n_obs_steps=$N_OBS_STEPS \
-#     --policy.attention_mode=$ATTN_MODE \
-#     --policy.prefix_length=$PREFIX_LENGTH \
-#     --policy.num_vlm_layers=$NUM_VLM_LAYERS \
-#     --policy.chunk_size=$CHUNK_SIZE \
-#     --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
-#     --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
-#     --seed=$SEED \
-#     --wandb.enable=false
-
-
-python src/lerobot/scripts/train.py \
-    --policy.type=$POLICY \
-    --policy.use_amp=False \
-    --policy.vlm_model_name=$VLM \
-    --dataset.repo_id=$REPO_ID \
-    --dataset.root='/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data' \
-    --env.type=$ENV \
-    --env.task=$TASK \
-    --output_dir=$TRAIN_DIR \
-    --batch_size=$BATCH_SIZE \
-    --steps=$OFFLINE_STEPS \
-    --eval_freq=$EVAL_FREQ \
-    --save_freq=$SAVE_FREQ \
-    --eval.batch_size=$EVAL_BATCH_SIZE \
-    --eval.n_episodes=$NUM_EPISODES \
-    --policy.optimizer_lr=$LR \
-    --policy.repo_id=None \
-    --policy.scheduler_decay_lr=$DECAY_LR \
-    --policy.scheduler_decay_steps=$DECAY_STEPS \
-    --policy.n_action_steps=$N_ACTION_STEPS \
-    --policy.train_expert_only=$TRAIN_EXPERT_ONLY \
-    --policy.vlm_model_name=$VLM \
-    --policy.n_obs_steps=$N_OBS_STEPS \
-    --policy.attention_mode=$ATTN_MODE \
-    --policy.prefix_length=$PREFIX_LENGTH \
-    --policy.num_vlm_layers=$NUM_VLM_LAYERS \
-    --policy.chunk_size=$CHUNK_SIZE \
-    --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
-    --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
-    --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
-    --policy.max_action_dim=$MAX_ACTION_DIM \
-    --policy.max_state_dim=$MAX_STATE_DIM \
-    --seed=$SEED \
-    --wandb.enable=false
diff --git a/examples/9_evaluate_must.sh b/examples/9_evaluate_must.sh
deleted file mode 100644
index 534153330..000000000
--- a/examples/9_evaluate_must.sh
+++ /dev/null
@@ -1,2811 +0,0 @@
-#!/bin/bash
-
-#SBATCH --job-name=lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5
-#SBATCH --nodes=1
-#SBATCH --ntasks=1
-#SBATCH --gpus-per-node=1
-#SBATCH --mail-type=END,FAIL
-#SBATCH --output=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/slurm/lerobot_eval_smolpi0_libero_eval10ep_ca_sa2_16vlm_w075_smolvlm2b_lr7e5.out
-###SBATCH --nodelist=jean-zay-a101
-#SBATCH --cpus-per-task=45
-###SBATCH --exclusive
-#SBATCH --time=15:00:00
-#SBATCH --mail-user=mustafa.shukor@isir.upmc.fr
-
-##SBATCH --partition=gpu_p2
-##SBATCH --qos=qos_gpu-t3
-###SBATCH -C v100-32g
-##SBATCH -A dyf@v100
-
-##SBATCH --partition=gpu_p5
-##SBATCH -C a100
-###SBATCH -A dyf@a100
-##SBATCH -A lqm@a100
-##SBATCH --qos=qos_gpu_a100-dev 
-##SBATCH --qos=qos_gpu_a100-t3 
-
-#SBATCH --partition=gpu_p6
-#SBATCH -C h100
-#SBATCH -A lqm@h100
-###SBATCH --qos=qos_gpu_h100-dev 
-#SBATCH --qos=qos_gpu_h100-t3 
-
-###SBATCH --begin=now+2hour
-
-# cd ~/lerobot_pi
-# source ~/.bashrc
-# source activate lerobot
-# export LC_ALL=C
-
-# rm core-*
-export CUDA_VISIBLE_DEVICES=3
-# storage / caches
-RAID=/raid/jade
-export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
-export HF_HOME=$RAID/.cache/huggingface
-export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
-export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
-export WANDB_CACHE_DIR=$RAID/.cache/wandb
-export TMPDIR=$RAID/.cache/tmp
-mkdir -p $TMPDIR
-export WANDB_MODE=offline
-export HF_DATASETS_OFFLINE=1
-# export HF_HUB_OFFLINE=1
-export TOKENIZERS_PARALLELISM=false
-export MUJOCO_GL=egl
-export CUDA_VISIBLE_DEVICES=3
-
-PORT=29512
-
-## then later
-## wandb sync wandb/offline-run-*
-
-
-ENV=libero
-
-# TASK=libero_10
-TASK=libero_spatial
-# TASK=libero_spatial
-# TASK=libero_10
-# TASK=libero_spatial
-
-
-POLICY_NAME=smolpi0
-
-POLICY=smolpi0
-ENV=libero
-
-
-
-
-
-
-CKPT_KEYS_MAPPING=model._orig_mod.//model.
-LOAD_VLM_WEIGHTS=true
-PEFT_METHOD=freeze
-SELF_ATTN_ONLY_ACTIONS=false
-CAUSAL_ATTENTION_ON_HISTORY=false
-
-PREDICT_RELATIVE_ACTIONS=false
-RELATIVE_ACTIONS_MODE=first
-SHUFFLE_CAMERA_POSITIONS=false
-
-VLM_IMG_SIZE=-1
-REGRESSION_LOSS=false
-
-
-# ## Baseline for ablation study
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=max_length
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=max_length
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=false
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=max_length
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=self_attn
-# STATE_TO_PREFIX=false
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=self_attn
-# STATE_TO_PREFIX=false
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2250/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm22b/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_self_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest_explay0_vlml0_causalacttrue/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=self_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs1_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreffalse/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=false
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreffalse_explay0_vlml0_causalacttrue_sa0_smolvlm2500/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=self_attn
-# STATE_TO_PREFIX=false
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=self_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=8
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml8_causalactfalse_sa0/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=24
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml24_causalactfalse_sa0/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=100
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk100/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=30
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk30/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=10
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk10/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=1
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=16
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay16_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=2
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs2/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=3
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4_bs8_steps100000_gpus2_freeze32_onlyexpert_1act_promptfalse_imgtoktrue_nobs3_compiletrue_cross_attn_pref0_gap1_localimgfalse_reverseimgorderfalse_statetopreftrue_toklongest/checkpoints/last/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="observation.state"
-# N_OBS_STEPS=3
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststates/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="observation.state,image"
-# N_OBS_STEPS=3
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs3_paststatesimgs/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=1
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.75/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.25
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.25/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="observation.state,image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="observation.state,image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="observation.state,image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM2-500M-Video-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1250_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-256M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm12b_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm1500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalactfalse_sa2_smolvlm2500_chunk50_nobs1_expw0.5_rep/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2full8_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# PEFT_METHOD=lora
-# PEFT_TARGET_MODEL=text
-# LORA_TARGET_MODULES=q_proj,v_proj,k_proj
-# LORA_R=32
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# PEFT_METHOD=lora
-# PEFT_TARGET_MODEL=text
-# LORA_TARGET_MODULES=q_proj,v_proj,k_proj
-# LORA_R=32
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5_loraqkv/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# PEFT_METHOD=lora
-# PEFT_TARGET_MODEL=text
-# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj
-# LORA_R=32
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# PEFT_METHOD=lora
-# PEFT_TARGET_MODEL=text
-# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj
-# LORA_R=32
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# PEFT_METHOD=lora
-# PEFT_TARGET_MODEL=text
-# LORA_TARGET_MODULES=q_proj,v_proj,k_proj,up_proj,down_proj,gate_proj
-# LORA_R=32
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-6/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2lora32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-5/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_loraqkv/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs16steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtokfalse_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=false
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=true
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saacttrue/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=max_length
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_droptrue_max_length/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=max_length
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=max_length
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_max_length/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9.5e-5bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_saactfalse_dropfalse_longest/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptdroidfull/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv3freeze/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2freeze/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans3true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=self_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_self_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalactfalse_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=self_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=false
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze25_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze50_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze75_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv3freeze100_trans1false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans6true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans4true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans7true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans5true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans2true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans8true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans9true/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_ptcomv1v2full/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.25
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=1
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.25
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw0.25_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=1
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml0_causalacttrue_sa0_smolvlm2500_chunk50_nobs1_expw1_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="observation.state"
-# N_OBS_STEPS=3
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image,observation.state"
-# N_OBS_STEPS=3
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs3_expw0.75_lrvlm1e-4_longest_pt_trans0false/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image,observation.state"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5100000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image,observation.state"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image,observation.state"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0true_decaylr2.5e-630000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-5200000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6200000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr1e-530000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6200000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr5e-6100000/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base 
-
-
-
-# PREDICT_RELATIVE_ACTIONS=true
-# RELATIVE_ACTIONS_MODE=relative
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruerelative/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# PREDICT_RELATIVE_ACTIONS=true
-# RELATIVE_ACTIONS_MODE=first
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relacttruefirst/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvla500base_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLA-500M-Base 
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-5bs8steps100000gpus2freeze32_imgtoktrue_cross_attn_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1statestrue_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_ptcomv1v2freezebs64transv0_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans1true_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# REGRESSION_LOSS=true
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# REGRESSION_LOSS=true
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# REGRESSION_LOSS=true
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-
-# REGRESSION_LOSS=true
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.5
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=0
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr2e-4bs8steps100000gpus2freeze32_cross_attn_vlml0_sa0_smolvlm2500_chunk50_nobs1_expw0.5_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regtrue/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=0
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2500_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-6100000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs8steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm12b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-
-CAUSAL_ATTENTION_ON_HISTORY=true
-SELF_ATTN_ONLY_ACTIONS=false
-EXPERT_WIDTH_MULTIPLIER=0.75
-PAST_OBS_KEYS="image"
-N_OBS_STEPS=1
-NUM_EXPERT_LAYERS=0
-CHUNK_SIZE=50
-NUM_VLM_LAYERS=16
-PAD_LANG_TO=longest
-EVAL_CKPT=/raid/jade/models/smolvlamust
-ADD_IMAGE_TOKENS=true
-ATTN_MODE=cross_attn
-STATE_TO_PREFIX=true
-CAUSAL_ACTION_ATTENTION_MASK=true
-SELF_ATTN_EVERY_N_LAYERS=2
-VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr9e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr8e-5bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm22b_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-2.2B-Instruct
-
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm1250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr1e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr3e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr5e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr7e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr6e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-# CAUSAL_ATTENTION_ON_HISTORY=true
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=$WORK/logs/lerobot/lerobot_physical_intelligence_libero_smolpi0_lr4e-4bs32steps100000gpus2freeze32_cross_attn_vlml16_sa2_smolvlm2250_chunk50_nobs1_expw0.75_lrvlm1e-4_longest_pt_trans0false_decaylr2.5e-630000_relactfalsefirst_camfalse_vim-1_regfalse_compilefalse/checkpoints/best/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM2-256M-Video-Instruct 
-
-
-# # TASK=libero_spatial
-# MULTITASK_EVAL=false
-# N_EPISODES=50
-
-# TASK=libero_spatial,libero_object,libero_goal,libero_10
-MULTITASK_EVAL=true
-# N_EPISODES=5
-N_EPISODES=1
-
-# MAX_PARRALLEL_TASKS=5
-# MAX_PARRALLEL_TASKS=2
-MAX_PARRALLEL_TASKS=1
-
-# NUM_EVALS=2
-# SEEDS=(1000 5000)
-SEEDS=(5000)
-# ACTION_STEPS_LIST=(1 10 30 50)
-ACTION_STEPS_LIST=(1)
-# ACTION_STEPS_LIST=(50)
-TASK_LIST=(libero_spatial libero_object libero_goal libero_10)
-TASK_LIST=(libero_spatial)
-for SEED in "${SEEDS[@]}"; do
-    for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do
-        for TASK in "${TASK_LIST[@]}"; do
-            echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS | EVAL SEED=$SEED"
-            python src/lerobot/scripts/eval.py \
-                --output_dir=/raid/jade/logs/lerobot/tmp \
-                --env.type=$ENV \
-                --env.task=$TASK \
-                --eval.batch_size=$N_EPISODES \
-                --eval.n_episodes=$N_EPISODES \
-                --seed=$SEED \
-                --policy.use_amp=false \
-                --policy.path=$EVAL_CKPT \
-                --policy.n_action_steps=$N_ACTION_STEPS \
-                --policy.checkpoint_path=$EVAL_CKPT \
-                --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \
-                --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
-                --policy.attention_mode=$ATTN_MODE \
-                --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \
-                --policy.state_to_prefix=$STATE_TO_PREFIX \
-                --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
-                --policy.pad_language_to=$PAD_LANG_TO \
-                --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
-                --policy.vlm_model_name=$VLM_NAME \
-                --policy.num_vlm_layers=$NUM_VLM_LAYERS \
-                --policy.chunk_size=$CHUNK_SIZE \
-                --policy.n_obs_steps=$N_OBS_STEPS \
-                --policy.past_obs_keys=$PAST_OBS_KEYS \
-                --policy.num_expert_layers=$NUM_EXPERT_LAYERS \
-                --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
-                --policy.peft_method=$PEFT_METHOD \
-                --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \
-                --policy.causal_attention_on_history=$CAUSAL_ATTENTION_ON_HISTORY \
-                --policy.predict_relative_actions=$PREDICT_RELATIVE_ACTIONS --policy.relative_actions_mode=$RELATIVE_ACTIONS_MODE --policy.shuffle_camera_positions=$SHUFFLE_CAMERA_POSITIONS \
-                --policy.vlm_img_size=$VLM_IMG_SIZE \
-                --policy.regression_loss=$REGRESSION_LOSS 
-                # --policy.peft_config.r=$LORA_R --policy.peft_config.target_modules=$LORA_TARGET_MODULES  --policy.peft_method=$PEFT_METHOD --policy.peft_target_model=$PEFT_TARGET_MODEL 
-
-            echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS | EVAL SEED=$SEED"
-            echo "------------------------------------------------------"
-        done
-    done
-done
-
-
-# ############################################################################################################################################
-# ############################################################################################################################################
-# ############################################################################################################################################
-# ########### Offline eval
-
-
-# # ############################
-# # # Community datasets V1
-# # # REPO_ID=pranavsaroha/so100_legos4,pranavsaroha/so100_onelego2,jpata/so100_pick_place_tangerine,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,HITHY/so100_strawberry,vladfatu/so100_above,koenvanwijk/orange50-1,koenvanwijk/orange50-variation-2,FeiYjf/new_GtoR,CSCSXX/pick_place_cube_1.18,vladfatu/so100_office,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,Ityl/so100_recording2,dragon-95/so100_sorting_2,dragon-95/so100_sorting_3,aractingi/push_cube_offline_data,HITHY/so100_peach3,HITHY/so100_peach4,shreyasgite/so100_legocube_50,shreyasgite/so100_base_env,triton7777/so100_dataset_mix,Deason11/Open_the_drawer_to_place_items,Deason11/PLACE_TAPE_PUSH_DRAWER,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th
-# # # Inconsistent actions dim: Deason11/Open_the_drawer_to_place_items, Deason11/PLACE_TAPE_PUSH_DRAWER
-# # # Filtered datasets
-# # REPO_ID=pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th
-# # SAMPLING_WEIGHTS=
-# # DATASET_NAME=so100_community_v1
-
-
-# # # Community datasets V2
-# # # Inconsistent actions: 1g0rrr/sam_openpi_solder1, 1g0rrr/sam_openpi03, 1g0rrr/sam_openpi_solder2
-# # # Other issues: pierfabre/rabbit bensprenger/right_arm_p_brick_in_box_with_y_noise_v0 pierfabre/horse pierfabre/pig2 pierfabre/pig3 pierfabre/cow2,pierfabre/sheep
-# # # REPO_ID=Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7
-# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7
-# # SAMPLING_WEIGHTS=
-# # DATASET_NAME=so100_community_v2
-
-# # Community datasets V1+V2
-# # REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2,pierfabre/pig3,pierfabre/cow2,pierfabre/sheep,Chojins/chess_game_009_white,sihyun77/suho_3_17_1,sihyun77/sihyun_3_17_2,sihyun77/suho_3_17_3,sihyun77/sihyun_3_17_5,Odog16/so100_cube_drop_pick_v1,sihyun77/sihyun_main_2,sihyun77/suho_main_2,Bartm3/dice2,sihyun77/sihyun_main_3,Loki0929/so100_duck,pietroom/holdthis,pietroom/actualeasytask,Beegbrain/pick_lemon_and_drop_in_bowl,Beegbrain/sweep_tissue_cube,zijian2022/321,gxy1111/so100_pick_place,Odog16/so100_cube_stacking_v1,sihyun77/mond_1,andlyu/so100_indoor_1,andlyu/so100_indoor_3,frk2/so100large,lirislab/sweep_tissue_cube,lirislab/lemon_into_bowl,lirislab/red_cube_into_green_lego_block,lirislab/red_cube_into_blue_cube,00ri/so100_battery,frk2/so100largediffcam,FsqZ/so100_1,ZGGZZG/so100_drop0,Chojins/chess_game_000_white_red,smanni/train_so100_fluffy_box,ganker5/so100_push_20250328,ganker5/so100_dataline_0328,ganker5/so100_color_0328,CrazyYhang/A1234-B-C_mvA2B,RasmusP/so100_Orange2Green,sixpigs1/so100_pick_cube_in_box,ganker5/so100_push_20250331,ganker5/so100_dataline_20250331,lirislab/put_caps_into_teabox,lirislab/close_top_drawer_teabox,lirislab/open_top_drawer_teabox,lirislab/unfold_bottom_right,lirislab/push_cup_target,lirislab/put_banana_bowl,Chojins/chess_game_001_blue_stereo,Chojins/chess_game_001_red_stereo,ganker5/so100_toy_20250402,Gano007/so100_medic,00ri/so100_battery_bin_center,paszea/so100_whale_2,lirislab/fold_bottom_right,lirislab/put_coffee_cap_teabox,therarelab/so100_pick_place_2,paszea/so100_whale_3,paszea/so100_whale_4,paszea/so100_lego,LemonadeDai/so100_coca,zijian2022/backgrounda,zijian2022/backgroundb,356c/so100_nut_sort_1,Mwuqiu/so100_0408_muti,aimihat/so100_tape,lirislab/so100_demo,356c/so100_duck_reposition_1,zijian2022/sort1,weiye11/so100_410_zwy,VoicAndrei/so100_banana_to_plate_only,sixpigs1/so100_stack_cube_error,isadev/bougies3,zijian2022/close3,bensprenger/left_arm_yellow_brick_in_box_v0,lirislab/guess_who_so100,bensprenger/left_arm_yellow_brick_in_box_with_purple_noise_v0,roboticshack/team16-can-stacking,zijian2022/insert2,roboticshack/team-7-right-arm-grasp-tape,Jiangeng/so100_413,roboticshack/team9-pick_cube_place_static_plate,AndrejOrsula/lerobot_double_ball_stacking_random,roboticshack/left-arm-grasp-lego-brick,roboticshack/team-7-left-arm-grasp-motor,roboticshack/team9-pick_chicken_place_plate,roboticshack/team13-two-balls-stacking,tkc79/so100_lego_box_1,roboticshack/team13-three-balls-stacking,pierfabre/chicken,roboticshack/team16-water-pouring,ad330/cubePlace,Jiafei1224/so100_pa222per,paszea/so100_lego_2cam,bensprenger/chess_game_001_blue_stereo,Mohamedal/put_banana,tkc79/so100_lego_box_2,samanthalhy/so100_herding_1,jlesein/TestBoulon7,pranavsaroha/so100_onelego2,pranavsaroha/so100_onelego3,pranavsaroha/so100_carrot_2,vladfatu/so100_above,koenvanwijk/orange50-1,CSCSXX/pick_place_cube_1.18,dragon-95/so100_sorting,dragon-95/so100_sorting_1,nbaron99/so100_pick_and_place4,Beegbrain/pick_place_green_block,dragon-95/so100_sorting_3,HITHY/so100_peach3,shreyasgite/so100_legocube_50,triton7777/so100_dataset_mix,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA,mikechambers/block_cup_14,samsam0510/tooth_extraction_3,samsam0510/tooth_extraction_4,samsam0510/cube_reorientation_2,samsam0510/cube_reorientation_4,samsam0510/glove_reorientation_1,vladfatu/so100_office,pranavsaroha/so100_legos4,Ityl/so100_recording2,FeiYjf/new_GtoR,dragon-95/so100_sorting_2,HITHY/so100_peach4,jpata/so100_pick_place_tangerine,HITHY/so100_strawberry,shreyasgite/so100_base_env,koenvanwijk/orange50-variation-2,pranavsaroha/so100_carrot_5,pandaRQ/pick_med_1,aractingi/push_cube_offline_data,DorayakiLin/so100_pick_charger_on_tissue,zijian2022/noticehuman3,liuhuanjim013/so100_th
-# REPO_ID=pierfabre/rabbit,bensprenger/right_arm_p_brick_in_box_with_y_noise_v0,pierfabre/horse,pierfabre/pig2
-# SAMPLING_WEIGHTS=
-
-# # # Community V3
-# # # issues, yskim2025/unitylerobot (version), cranberrysoft/so100 (don't exist),29  datasets different actions: nguyen-v/so100_rotate_red_button satvikahuja/mixer_on_off_new_1 ...
-# # REPO_ID=satvikahuja/mixer_on_off_new_1,aergogo/so100_pick_place,andy309/so100_0314_fold_cloths,jchun/so100_pickplace_small_20250323_120056,astroyat/cube,Ofiroz91/so_100_cube2bowl,HappyPablo/dec3_data2,ZCM5115/so100_1210,francescocrivelli/orange_feeding,francescocrivelli/carrot_eating,0x00raghu/toffee_red,0x00raghu/toffee_red_2,0x00raghu/toffee_red_3__,0x00raghu/toffee_blue,0x00raghu/toffee_blue_2,0x00raghu/toffee_to_hand_1,0x00raghu/toffee_to_hand_2,liyitenga/so100_bi_hello,liyitenga/so100_bi_giveme5,ZCM5115/so100_2Arm3cameras_movebox,pranavsaroha/so100_carrot_1,pranavsaroha/so100_carrot_3,pranavsaroha/so100_carrot_4,maximilienroberti/so100_lego_red_box,pranavsaroha/so100_squishy,rabhishek100/so100_train_dataset,pranavsaroha/so100_squishy100,swarajgosavi/kikobot_pusht_real_v2,pandaRQ/pickmed,swarajgosavi/act_kikobot_pusht_real,pranavsaroha/so100_squishy2colors,pranavsaroha/so100_squishy2colors_1,Chojins/chess_game_001_white,jmrog/so100_sweet_pick,Chojins/chess_game_002_white,pranavsaroha/so100_squishy2colors_2_new,Chojins/chess_game_003_white,aractingi/pick_place_lego_cube,Chojins/chess_game_004_white,Chojins/chess_game_005_white,Chojins/chess_game_006_white,Chojins/chess_game_007_white,koenvanwijk/blue2,jlitch/so100multicam3,koenvanwijk/blue52,jlitch/so100multicam6,aractingi/pick_place_lego_cube_1,jlitch/so100multicam7,vladfatu/so100_ds,Chojins/chess_game_000_white,HITHY/so100-kiwi,HITHY/so100_peach1,HITHY/so100_redstrawberry,satvikahuja/orange_mixer_1,satvikahuja/mixer_on_off,satvikahuja/orange_pick_place_new1,satvikahuja/mixer_on_off_new,danmac1/real_real332,FeiYjf/Makalu_push,liyitenga/so100_pick_taffy1,chmadran/so100_dataset04,FeiYjf/Maklu_dataset,FeiYjf/new_Dataset,liyitenga/so100_pick_taffy2,satvikahuja/mixer_on_off_new_4,CSCSXX/pick_place_cube_1.17,liyitenga/so100_pick_taffy3,liyitenga/so100_pick_taffy4,yuz1wan/so100_pick_pink,yuz1wan/so100_pick_wahaha,yuz1wan/so100_pp_pink,yuz1wan/so100_pour_cup,liyitenga/so100_pick_taffy5,liyitenga/so100_pick_taffy6,yuz1wan/so100_button,yuz1wan/so100_pickplace,liyitenga/so100_pick_taffy7,FeiYjf/push_gg,FeiYjf/push_0094,swarajgosavi/act_kikobot_block_real,liyitenga/so100_pick_taffy8,phospho-ai/OrangeBrick3Cameras,vaishanthr/toy_pick_place,SeanLMH/so100_picknplace_v2,pepijn223/yellow_lego_in_box1,DimiSch/so100_50ep_2,DimiSch/so100_50ep_3,SeanLMH/so100_picknplace,nbaron99/so100_pick_and_place2,chmadran/so100_dataset08,vaishanthr/toy_pickplace_50ep,Beegbrain/pick_place_green_block_lr,Ityl/so100_recording1,vaishanthr/toy_pickplace,ad330/so100_box_pickPlace,Beegbrain/so100_put_cube_cup,aractingi/push_green_cube_hf,aractingi/push_green_cube_hf_cropped_resized,carpit680/giraffe_task,carpit680/giraffe_sock_demo_1,DimiSch/so100_terra_50_2,carpit680/giraffe_sock_demo_2,aractingi/push_cube_to_face_reward,aractingi/push_cube_to_face_reward_cropped_resized,aractingi/push_cube_reward_data,aractingi/push_cube_reward_data_cropped_resized,aractingi/push_cube_offline_data_cropped_resized,aractingi/push_cube_front_side_reward,aractingi/push_cube_front_side_reward_cropped_resized,aractingi/push_cube_front_side_reward_long,aractingi/push_cube_front_side_reward_long_cropped_resized,aractingi/push_cube_reward,aractingi/push_cube_reward_cropped_resized,aractingi/push_cube_square_reward_cropped_resized,aractingi/push_cube_square_reward_1,aractingi/push_cube_square_reward_1_cropped_resized,aractingi/push_cube_square_light_reward,aractingi/push_cube_square_light_offline_demo,aractingi/push_cube_square_light_offline_demo_cropped_resized,denghj/dataset_red_tape01,aractingi/push_cube_square_offline_demo,aractingi/push_cube_square_offline_demo_cropped_resized,Beegbrain/stack_two_cubes,FeiYjf/Test_NNNN,LegrandFrederic/Orange-brick-lower-resolution,aractingi/pick_place_lego_cube_cropped_resized,aractingi/push_cube_overfit,aractingi/push_cube_overfit_cropped_resized,HITHY/so100_peach,zaringleb/so100_cube_2,andreasBihlmaier/dual_arm_transfer_2025_02_16,zaringleb/so100_cube_4_binary,1g0rrr/reward_pickplace1,1g0rrr/reward_pickplace1_cropped_resized,FeiYjf/Hold_Pieces,FeiYjf/Grab_Pieces,hegdearyandev/so100_eraser_cup_v1,jbraumann/so100_1902,liyitenga/so100_pick_taffy10,mikechambers/block_cup_5,zaringleb/so100_cube_5_linear,yuz1wan/so100_pickplace_0223_2,yuz1wan/so100_pickplace_0223_3,samsam0510/mj_data_temp,samsam0510/tape_insert_1,samsam0510/tape_insert_2,pengjunkun/so100_push_to_hole,Deason11/Random_Kitchen,1g0rrr/reward_dataset_name2,1g0rrr/reward_dataset_name2_cropped_resized,1g0rrr/offline_dataset_name2,1g0rrr/offline_dataset_name2_cropped_resized,aractingi/push_cube_simp_cropped_resized,danielkr452/so100_work6,Loki0929/so100_100,yuz1wan/so100_fold_0227_1,yuz1wan/so100_fold_0227_2,speedyyoshi/so100_grasp_pink_block,lirislab/stack_two_red_cubes,lirislab/red_cube_into_mug,lirislab/green_lego_block_into_mug,lirislab/green_lego_block_into_mug_easy,kevin510/lerobot-cat-toy-placement,NONHUMAN-RESEARCH/SOARM100_TASK_VENDA_BOX,wangjl1512/pour_water,airthebear/so100_GL,zijian2022/noticehuman1,zijian2022/noticehuman2,kantine/so100_kapla_tower6,zijian2022/noticehuman5,zijian2022/llm40,Ashton3/lerobot-aloha,zijian2022/noticehuman50,AaronNewman/screwdriver_task_batch1,AaronNewman/screwdriver_task_batch2,AaronNewman/screwdriver_task_batch3,zijian2022/noticehuman60,zijian2022/noticehuman70,Bartm3/tape_to_bin,liuhuanjim013/so100_th_1,Pi-robot/barbecue_flip,Pi-robot/barbecue_put,wangjl1512/doll,sshh11/so100_orange_50ep_1,sshh11/so100_orange_50ep_2,DorayakiLin/so100_pick_cube_in_box,Bartm3/tape_to_bin2,luke250305/play_dice_250311.1,andy309/so100_0311_1152,sihyun77/suho_so100,sihyun77/si_so100,shreyasgite/so100_base_left,sihyun77/suho_red,liuhuanjim013/so100_block,andy309/so100_0313_no_wrist_camera,zijian2022/l9,zijian2022/n1_2,DorayakiLin/so100_stack_cube,andy309/so100_0313_no_wrist_camera_with_two_arms_cloths,joaoocruz00/so100_makeitD1,zijian2022/l10_1,zijian2022/l10_5,sihyun77/suho_red2,sihyun77/suho_angel,sihyun77/sihyun_king,acrampette/third_arm_01,Winster/so100_cube,1g0rrr/sam_openpi03,thedevansh/mar16_1336,hkphoooey/throw_stuffie,doujiangwang/task1_10epi_100000step,sihyun77/sihyun_3_17_1,acrampette/third_arm_02,imsyed00/so100_yellowbowl_pickplace_1,kumarhans/so100_tape_task,sihyun77/sihyun_main,doujiangwang/task2_10epi_100000step,kantine/industrial_robothon_buttons_expert,kantine/industrial_robothon_buttons_anomaly,kantine/industrial_robothon_hatchAndProbe_expert,kantine/industrial_robothon_hatchAndProbe_anomaly,Odog16/so100_tea_towel_folding_v1,zijian2022/so100_318,zijian2022/so100_318_1,Congying1112/so100_place_blue_bottle_with_two_cameras,Congying1112/so100_place_blue_bottle_with_two_cameras2,Congying1112/so100_place_blue_bottle_with_single_camera,pietroom/first_task_short,kantine/industrial_screws_sorting_expert,kantine/industrial_screws_sorting_anomaly,pietroom/second_task,zijian2022/c0,doujiangwang/task4_10epi_100000step,Congying1112/so100_switch_with_onhand_camera,HYAIYN/so100_get_orange_10epi,doujiangwang/task5_10epi_100000step,1g0rrr/sam_openpi_cube_low10,1g0rrr/sam_openpi_cube_top10,1g0rrr/sam_openpi_wire10,1g0rrr/sam_openpi_solder1,1g0rrr/sam_openpi_solder2,wcode/so100_put_pen_50,jchun/so100_pickplace_small_20250322_193929,bnarin/so100_tic_tac_toe_we_do_it_live,dc2ac/so100-t5,chmadran/so100_home_dataset,baladhurgesh97/so100_final_picking_3,bnarin/so100_tic_tac_toe_move_0_0,bnarin/so100_tic_tac_toe_move_1_0,bnarin/so100_tic_tac_toe_move_2_1,bnarin/so100_tic_tac_toe_move_4_0,zaringleb/so100_cube_6_2d,andlyu/so100_indoor_0,andlyu/so100_indoor_2,Winster/so100_sim,badwolf256/so100_twin_cam_duck,Congying1112/so100_simplepick_with_2_cameras_from_top,andlyu/so100_indoor_4,Zak-Y/so100_grap_dataset,kantine/domotic_pouringCoffee_expert,kantine/domotic_pouringCoffee_anomaly,lucasngoo/so100_strawberry_grape,kantine/domotic_makingCoffee_expert,kantine/domotic_makingCoffee_anomaly,ZGGZZG/so100_drop1,kantine/industrial_soldering_expert,kantine/industrial_soldering_anomaly,Yotofu/so100_sweeper_shoes,kantine/domotic_dishTidyUp_expert,kantine/domotic_dishTidyUp_anomaly,kantine/domotic_groceriesSorting_expert,kantine/domotic_groceriesSorting_anomaly,badwolf256/so100_twin_cam_duck_v2,kantine/domotic_vegetagblesAndFruitsSorting_expert,kantine/domotic_vegetagblesAndFruitsSorting_anomaly,kantine/domotic_setTheTable_expert,kantine/domotic_setTheTable_anomaly,therarelab/so100_pick_place,abhisb/so100_51_ep,andlyu/so100_indoor_val_0,allenchienxxx/so100Test,lizi178119985/so100_jia,badwolf256/so100_twin_cam_duck_v3,andrewcole712/so100_tape_bin_place,Gano007/so100_lolo,Zak-Y/so100_three_cameras_dataset,Gano007/so100_doliprane,XXRRSSRR/so100_v3_num_episodes_50,zijian2022/assemblyarm2,ganker5/so100_action_20250403,andlyu/so100_indoor_val2,Gano007/so100_gano,paszea/so100_whale_grab,paszea/so100_whale,Clementppr/lerobot_pick_and_place_dataset_world_model,andlyu/so100_indoor_10,RasmusP/so100_dataset50ep_a,RasmusP/so100_dataset50ep,Gano007/so100_second,zaringleb/so100_cude_linear_and_2d_comb,dsfsg/grasp_pens,zijian2022/digitalfix,zijian2022/digitalfix2,zijian2022/digitalfix3,T1g3rGE/so100_pickplace_small_20250407_171912,sihyun77/mond_13,abokinala/sputnik_100_11_pick_place_container,dsfsg/bring_bottle,duthvik/sputnik_100_13_pick_place_container,abokinala/sputnik_100_12_pick_place_container,Mwuqiu/so100_0408,AK51/4090_01,356c/so100_rope_reposition_1,paszea/so100_lego_mix,abokinala/sputnik_100_14_pick_place_container,abokinala/sputnik_100_23_pick_place_surface,jiajun001/eraser00_2,jlesein/TestBoulon2,duthvik/sputnik_100_31_pour_liquid,duthvik/sputnik_100_24_pick_place_surface,duthvik/sputnik_100_25_pick_place_surface,duthvik/sputnik_100_17_pick_place_container,duthvik/sputnik_100_26_pick_place_surface,VoicAndrei/so100_banana_to_plate_rebel_full,isadev/bougies1,danaaubakirova/so100_task_1,danaaubakirova/so100_task_2,danaaubakirova/so100_task_3,danaaubakirova/so100_task_4,sixpigs1/so100_pick_cube_in_box_error,sixpigs1/so100_push_cube_error,sixpigs1/so100_pull_cube_error,isadev/bougies2,therarelab/med_dis_rare_6,duthvik/sputnik_100_27_pick_place_surface,zijian2022/closer3,duthvik/sputnik_100_41_custom_tasks,duthvik/sputnik_100_42_custom_tasks,duthvik/sputnik_100_43_custom_tasks,duthvik/sputnik_100_44_custom_tasks,duthvik/sputnik_100_51_kitchen_tasks,duthvik/sputnik_100_52_kitchen_tasks,duthvik/sputnik_100_53_kitchen_tasks,duthvik/sputnik_100_45_custom_tasks,duthvik/sputnik_100_32_pour_liquid,duthvik/sputnik_100_29_pick_place_surface,duthvik/sputnik_100_18_pick_place_container,sixpigs1/so100_pull_cube_by_tool_error,sixpigs1/so100_insert_cylinder_error,abokinala/sputnik_100_54_kitchen_tasks,abokinala/sputnik_100_55_kitchen_tasks,m1b/so100_bluelego,abokinala/sputnik_100_46_custom_tasks,m1b/so100_bluelego_updt,kantine/flip_A0,kantine/flip_A1,kantine/flip_A2,kantine/flip_A3,lirislab/guess_who_no_cond,kantine/flip_A4,kantine/flip_A5,lirislab/guess_who_lighting,nguyen-v/so100_press_red_button,nguyen-v/so100_bimanual_grab_lemon_put_in_box2,pierfabre/cow,nguyen-v/press_red_button_new,nguyen-v/so100_rotate_red_button,raghav-katta-1/lerobot2,Cidoyi/so100_all_notes,roboticshack/team10-red-block,Cidoyi/so100_all_notes_1,roboticshack/team_5-QuiEstCe_everyBox,roboticshack/team11_pianobot,roboticshack/team2-guess_who_so100,roboticshack/team2-guess_who_so100_light,roboticshack/team2-guess_who_so100_edge_case,roboticshack/team2-guess_who_less_ligth,Cidoyi/so100_all_notes_3,dsfsg/grasp_pen_and_bottle,abokinala/sputnik_100_60_kitchen_tasks,abokinala/sputnik_100_58_kitchen_tasks,danaaubakirova/so100_v2_task_1,danaaubakirova/so100_v2_task_2,danaaubakirova/so100_v2_task_3,danaaubakirova/so100_v2_task_4,zijian2022/force1,zijian2022/force2,zijian2022/force3,jiajun001/eraser00_3,zijian2022/bi2,zijian2022/bi1,zijian2022/hand1,Setchii/so100_grab_ball,MossProphet/so100_square-1-2-3.2
-# # SAMPLING_WEIGHTS=
-# # DATASET_NAME=so100_community_v3
-
-# ##########################
-
-# ROBOT=so100
-# export TOKENIZERS_PARALLELISM=false
-# export MUJOCO_GL=egl
-
-
-
-# SAMPLING_WEIGHTS=
-# FEATURES_VERSION=2
-# NUM_IMAGE_TRANSFORMS=10
-# TRAIN_ON_ALL_FEATURES=true
-# NORM_PER_ROBOT=true
-# USE_IMAGENET_STATS=false
-
-# MAX_STATE_DIM=6
-# MAX_ACTION_DIM=6
-# MAX_NUM_IMAGES=3
-# MAX_IMAGE_DIM=256
-
-
-# SEED=5000
-# BATCH_SIZE=32
-# # EVAL_STEPS=1000
-# EVAL_STEPS=100
-
-
-
-
-
-# SELF_ATTN_ONLY_ACTIONS=false
-# EXPERT_WIDTH_MULTIPLIER=0.75
-# PAST_OBS_KEYS="image"
-# N_OBS_STEPS=1
-# NUM_EXPERT_LAYERS=0
-# CHUNK_SIZE=50
-# NUM_VLM_LAYERS=16
-# PAD_LANG_TO=longest
-# EVAL_CKPT=/lustre/fswork/projects/rech/dyf/ugz83ue/logs/lerobot/lerobot_so100_community_v1_v2_smolpi0_lr1e-4bs64steps200000gpus4freeze32_imgtoktrue_cross_attn_gap1_localimgfalse_statetopreftrue_explay0_vlml16_causalacttrue_sa2_smolvlm2500_chunk50_nobs1_expw0.75_feat2_lrvlm1e-4_droptrue_max_length/checkpoints/080000/pretrained_model/
-# ADD_IMAGE_TOKENS=true
-# ATTN_MODE=cross_attn
-# STATE_TO_PREFIX=true
-# CAUSAL_ACTION_ATTENTION_MASK=true
-# SELF_ATTN_EVERY_N_LAYERS=2
-# VLM_NAME=HuggingFaceTB/SmolVLM-500M-Instruct
-
-
-# python lerobot/scripts/offline_inference.py \
-#     --output_dir=$WORK/logs/lerobot/tmp \
-#     --batch_size=$BATCH_SIZE \
-#     --seed=$SEED \
-#     --eval_steps=$EVAL_STEPS \
-#     --use_amp=false \
-#     --device=cuda \
-#     --dataset.repo_id=$REPO_ID --dataset.local_files_only=true --dataset.sampling_weights=$SAMPLING_WEIGHTS --dataset.use_imagenet_stats=$USE_IMAGENET_STATS --policy.normalize_per_robot_type=$NORM_PER_ROBOT \
-#     --dataset.image_transforms.max_num_transforms=$NUM_IMAGE_TRANSFORMS --dataset.image_transforms.enable=true --dataset.train_on_all_features=$TRAIN_ON_ALL_FEATURES \
-#     --dataset.max_action_dim=$MAX_ACTION_DIM --dataset.max_state_dim=$MAX_STATE_DIM --dataset.max_num_images=$MAX_NUM_IMAGES --dataset.max_image_dim=$MAX_IMAGE_DIM --dataset.features_version=$FEATURES_VERSION \
-#     --policy.type=$POLICY \
-#     --policy.checkpoint_path=$EVAL_CKPT \
-#     --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \
-#     --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
-#     --policy.attention_mode=$ATTN_MODE \
-#     --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \
-#     --policy.state_to_prefix=$STATE_TO_PREFIX \
-#     --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
-#     --policy.vlm_model_name=$VLM_NAME \
-#     --policy.pad_language_to=$PAD_LANG_TO \
-#     --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
-#     --policy.num_vlm_layers=$NUM_VLM_LAYERS \
-#     --policy.chunk_size=$CHUNK_SIZE \
-#     --policy.n_obs_steps=$N_OBS_STEPS \
-#     --policy.past_obs_keys=$PAST_OBS_KEYS \
-#     --policy.num_expert_layers=$NUM_EXPERT_LAYERS \
-#     --policy.expert_width_multiplier=$EXPERT_WIDTH_MULTIPLIER \
-#     --policy.peft_method=$PEFT_METHOD \
-#     --policy.self_attn_only_actions=$SELF_ATTN_ONLY_ACTIONS \
-#     --policy.robot_type=$ROBOT 
-
-
-
-
-# MULTITASK_EVAL=true
-# N_EPISODES=5
-# MAX_PARRALLEL_TASKS=1
-# ACTION_STEPS_LIST=(1)
-# TASK_LIST=(libero_10)
-# for N_ACTION_STEPS in "${ACTION_STEPS_LIST[@]}"; do
-#     for TASK in "${TASK_LIST[@]}"; do
-#         echo "$TASK Evaluating: $EVAL_CKPT | N_ACTION_STEPS=$N_ACTION_STEPS"
-#         python lerobot/scripts/eval.py \
-#             --output_dir=$WORK/logs/lerobot/tmp \
-#             --env.type=$ENV \
-#             --env.task=$TASK \
-#             --eval.batch_size=$N_EPISODES \
-#             --eval.n_episodes=$N_EPISODES \
-#             --use_amp=false \
-#             --device=cuda \
-#             --policy.n_action_steps=$N_ACTION_STEPS \
-#             --policy.type=$POLICY \
-#             --policy.checkpoint_path=$EVAL_CKPT \
-#             --policy.checkpoint_keys_mapping=$CKPT_KEYS_MAPPING \
-#             --env.multitask_eval=$MULTITASK_EVAL --env.max_parallel_tasks=$MAX_PARRALLEL_TASKS \
-#             --policy.add_image_special_tokens=$ADD_IMAGE_TOKENS \
-#             --policy.attention_mode=$ATTN_MODE \
-#             --policy.causal_action_attention_mask=$CAUSAL_ACTION_ATTENTION_MASK \
-#             --policy.state_to_prefix=$STATE_TO_PREFIX \
-#             --policy.self_attn_every_n_layers=$SELF_ATTN_EVERY_N_LAYERS \
-#             --policy.vlm_model_name=$VLM_NAME \
-#             --policy.load_vlm_weights=$LOAD_VLM_WEIGHTS \
-# --policy.num_vlm_layers=$NUM_VLM_LAYERS \
-# --policy.chunk_size=$CHUNK_SIZE 
-
-#         echo "Done with: $EVAL_CKPT | Steps=$N_ACTION_STEPS"
-#         echo "------------------------------------------------------"
-#     done
-# done
-
diff --git a/examples/checker.py b/examples/checker.py
deleted file mode 100644
index 12377e9b9..000000000
--- a/examples/checker.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from huggingface_hub import HfApi
-api = HfApi()
-# api.upload_large_folder(
-#     repo_id="HuggingFaceVLA/libero",
-#     repo_type="dataset",
-#     folder_path="/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero",
-# )
-api.upload_large_folder(
-    repo_id="HuggingFaceVLA/metaworld_mt50",
-    repo_type="dataset",
-    folder_path="/raid/jade/.cache/huggingface/lerobot/metaworld_mt50",
-)
-# repo_id="HuggingFaceVLA/libero"
-# # Upload extra files
-# api.upload_file(
-#     repo_id=repo_id,
-#     repo_type="dataset",
-#     path_or_fileobj="/raid/jade/libero_converted/README.md",
-#     path_in_repo="README.md"
-# )
-
-# api.upload_folder(
-#     repo_id=repo_id,
-#     repo_type="dataset",
-#     folder_path="/raid/jade/libero_converted/meta",
-#     path_in_repo="meta"
-# )
diff --git a/examples/checker2.py b/examples/checker2.py
deleted file mode 100644
index a5825d87f..000000000
--- a/examples/checker2.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import pyarrow.parquet as pq
-
-# # First parquet (cached HF version)
-meta1 = pq.read_metadata("/raid/jade/.cache/huggingface/datasets/data/chunk-000/episode_000000.parquet")
-meta1 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet")
-print("First parquet key_value_metadata:")
-print(meta1.metadata)  # low-level file metadata
-# print()
-print("Second")
-# Second parquet (your converted version)
-meta2 = pq.read_metadata("//raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000019.parquet")
-print("\nSecond parquet key_value_metadata:")
-# print(meta2.metadata)
-
-# from datasets import load_dataset
-# root_dir = "/raid/jade/libero_converted"
-
-# # Load all parquet files under the root_dir recursively
-# ds = load_dataset("parquet", data_files=f"{root_dir}/**/*.parquet")
-
-# print(ds)                 # prints split info
-# print(ds["train"].features)  # check schema/features
-
-# # Peek at one row
-# example = ds["train"][0]
-# print(example.keys())
-# print(type(example["observation.images.image"]))
-# print(type(example["observation.images.image2"]))
-
-import pyarrow.parquet as pq
-
-for ep in ["episode_000019.parquet", "episode_000021.parquet", "episode_000026.parquet"]:
-    path = f"/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/{ep}"
-    schema = pq.read_schema(path)
-    print(ep, schema.names)
diff --git a/examples/convert_data.py b/examples/convert_data.py
deleted file mode 100644
index 96ce58cb1..000000000
--- a/examples/convert_data.py
+++ /dev/null
@@ -1,253 +0,0 @@
-#!/usr/bin/env python3
-"""
-Convert local LeRobot datasets from v2.0 to v2.1 format.
-This script adapts the official converter to work with local datasets.
-"""
-
-import sys
-import argparse
-import logging
-from pathlib import Path
-
-# Add lerobot to path
-sys.path.insert(0, '/home/jade_choghari/lerobot/src')
-
-from lerobot.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
-from lerobot.datasets.utils import EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info
-from lerobot.datasets.v21.convert_stats import check_aggregate_stats, convert_stats
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-def convert_local_dataset(
-    dataset_path: str,
-    num_workers: int = 4,
-    skip_if_converted: bool = True
-):
-    """
-    Convert a local dataset from v2.0 to v2.1 format.
-    
-    Args:
-        dataset_path: Path to the local dataset directory
-        num_workers: Number of workers for parallel processing
-        skip_if_converted: Skip if already has episodes_stats.jsonl
-    """
-    dataset_path = Path(dataset_path)
-    
-    print(f"🔄 Converting local dataset: {dataset_path}")
-    
-    # Check if already converted
-    episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl"
-    if episodes_stats_path.exists() and skip_if_converted:
-        # Check if file is empty
-        file_size = episodes_stats_path.stat().st_size
-        if file_size == 0:
-            print(f"  ⚠️  episodes_stats.jsonl is empty, will regenerate")
-        else:
-            # Check if file has content
-            with open(episodes_stats_path, 'r') as f:
-                content = f.read().strip()
-                if not content:
-                    print(f"  ⚠️  episodes_stats.jsonl has no content, will regenerate")
-                else:
-                    print(f"  ⏭️  Already has episodes_stats.jsonl, skipping")
-                    return True
-    
-    try:
-        # Check if this is a v2.0 dataset that needs conversion
-        episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl"
-        stats_path = dataset_path / "meta" / "stats.json"
-        
-        if not episodes_stats_path.exists() and stats_path.exists():
-            print(f"  🔄 Detected v2.0 dataset, creating temporary episodes_stats.jsonl...")
-            # Create empty episodes_stats.jsonl to allow loading
-            episodes_stats_path.touch()
-            created_temp_file = True
-        else:
-            created_temp_file = False
-        
-        # Load dataset from local path with pyav video backend
-        print(f"  📂 Loading dataset from local path...")
-        # Use a dummy repo_id since we're loading locally
-        dummy_repo_id = f"{dataset_path.parent.name}/{dataset_path.name}"
-        dataset = LeRobotDataset(
-            dummy_repo_id, 
-            root=str(dataset_path), 
-            # video_backend="pyav",
-            # local_files_only=True
-        )
-        
-        # Remove temporary file if we created it
-        if created_temp_file and episodes_stats_path.exists() and episodes_stats_path.stat().st_size == 0:
-            episodes_stats_path.unlink()
-            print(f"  🗑️  Removed temporary episodes_stats.jsonl")
-        
-        # Remove existing episodes_stats if present (ensure clean conversion)
-        episodes_stats_path = dataset_path / "meta" / "episodes_stats.jsonl"
-        if episodes_stats_path.exists():
-            episodes_stats_path.unlink()
-            print(f"  🗑️  Removed existing episodes_stats.jsonl")
-        
-        # Check if video directory exists before conversion
-        videos_dir = dataset_path / "videos"
-        if not videos_dir.exists():
-            print(f"  ⚠️  No videos directory found - will skip video statistics")
-        
-        # Convert stats
-        print(f"  📊 Computing episode statistics...")
-        convert_stats(dataset, num_workers=num_workers)
-        
-        # Load reference stats for validation if they exist
-        stats_path = dataset.root / STATS_PATH
-        if stats_path.exists():
-            print(f"  ✅ Validating against reference stats...")
-            try:
-                ref_stats = load_stats(dataset.root)
-                check_aggregate_stats(dataset, ref_stats)
-                print(f"  ✅ Stats validation passed!")
-            except AssertionError as e:
-                print(f"  ⚠️  Stats validation failed with minor differences: {e}")
-                print(f"  ⚠️  This is likely due to floating-point precision, continuing anyway...")
-                # Check if the error is just a small numerical difference
-                if "Max absolute difference:" in str(e) and "Max relative difference:" in str(e):
-                    print(f"  ✅ Treating as acceptable numerical precision difference")
-                else:
-                    raise e
-            
-            # Remove old stats.json file
-            print(f"  🗑️  Removing old stats.json")
-            stats_path.unlink()
-        else:
-            print(f"  ⚠️  No reference stats found, skipping validation")
-        
-        # Update codebase version
-        dataset.meta.info["codebase_version"] = CODEBASE_VERSION
-        write_info(dataset.meta.info, dataset.root)
-        
-        print(f"  ✅ Successfully converted to v2.1")
-        return True
-        
-    except Exception as e:
-        print(f"  ❌ Failed to convert: {e}")
-        logger.exception("Conversion failed")
-        return False
-
-def convert_multiple_datasets(
-    base_dirs: list[str],
-    max_datasets: int = None,
-    num_workers: int = 4
-):
-    """Convert multiple datasets from base directories."""
-    
-    datasets_to_convert = []
-    
-    # Scan for datasets needing conversion
-    for base_dir in base_dirs:
-        base_path = Path(base_dir)
-        if not base_path.exists():
-            print(f"⚠️  Directory not found: {base_dir}")
-            continue
-        
-        print(f"🔍 Scanning: {base_dir}")
-        
-        # Walk through author/dataset structure
-        for author_dir in sorted(base_path.iterdir()):
-            if not author_dir.is_dir():
-                continue
-            
-            for dataset_dir in sorted(author_dir.iterdir()):
-                if not dataset_dir.is_dir():
-                    continue
-                
-                # Check if needs conversion
-                episodes_stats_path = dataset_dir / "meta" / "episodes_stats.jsonl" 
-                info_path = dataset_dir / "meta" / "info.json"
-                
-                needs_conversion = False
-                if info_path.exists():
-                    if not episodes_stats_path.exists():
-                        needs_conversion = True
-                        print(f"  📝 Found (missing): {author_dir.name}/{dataset_dir.name}")
-                    else:
-                        # Check if episodes_stats file is empty
-                        try:
-                            file_size = episodes_stats_path.stat().st_size
-                            if file_size == 0:
-                                needs_conversion = True
-                                print(f"  📝 Found (empty): {author_dir.name}/{dataset_dir.name}")
-                            else:
-                                # Check if file has content
-                                with open(episodes_stats_path, 'r') as f:
-                                    content = f.read().strip()
-                                    if not content:
-                                        needs_conversion = True
-                                        print(f"  📝 Found (no content): {author_dir.name}/{dataset_dir.name}")
-                        except Exception as e:
-                            # If we can't read the file, consider it needs conversion
-                            needs_conversion = True
-                            print(f"  📝 Found (read error): {author_dir.name}/{dataset_dir.name}")
-                
-                if needs_conversion:
-                    datasets_to_convert.append(dataset_dir)
-    
-    if not datasets_to_convert:
-        print("🎉 No datasets need conversion!")
-        return
-    
-    if max_datasets:
-        datasets_to_convert = datasets_to_convert[:max_datasets]
-    
-    print(f"\n🚀 Converting {len(datasets_to_convert)} datasets...")
-    
-    successful = 0
-    failed = 0
-    
-    for i, dataset_path in enumerate(datasets_to_convert, 1):
-        print(f"\n[{i}/{len(datasets_to_convert)}] {dataset_path.parent.name}/{dataset_path.name}")
-        
-        success = convert_local_dataset(dataset_path, num_workers=num_workers)
-        if success:
-            successful += 1
-        else:
-            failed += 1
-    
-    print(f"\n📊 Conversion Summary:")
-    print(f"   ✅ Successful: {successful}")
-    print(f"   ❌ Failed: {failed}")
-    print(f"   📈 Success rate: {successful}/{len(datasets_to_convert)} ({100*successful/len(datasets_to_convert):.1f}%)")
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Convert local LeRobot datasets to v2.1 format")
-    parser.add_argument("--dataset", type=str, help="Single dataset path to convert")
-    parser.add_argument("--base-dirs", nargs="+", 
-                       default=["/fsx/dana_aubakirova/vla/community_dataset_v1"],
-                       help="Base directories to scan for datasets")
-    parser.add_argument("--max-datasets", type=int, help="Maximum number of datasets to convert")
-    parser.add_argument("--num-workers", type=int, default=4, help="Number of workers for stats computation")
-    parser.add_argument("--all", action="store_true", help="Convert all datasets in base directories")
-    
-    args = parser.parse_args()
-    
-    if args.dataset:
-        # Convert single dataset
-        success = convert_local_dataset(args.dataset, num_workers=args.num_workers)
-        if success:
-            print(f"\n🎉 Successfully converted: {args.dataset}")
-        else:
-            print(f"\n💥 Failed to convert: {args.dataset}")
-            sys.exit(1)
-    
-    elif args.all:
-        # Convert all datasets
-        convert_multiple_datasets(
-            args.base_dirs, 
-            max_datasets=args.max_datasets,
-            num_workers=args.num_workers
-        )
-    
-    else:
-        parser.print_help()
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/examples/convert_libero.py b/examples/convert_libero.py
deleted file mode 100644
index 7bfc50eae..000000000
--- a/examples/convert_libero.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import os
-import pyarrow.parquet as pq
-import tempfile
-import shutil
-
-# Root directory of converted data
-root_dir = "/raid/jade/libero_converted"
-
-# No renaming
-rename_map = {
-    
-}
-
-# Hugging Face features metadata (constant across all files)
-HF_METADATA = {
-    b"huggingface": b'{"info": {"features": {"observation.images.image": {"_type": "Image"}, "observation.images.image2": {"_type": "Image"}, "state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, "actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, "timestamp": {"dtype": "float32", "_type": "Value"}, "frame_index": {"dtype": "int64", "_type": "Value"}, "episode_index": {"dtype": "int64", "_type": "Value"}, "index": {"dtype": "int64", "_type": "Value"}, "task_index": {"dtype": "int64", "_type": "Value"}}}}'
-}
-
-def patch_parquet(parquet_path, hf_metadata):
-    try:
-        table = pq.read_table(parquet_path)
-
-        # Merge metadata
-        new_meta = dict(table.schema.metadata or {})
-        new_meta.update(hf_metadata)
-
-        # Apply metadata to table
-        table = table.replace_schema_metadata(new_meta)
-
-        # Write safely via temp file
-        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
-        os.close(tmp_fd)
-        pq.write_table(table, tmp_path)
-        shutil.move(tmp_path, parquet_path)
-
-        print(f"✅ Patched: {parquet_path}")
-        return True
-    except Exception as e:
-        print(f"❌ Failed on {parquet_path}: {e}")
-        return False
-
-# Walk through all chunk dirs and patch parquet files
-for dirpath, _, filenames in os.walk(root_dir):
-    for fname in filenames:
-        if fname.endswith(".parquet"):
-            fpath = os.path.join(dirpath, fname)
-            patch_parquet(fpath, HF_METADATA)#!/usr/bin/env python3
-
-#!/usr/bin/env python3
-import os
-import pyarrow.parquet as pq
-import tempfile
-import shutil
-
-# Explicit list of files to patch
-FILES_TO_PATCH = [
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet",
-]
-
-# Optional renaming map (fill in as needed)
-rename_map = {
-    # "old_column_name": "new_column_name",
-    "image": "observation.images.image",
-    "image2": "observation.images.image2",
-    "actions": "action",
-}
-
-# Hugging Face features metadata (constant across all files)
-HF_METADATA = {
-    b"huggingface": b'{"info": {"features": {'
-    b'"observation.images.image": {"_type": "Image"}, '
-    b'"observation.images.image2": {"_type": "Image"}, '
-    b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, '
-    b'"actions": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, '
-    b'"timestamp": {"dtype": "float32", "_type": "Value"}, '
-    b'"frame_index": {"dtype": "int64", "_type": "Value"}, '
-    b'"episode_index": {"dtype": "int64", "_type": "Value"}, '
-    b'"index": {"dtype": "int64", "_type": "Value"}, '
-    b'"task_index": {"dtype": "int64", "_type": "Value"}}}}'
-}
-
-def patch_parquet(parquet_path, hf_metadata, rename_map):
-    try:
-        # Load parquet table
-        table = pq.read_table(parquet_path)
-
-        # If renaming is needed
-        if rename_map:
-            schema = table.schema
-            new_names = [
-                rename_map.get(name, name) for name in schema.names
-            ]
-            table = table.rename_columns(new_names)
-
-        # Merge schema metadata
-        new_meta = dict(table.schema.metadata or {})
-        new_meta.update(hf_metadata)
-
-        # Replace metadata in table
-        table = table.replace_schema_metadata(new_meta)
-
-        # Write safely via temp file
-        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
-        os.close(tmp_fd)
-        pq.write_table(table, tmp_path)
-
-        # Replace original file
-        shutil.move(tmp_path, parquet_path)
-
-        print(f"✅ Patched: {parquet_path}")
-        return True
-    except Exception as e:
-        print(f"❌ Failed on {parquet_path}: {e}")
-        return False
-
-
-if __name__ == "__main__":
-    for fpath in FILES_TO_PATCH:
-        if os.path.exists(fpath):
-            patch_parquet(fpath, HF_METADATA, rename_map)
-        else:
-            print(f"⚠️ File not found: {fpath}")
diff --git a/examples/evaluate_libero.py b/examples/evaluate_libero.py
deleted file mode 100644
index bf99994b6..000000000
--- a/examples/evaluate_libero.py
+++ /dev/null
@@ -1,255 +0,0 @@
-"""
-This script demonstrates how to evaluate a pretrained smolVLA policy on the LIBERO benchmark.
-"""
-
-import collections
-import dataclasses
-import logging
-import math
-import pathlib
-
-import cv2
-import draccus
-import imageio
-import numpy as np
-import torch
-from libero.libero import benchmark, get_libero_path
-from libero.libero.envs import OffScreenRenderEnv
-from tqdm import tqdm
-
-from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
-from lerobot.policies.pi0.modeling_pi0 import PI0Policy
-
-LIBERO_DUMMY_ACTION = [0.0] * 6 + [-1.0]
-LIBERO_ENV_RESOLUTION = 256  # resolution used to render training data
-
-@dataclasses.dataclass
-class Args:
-    """
-    Evaluation arguments for smolVLA on LIBERO.
-    """
-
-    # --- Hugging Face arguments ---
-    policy_path: str = "lerobot/smolvla_base"
-    """Path to the pretrained policy on the Hugging Face Hub or local directory."""
-
-    # --- LIBERO environment-specific parameters ---
-    task_suite_name: str = "libero_spatial"
-    """Task suite. Options: libero_spatial, libero_object, libero_goal, libero_10, libero_90"""
-    num_steps_wait: int = 10
-    """Number of steps to wait for objects to stabilize in sim."""
-    num_trials_per_task: int = 50
-    """Number of rollouts per task."""
-
-    # --- Evaluation arguments ---
-    video_out_path: str = "data/libero/videos"
-    """Path to save videos."""
-    device: str = "cuda"
-    """Device to use for evaluation."""
-
-    seed: int = 7
-    """Random Seed (for reproducibility)"""
-
-
-@draccus.wrap()
-def eval_libero(args: Args) -> None:
-    # Set random seed
-    torch.manual_seed(args.seed)
-    np.random.seed(args.seed)
-
-    # --- Load Policy ---
-    policy = SmolVLAPolicy.from_pretrained(args.policy_path)
-    policy.to(args.device)
-    policy.eval()
-
-    # --- Initialize LIBERO task suite ---
-    benchmark_dict = benchmark.get_benchmark_dict()
-    try:
-        task_suite = benchmark_dict[args.task_suite_name]()
-    except KeyError:
-        raise ValueError(
-            f"Unknown task suite: {args.task_suite_name}. "
-            f"Available options are: {list(benchmark_dict.keys())}"
-        )
-    num_tasks_in_suite = task_suite.n_tasks
-    logging.info(f"Task suite: {args.task_suite_name}")
-
-    pathlib.Path(args.video_out_path).mkdir(parents=True, exist_ok=True)
-
-    if args.task_suite_name == "libero_spatial":
-        max_steps = 220  # longest training demo has 193 steps
-    elif args.task_suite_name == "libero_object":
-        max_steps = 280  # longest training demo has 254 steps
-    elif args.task_suite_name == "libero_goal":
-        max_steps = 300  # longest training demo has 270 steps
-    elif args.task_suite_name == "libero_10":
-        max_steps = 520  # longest training demo has 505 steps
-    elif args.task_suite_name == "libero_90":
-        max_steps = 400  # longest training demo has 373 steps
-    else:
-        # Fallback for custom task suites
-        max_steps = 520
-
-    # --- Evaluation Loop ---
-    total_episodes, total_successes = 0, 0
-    for task_id in tqdm(range(num_tasks_in_suite), desc="Tasks"):
-        # Get task
-        task = task_suite.get_task(task_id)
-
-        # Get default LIBERO initial states
-        initial_states = task_suite.get_task_init_states(task_id)
-        
-        # Initialize LIBERO environment and task description
-        env, task_description = _get_libero_env(task, LIBERO_ENV_RESOLUTION, args.seed)
-
-        # Start episodes
-        task_episodes, task_successes = 0, 0
-        for episode_idx in tqdm(
-            range(min(args.num_trials_per_task, len(initial_states))),
-            desc=f"Task {task_id}: {task.language}",
-            leave=False,
-        ):
-            logging.info(f"\nTask: {task_description}")
-
-            # Reset environment and policy
-            env.reset()
-            policy.reset()
-
-            # Set initial states
-            obs = env.set_init_state(initial_states[episode_idx])
-
-            # IMPORTANT: Do nothing for the first few timesteps because the simulator drops objects
-            # and we need to wait for them to fall
-            for _ in range(args.num_steps_wait):
-                obs, _, _, _ = env.step(LIBERO_DUMMY_ACTION)
-
-            # Setup
-            t = 0
-            frames = []
-            done = False
-
-            # Add initial frame
-            agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1])
-            # frames.append(agentview_image)
-            # import ipdb; ipdb.set_trace()
-            logging.info(f"Starting episode {task_episodes+1}...")
-            while t < max_steps:
-                try:
-                    # Get preprocessed image
-                    # IMPORTANT: rotate 180 degrees to match train preprocessing
-                    wrist_img = np.ascontiguousarray(obs["robot0_eye_in_hand_image"][::-1, ::-1])
-                    agentview_image = np.ascontiguousarray(obs["agentview_image"][::-1, ::-1])
-                    frames.append(agentview_image)
-
-                    # Prepare observations dict
-                    state = np.concatenate(
-                        (
-                            obs["robot0_eef_pos"],
-                            _quat2axisangle(obs["robot0_eef_quat"]),
-                            obs["robot0_gripper_qpos"],
-                        )
-                    )
-                    observation = {
-                        "observation.images.image": torch.from_numpy(agentview_image / 255.0)
-                        .permute(2, 0, 1)
-                        .to(torch.float32)
-                        .to(args.device).unsqueeze(0),
-                        "observation.images.image2": torch.from_numpy(wrist_img / 255.0)
-                        .permute(2, 0, 1)
-                        .to(torch.float32)
-                        .to(args.device).unsqueeze(0),
-                        "observation.state": torch.from_numpy(state).to(torch.float32).to(args.device).unsqueeze(0),
-                        "task": task_description,
-                    }
-
-                    # Query model to get action
-                    with torch.inference_mode():
-                        action_tensor = policy.select_action(observation)
-                    action = action_tensor.cpu().numpy()[0]
-                    action[-1] = 1 -  action[-1]
-
-                    # Execute action in environment
-                    obs, _, done, _ = env.step(action)
-                    if done:
-                        task_successes += 1
-                        total_successes += 1
-                        break
-                    t += 1
-
-                except Exception as e:
-                    logging.error(f"Caught exception: {e}")
-                    break
-
-            task_episodes += 1
-            total_episodes += 1
-
-            # Save a replay video of the episode
-            suffix = "success" if done else "failure"
-            task_segment = task_description.replace(" ", "_").replace("/", "_")
-            video_path = (
-                pathlib.Path(args.video_out_path) / f"rollout_task_{task_id}_episode_{episode_idx}_{task_segment}_{suffix}.mp4"
-            )
-            fps = 30
-            writer = imageio.get_writer(video_path, fps=fps)
-
-            for image in frames:
-                writer.append_data(image)
-            writer.close()
-            logging.info(f"Saved video to {video_path}")
-
-            # Log current results
-            logging.info(f"Success: {done}")
-            if total_episodes > 0:
-                logging.info(f"# episodes completed so far: {total_episodes}")
-                logging.info(f"# successes: {total_successes} ({total_successes / total_episodes * 100:.1f}%)")
-
-        # Log final results for the task
-        if task_episodes > 0:
-            logging.info(f"Task {task_id} success rate: {float(task_successes) / float(task_episodes):.2f}")
-        if total_episodes > 0:
-            logging.info(f"Cumulative success rate: {float(total_successes) / float(total_episodes):.2f}")
-
-    logging.info("--- Evaluation finished ---")
-    if total_episodes > 0:
-        logging.info(f"Total success rate: {float(total_successes) / float(total_episodes):.2f}")
-    logging.info(f"Total episodes: {total_episodes}")
-    logging.info(f"Total successes: {total_successes}")
-    cv2.destroyAllWindows()
-
-
-def _get_libero_env(task, resolution, seed):
-    """Initializes and returns the LIBERO environment, along with the task description."""
-    task_description = task.language
-    task_bddl_file = pathlib.Path(get_libero_path("bddl_files")) / task.problem_folder / task.bddl_file
-    env_args = {
-        "bddl_file_name": str(task_bddl_file),
-        "camera_heights": resolution,
-        "camera_widths": resolution,
-    }
-    env = OffScreenRenderEnv(**env_args)
-    env.seed(seed)  # IMPORTANT: seed seems to affect object positions even when using fixed initial state
-    return env, task_description
-
-
-def _quat2axisangle(quat):
-    """
-    Copied from robosuite:
-    https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
-    """
-    # clip quaternion
-    if quat[3] > 1.0:
-        quat[3] = 1.0
-    elif quat[3] < -1.0:
-        quat[3] = -1.0
-
-    den = np.sqrt(1.0 - quat[3] * quat[3])
-    if math.isclose(den, 0.0):
-        # This is (close to) a zero degree rotation, immediately return
-        return np.zeros(3)
-
-    return (quat[:3] * 2.0 * math.acos(quat[3])) / den
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    eval_libero()
\ No newline at end of file
diff --git a/examples/requirements.in b/examples/requirements.in
deleted file mode 100644
index 25664608a..000000000
--- a/examples/requirements.in
+++ /dev/null
@@ -1,8 +0,0 @@
-imageio[ffmpeg]
-numpy==1.22.4
-tqdm
-tyro
-PyYaml
-opencv-python==4.6.0.66
-robosuite==1.4.1
-matplotlib==3.5.3
\ No newline at end of file
diff --git a/examples/script2.py b/examples/script2.py
deleted file mode 100644
index cbd4da913..000000000
--- a/examples/script2.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python3
-import os
-import pyarrow.parquet as pq
-import tempfile
-import shutil
-
-FILES_TO_PATCH = [
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000021.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000022.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000023.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000024.parquet",
-    "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data/chunk-000/episode_000025.parquet",
-]
-
-# Column renaming map
-rename_map = {
-    "wrist_image": "observation.images.image2",
-    "actions": "action",
-}
-
-# Hugging Face metadata
-HF_METADATA = {
-    b"huggingface": b'{"info": {"features": {'
-    b'"observation.images.image": {"_type": "Image"}, '
-    b'"observation.images.image2": {"_type": "Image"}, '
-    b'"state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, '
-    b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, '
-    b'"timestamp": {"dtype": "float32", "_type": "Value"}, '
-    b'"frame_index": {"dtype": "int64", "_type": "Value"}, '
-    b'"episode_index": {"dtype": "int64", "_type": "Value"}, '
-    b'"index": {"dtype": "int64", "_type": "Value"}, '
-    b'"task_index": {"dtype": "int64", "_type": "Value"}}}}'
-}
-
-def patch_parquet(parquet_path, hf_metadata, rename_map):
-    try:
-        table = pq.read_table(parquet_path)
-
-        # Apply column renames if needed
-        if rename_map:
-            schema = table.schema
-            new_names = [rename_map.get(name, name) for name in schema.names]
-            table = table.rename_columns(new_names)
-
-        # Merge schema metadata
-        new_meta = dict(table.schema.metadata or {})
-        new_meta.update(hf_metadata)
-
-        # Replace metadata
-        table = table.replace_schema_metadata(new_meta)
-
-        # Write via temp file
-        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
-        os.close(tmp_fd)
-        pq.write_table(table, tmp_path)
-
-        shutil.move(tmp_path, parquet_path)
-        print(f"✅ Patched: {parquet_path}")
-        return True
-    except Exception as e:
-        print(f"❌ Failed on {parquet_path}: {e}")
-        return False
-
-
-if __name__ == "__main__":
-    for fpath in FILES_TO_PATCH:
-        if os.path.exists(fpath):
-            patch_parquet(fpath, HF_METADATA, rename_map)
-        else:
-            print(f"⚠️ File not found: {fpath}")
diff --git a/examples/script3.py b/examples/script3.py
deleted file mode 100644
index 7b4d7957a..000000000
--- a/examples/script3.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python3
-import os
-import pyarrow.parquet as pq
-import tempfile
-import shutil
-
-# Root directory containing all parquet files
-ROOT_DIR = "/raid/jade/.cache/huggingface/lerobot/HuggingFaceVLA/libero/data"
-
-# Column renaming map (normalize schema to what training expects)
-rename_map = {
-    "state": "observation.state",
-}
-
-# Hugging Face metadata (aligned with expected feature names)
-HF_METADATA = {
-    b"huggingface": b'{"info": {"features": {'
-    b'"observation.images.image": {"_type": "Image"}, '
-    b'"observation.images.image2": {"_type": "Image"}, '
-    b'"observation.state": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 8, "_type": "Sequence"}, '
-    b'"action": {"feature": {"dtype": "float32", "_type": "Value"}, "length": 7, "_type": "Sequence"}, '
-    b'"timestamp": {"dtype": "float32", "_type": "Value"}, '
-    b'"frame_index": {"dtype": "int64", "_type": "Value"}, '
-    b'"episode_index": {"dtype": "int64", "_type": "Value"}, '
-    b'"index": {"dtype": "int64", "_type": "Value"}, '
-    b'"task_index": {"dtype": "int64", "_type": "Value"}}}}'
-}
-
-def patch_parquet(parquet_path, hf_metadata, rename_map):
-    try:
-        # Read the parquet table
-        table = pq.read_table(parquet_path)
-
-        # Apply renames if necessary
-        if rename_map:
-            new_names = [rename_map.get(name, name) for name in table.schema.names]
-            if new_names != table.schema.names:
-                table = table.rename_columns(new_names)
-
-        # Update metadata
-        new_meta = dict(table.schema.metadata or {})
-        new_meta.update(hf_metadata)
-        table = table.replace_schema_metadata(new_meta)
-
-        # Write to temp file then atomically move back
-        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
-        os.close(tmp_fd)
-        pq.write_table(table, tmp_path)
-        shutil.move(tmp_path, parquet_path)
-
-        # Debug print
-        print(f"✅ Patched: {parquet_path}")
-        print("   Columns:", table.schema.names)
-        return True
-    except Exception as e:
-        print(f"❌ Failed on {parquet_path}: {e}")
-        return False
-
-if __name__ == "__main__":
-    for dirpath, _, filenames in os.walk(ROOT_DIR):
-        for fname in filenames:
-            if fname.endswith(".parquet"):
-                fpath = os.path.join(dirpath, fname)
-                patch_parquet(fpath, HF_METADATA, rename_map)
diff --git a/examples/script4.py b/examples/script4.py
deleted file mode 100644
index 2eed60a94..000000000
--- a/examples/script4.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from huggingface_hub import HfApi
-hub_api = HfApi()
-hub_api.create_tag("HuggingFaceVLA/libero", tag="v2.1", repo_type="dataset")
diff --git a/log_text.txt b/log_text.txt
deleted file mode 100644
index 6676df0eb..000000000
--- a/log_text.txt
+++ /dev/null
@@ -1,1765 +0,0 @@
-    self.vlm_with_expert = SmolVLMWithExpertModel(
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/smolvlm_with_expert.py", line 88, in __init__
-    self.processor = AutoProcessor.from_pretrained(model_id)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/models/auto/processing
-_auto.py", line 300, in from_pretrained
-    config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/processing_utils.py",
-line 944, in get_processor_dict
-    resolved_raw_chat_template_file = cached_file(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 32
-1, in cached_file
-    file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py", line 47
-8, in cached_files
-    hf_hub_download(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p
-y", line 114, in _inner_fn
-    return fn(*args, **kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
-line 1010, in hf_hub_download
-    return _hf_hub_download_to_cache_dir(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
-line 1073, in _hf_hub_download_to_cache_dir
-    (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_err
-or(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
-line 1546, in _get_metadata_or_catch_error
-    metadata = get_hf_file_metadata(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_validators.p
-y", line 114, in _inner_fn
-    return fn(*args, **kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
-line 1463, in get_hf_file_metadata
-    r = _request_wrapper(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
-line 286, in _request_wrapper
-    response = _request_wrapper(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/file_download.py",
-line 309, in _request_wrapper
-    response = http_backoff(method=method, url=url, **params, retry_on_exceptions=(), retry_on_status_codes=(429,))
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li
-ne 310, in http_backoff
-    response = session.request(method=method, url=url, **kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 589, in
- request
-    resp = self.send(prep, **send_kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/sessions.py", line 703, in
- send
-    r = adapter.send(request, **kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", li
-ne 96, in send
-    return super().send(request, *args, **kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/requests/adapters.py", line 644, in
- send
-    resp = conn.urlopen(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 78
-7, in urlopen
-    response = self._make_request(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connectionpool.py", line 53
-4, in _make_request
-    response = conn.getresponse()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/urllib3/connection.py", line 565, i
-n getresponse
-    httplib_response = super().getresponse()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 1375, in getresponse
-    response.begin()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 318, in begin
-    version, status, reason = self._read_status()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/http/client.py", line 279, in _read_status
-    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/socket.py", line 717, in readinto
-    return self._sock.recv_into(b)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1307, in recv_into
-    return self.read(nbytes, buffer)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/ssl.py", line 1163, in read
-    return self._sslobj.read(len, buffer)
-KeyboardInterrupt
-clea
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 15:50:52 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 15:50:52 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 15:50:52 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 15:50:52 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 15:50:52 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 15:50:52 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 15:50:52 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 67057.8
-5it/s]
-Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5343.9
-4it/s]
-INFO 2025-09-09 15:50:57 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 47393.2
-7it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3797.4
-7it/s]
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44384.1
-7it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 6533.1
-8it/s]
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-09 15:51:30 ts/train.py:168 Creating optimizer and scheduler
-INFO 2025-09-09 15:51:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
-smolvla_lr1e-4bs32steps100000
-INFO 2025-09-09 15:51:30 ts/train.py:182 cfg.env.task='libero_spatial'
-INFO 2025-09-09 15:51:30 ts/train.py:183 cfg.steps=100000 (100K)
-INFO 2025-09-09 15:51:30 ts/train.py:184 dataset.num_frames=273465 (273K)
-INFO 2025-09-09 15:51:30 ts/train.py:185 dataset.num_episodes=1693
-INFO 2025-09-09 15:51:30 ts/train.py:186 num_learnable_params=49103712 (49M)
-INFO 2025-09-09 15:51:30 ts/train.py:187 num_total_params=399268924 (399M)
-INFO 2025-09-09 15:51:30 ts/train.py:225 Start offline training on a fixed dataset
-> /home/jade_choghari/lerobot/src/lerobot/scripts/train.py(230)train()
--> train_tracker.dataloading_s = time.perf_counter() - start_time
-(Pdb) batch.keys()
-dict_keys(['image', 'wrist_image', 'state', 'actions', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_i
-ndex', 'task'])
-(Pdb) policy.config.input_features
-{'image': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type
-=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256))}
-(Pdb) quit()
-Traceback (most recent call last):
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in <module>
-    main()
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main
-    train()
-  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-    response = fn(cfg, *args, **kwargs)
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train
-    train_tracker.dataloading_s = time.perf_counter() - start_time
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 230, in train
-    train_tracker.dataloading_s = time.perf_counter() - start_time
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
-    return self.dispatch_line(frame)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
-    if self.quitting: raise BdbQuit
-bdb.BdbQuit
-clear
-^[[A(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 15:53:49 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 15:53:49 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 15:53:49 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 15:53:49 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 15:53:49 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 15:53:49 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 15:53:49 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 34701.4
-4it/s]
-Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5495.3
-7it/s]
-INFO 2025-09-09 15:53:55 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 41943.0
-4it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5500.7
-3it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2361.6
-6it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5041.2
-3it/s]
-Reducing the number of VLM layers to 16 ...
-> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy()
--> assert isinstance(policy, nn.Module)
-(Pdb) features
-{'image': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type
-=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'actions': PolicyFeature(type=<FeatureType.ACTION: 'ACTION'>,
- shape=(7,))}
-(Pdb) ds_meta.features
-{'image': {'dtype': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'wrist_image': {'dtyp
-e': 'image', 'shape': (256, 256, 3), 'names': ['height', 'width', 'channel']}, 'state': {'dtype': 'float32', 'shape'
-: (8,), 'names': ['state']}, 'actions': {'dtype': 'float32', 'shape': (7,), 'names': ['actions']}, 'timestamp': {'dt
-ype': 'float32', 'shape': (1,), 'names': None}, 'frame_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'ep
-isode_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'index': {'dtype': 'int64', 'shape': (1,), 'names':
-None}, 'task_index': {'dtype': 'int64', 'shape': (1,), 'names': None}}
-(Pdb) quit()
-
-Traceback (most recent call last):
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in <module>
-    main()
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main
-    train()
-  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-    response = fn(cfg, *args, **kwargs)
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train
-    policy = make_policy(
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
-    assert isinstance(policy, nn.Module)
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
-    assert isinstance(policy, nn.Module)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
-    return self.dispatch_line(frame)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
-    if self.quitting: raise BdbQuit
-bdb.BdbQuit
-clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 15:56:35 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 15:56:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 15:56:35 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 15:56:35 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 15:56:35 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 15:56:35 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 15:56:35 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 78132.9
-5it/s]
-Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 4716.0
-3it/s]
-INFO 2025-09-09 15:56:40 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5259.3
-2it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3477.8
-6it/s]
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 45343.8
-3it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 5551.6
-9it/s]
-Reducing the number of VLM layers to 16 ...
-> /home/jade_choghari/lerobot/src/lerobot/policies/factory.py(173)make_policy()
--> assert isinstance(policy, nn.Module)
-(Pdb) features
-{'image': PolicyFeature(type=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'wrist_image': PolicyFeature(type
-=<FeatureType.VISUAL: 'VISUAL'>, shape=(3, 256, 256)), 'state': PolicyFeature(type=<FeatureType.STATE: 'STATE'>, sha
-pe=(8,)), 'actions': PolicyFeature(type=<FeatureType.ACTION: 'ACTION'>, shape=(7,))}
-(Pdb) quit()
-Traceback (most recent call last):
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 343, in <module>
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 339, in main
-
-  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-    response = fn(cfg, *args, **kwargs)
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 164, in train
-    policy = make_policy(
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
-    # policy = torch.compile(policy, mode="reduce-overhead")
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/factory.py", line 173, in make_policy
-    # policy = torch.compile(policy, mode="reduce-overhead")
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
-    return self.dispatch_line(frame)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
-    if self.quitting: raise BdbQuit
-bdb.BdbQuit
-clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 15:58:35 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 15:58:35 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 15:58:35 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 15:58:35 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 15:58:35 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 15:58:35 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 15:58:35 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 27666.4
-6it/s]
-Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5305.7
-0it/s]
-INFO 2025-09-09 15:58:41 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44384.1
-7it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3192.0
-1it/s]
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44620.2
-6it/s]
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 42799.0
-2it/s]
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-09 15:59:13 ts/train.py:168 Creating optimizer and scheduler
-INFO 2025-09-09 15:59:13 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
-smolvla_lr1e-4bs32steps100000
-INFO 2025-09-09 15:59:13 ts/train.py:182 cfg.env.task='libero_spatial'
-INFO 2025-09-09 15:59:13 ts/train.py:183 cfg.steps=100000 (100K)
-INFO 2025-09-09 15:59:13 ts/train.py:184 dataset.num_frames=273465 (273K)
-INFO 2025-09-09 15:59:13 ts/train.py:185 dataset.num_episodes=1693
-INFO 2025-09-09 15:59:13 ts/train.py:186 num_learnable_params=49103712 (49M)
-INFO 2025-09-09 15:59:13 ts/train.py:187 num_total_params=399268940 (399M)
-INFO 2025-09-09 15:59:13 ts/train.py:225 Start offline training on a fixed dataset
-Traceback (most recent call last):
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
-    main()
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
-    train()
-  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-    response = fn(cfg, *args, **kwargs)
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
-    train_tracker, output_dict = update_policy(
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy
-    loss, output_dict = policy.forward(batch)
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 458, in forward
-    actions = self.prepare_action(batch)
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 580, in prepare_action
-    actions = pad_vector(batch[ACTION], self.config.max_action_dim)
-KeyError: 'action'
-Exception in thread Thread-3 (_pin_memory_loop):
-Traceback (most recent call last):
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
-    self.run()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/threading.py", line 953, in run
-    self._target(*self._args, **self._kwargs)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.
-py", line 61, in _pin_memory_loop
-    do_one_step()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.
-py", line 37, in do_one_step
-    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/queues.py", line 122, in get
-    return _ForkingPickler.loads(res)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/multiprocessing/reductions.py
-", line 541, in rebuild_storage_fd
-    fd = df.detach()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in
-detach
-    with _resource_sharer.get_connection(self._id) as conn:
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in
-get_connection
-    c = Client(address, authkey=process.current_process().authkey)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 508, in Clie
-nt
-    answer_challenge(c, authkey)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 752, in answ
-er_challenge
-    message = connection.recv_bytes(256)         # reject large message
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 216, in recv
-_bytes
-    buf = self._recv_bytes(maxlength)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 414, in _rec
-v_bytes
-    buf = self._recv(4)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/multiprocessing/connection.py", line 379, in _rec
-v
-    chunk = read(handle, remaining)
-ConnectionResetError: [Errno 104] Connection reset by peer
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 15:59:53 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 15:59:53 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 15:59:53 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 15:59:53 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 15:59:53 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 15:59:53 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 15:59:53 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|████████████████████████████████████████████████████████| 1693/1693 [00:00<00:00, 72147.3
-3it/s]
-Loading dataset shards: 100%|███████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5076.7
-1it/s]
-INFO 2025-09-09 15:59:58 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 6096.3
-7it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 4348.6
-8it/s]
-Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 46091.2
-5it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3225.1
-5it/s]
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-09 16:00:31 ts/train.py:168 Creating optimizer and scheduler
-INFO 2025-09-09 16:00:31 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
-smolvla_lr1e-4bs32steps100000
-INFO 2025-09-09 16:00:31 ts/train.py:182 cfg.env.task='libero_spatial'
-INFO 2025-09-09 16:00:31 ts/train.py:183 cfg.steps=100000 (100K)
-INFO 2025-09-09 16:00:31 ts/train.py:184 dataset.num_frames=273465 (273K)
-INFO 2025-09-09 16:00:31 ts/train.py:185 dataset.num_episodes=1693
-INFO 2025-09-09 16:00:31 ts/train.py:186 num_learnable_params=49103712 (49M)
-INFO 2025-09-09 16:00:31 ts/train.py:187 num_total_params=399268940 (399M)
-INFO 2025-09-09 16:00:31 ts/train.py:225 Start offline training on a fixed dataset
-Traceback (most recent call last):
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
-    main()
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
-    train()
-  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-    response = fn(cfg, *args, **kwargs)
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
-    train_tracker, output_dict = update_policy(
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 71, in update_policy
-    loss, output_dict = policy.forward(batch)
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 461, in forward
-    losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 850, in forward
-    att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
-  File "/home/jade_choghari/lerobot/src/lerobot/policies/smolvla/modeling_smolvla.py", line 226, in make_att_2d_mask
-s
-    att_2d_masks = att_2d_masks & pad_2d_masks
-RuntimeError: The size of tensor a (199) must match the size of tensor b (181) at non-singleton dimension 2
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 16:10:03 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 16:10:03 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 16:10:03 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 16:10:03 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 16:10:03 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 16:10:03 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 16:10:03 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 54574.89it/s]
-Loading dataset shards: 100%|████████████████████████████████████| 70/70 [00:00<00:00, 7567.63it/s]
-INFO 2025-09-09 16:10:09 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 40721.40it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 7516.67it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 3158.36it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 6775.94it/s]
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-09 16:10:41 ts/train.py:168 Creating optimizer and scheduler
-INFO 2025-09-09 16:10:41 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
-smolvla_lr1e-4bs32steps100000
-INFO 2025-09-09 16:10:41 ts/train.py:182 cfg.env.task='libero_spatial'
-INFO 2025-09-09 16:10:41 ts/train.py:183 cfg.steps=100000 (100K)
-INFO 2025-09-09 16:10:41 ts/train.py:184 dataset.num_frames=273465 (273K)
-INFO 2025-09-09 16:10:41 ts/train.py:185 dataset.num_episodes=1693
-INFO 2025-09-09 16:10:41 ts/train.py:186 num_learnable_params=49103712 (49M)
-INFO 2025-09-09 16:10:41 ts/train.py:187 num_total_params=399268940 (399M)
-INFO 2025-09-09 16:10:41 ts/train.py:225 Start offline training on a fixed dataset
-Traceback (most recent call last):
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
-    main()
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
-    train()
-  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-    response = fn(cfg, *args, **kwargs)
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
-    train_tracker, output_dict = update_policy(
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy
-    grad_scaler.unscale_(optimizer)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342
-, in unscale_
-    optimizer_state["found_inf_per_device"] = self._unscale_grads_(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283
-, in _unscale_grads_
-    torch._amp_foreach_non_finite_check_and_unscale_(
-RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16'
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 16:12:28 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 16:12:28 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 16:12:28 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 16:12:28 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 16:12:28 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 16:12:28 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 16:12:28 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 87666.13it/s]
-Loading dataset shards: 100%|████████████████████████████████████| 70/70 [00:00<00:00, 4223.20it/s]
-INFO 2025-09-09 16:12:34 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 43690.67it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 4871.43it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 6512.89it/s]
-Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 43018.50it/s]
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-09 16:13:06 ts/train.py:168 Creating optimizer and scheduler
-INFO 2025-09-09 16:13:06 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
-smolvla_lr1e-4bs32steps100000
-INFO 2025-09-09 16:13:06 ts/train.py:182 cfg.env.task='libero_spatial'
-INFO 2025-09-09 16:13:06 ts/train.py:183 cfg.steps=100000 (100K)
-INFO 2025-09-09 16:13:06 ts/train.py:184 dataset.num_frames=273465 (273K)
-INFO 2025-09-09 16:13:06 ts/train.py:185 dataset.num_episodes=1693
-INFO 2025-09-09 16:13:06 ts/train.py:186 num_learnable_params=49103712 (49M)
-INFO 2025-09-09 16:13:06 ts/train.py:187 num_total_params=399268940 (399M)
-INFO 2025-09-09 16:13:06 ts/train.py:225 Start offline training on a fixed dataset
-Traceback (most recent call last):
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 342, in <module>
-    main()
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 338, in main
-    train()
-  File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-    response = fn(cfg, *args, **kwargs)
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 235, in train
-    train_tracker, output_dict = update_policy(
-  File "/home/jade_choghari/lerobot/src/lerobot/scripts/train.py", line 76, in update_policy
-    grad_scaler.unscale_(optimizer)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 342
-, in unscale_
-    optimizer_state["found_inf_per_device"] = self._unscale_grads_(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/amp/grad_scaler.py", line 283
-, in _unscale_grads_
-    torch._amp_foreach_non_finite_check_and_unscale_(
-RuntimeError: "_amp_foreach_non_finite_check_and_unscale_cuda" not implemented for 'BFloat16'
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/8_train_smolvla_must.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-INFO 2025-09-09 16:13:51 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-09 16:13:51 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-09 16:13:51 ts/train.py:137 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'physical-intelligence/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.5,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': 0,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': False,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-09 16:13:51 ts/train.py:143 Logs will be saved locally.
-INFO 2025-09-09 16:13:51 ts/train.py:153 Creating dataset
-WARNING 2025-09-09 16:13:51 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-WARNING 2025-09-09 16:13:51 ts/utils.py:302
-The dataset you requested (physical-intelligence/libero) is in 2.0 format.
-While current version of LeRobot is backward-compatible with it, the version of your dataset still uses global
-stats instead of per-episode stats. Update your dataset stats to the new format using this command:
-```
-python -m lerobot.datasets.v21.convert_dataset_v20_to_v21 --repo-id=physical-intelligence/libero
-```
-
-If you encounter a problem, contact LeRobot maintainers on [Discord](https://discord.com/invite/s3KuuzsPFb)
-or open an [issue on GitHub](https://github.com/huggingface/lerobot/issues/new/choose).
-
-Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 82981.28it/s]
-Loading dataset shards: 100%|████████████████████████████████████| 70/70 [00:00<00:00, 4687.94it/s]
-INFO 2025-09-09 16:13:57 ts/train.py:163 Creating policy
-Fetching 2 files: 100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 21345.06it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 4226.00it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 2966.27it/s]
-Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 6497.76it/s]
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-09 16:14:30 ts/train.py:168 Creating optimizer and scheduler
-INFO 2025-09-09 16:14:30 ts/train.py:180 Output dir: /raid/jade/logs/lerobot/lerobot_2_physical-intelligence_libero_
-smolvla_lr1e-4bs32steps100000
-INFO 2025-09-09 16:14:30 ts/train.py:182 cfg.env.task='libero_spatial'
-INFO 2025-09-09 16:14:30 ts/train.py:183 cfg.steps=100000 (100K)
-INFO 2025-09-09 16:14:30 ts/train.py:184 dataset.num_frames=273465 (273K)
-INFO 2025-09-09 16:14:30 ts/train.py:185 dataset.num_episodes=1693
-INFO 2025-09-09 16:14:30 ts/train.py:186 num_learnable_params=49103712 (49M)
-INFO 2025-09-09 16:14:30 ts/train.py:187 num_total_params=399268940 (399M)
-INFO 2025-09-09 16:14:30 ts/train.py:225 Start offline training on a fixed dataset
-INFO 2025-09-09 16:16:20 ts/train.py:255 step:200 smpl:6K ep:40 epch:0.02 loss:1.244 grdn:2.492 lr:1.0e-05 updt_s:0.
-536 data_s:0.007
-INFO 2025-09-09 16:17:56 ts/train.py:255 step:400 smpl:13K ep:79 epch:0.05 loss:0.685 grdn:4.262 lr:3.0e-05 updt_s:0
-.481 data_s:0.000
-INFO 2025-09-09 16:19:33 ts/train.py:255 step:600 smpl:19K ep:119 epch:0.07 loss:0.364 grdn:4.849 lr:5.0e-05 updt_s:
-0.482 data_s:0.000
-INFO 2025-09-09 16:21:10 ts/train.py:255 step:800 smpl:26K ep:158 epch:0.09 loss:0.239 grdn:4.024 lr:7.0e-05 updt_s:
-0.481 data_s:0.000
-INFO 2025-09-09 16:22:46 ts/train.py:255 step:1K smpl:32K ep:198 epch:0.12 loss:0.197 grdn:3.267 lr:9.0e-05 updt_s:0
-.478 data_s:0.000
-INFO 2025-09-09 16:24:22 ts/train.py:255 step:1K smpl:38K ep:238 epch:0.14 loss:0.173 grdn:2.319 lr:1.0e-04 updt_s:0
-.481 data_s:0.000
-INFO 2025-09-09 16:25:59 ts/train.py:255 step:1K smpl:45K ep:277 epch:0.16 loss:0.153 grdn:1.741 lr:1.0e-04 updt_s:0
-.483 data_s:0.000
-INFO 2025-09-09 16:27:36 ts/train.py:255 step:2K smpl:51K ep:317 epch:0.19 loss:0.135 grdn:1.354 lr:9.9e-05 updt_s:0
-.483 data_s:0.000
-INFO 2025-09-09 16:29:14 ts/train.py:255 step:2K smpl:58K ep:357 epch:0.21 loss:0.126 grdn:1.177 lr:9.9e-05 updt_s:0
-.484 data_s:0.000
-
diff --git a/src/lerobot/envs/libero copy.py b/src/lerobot/envs/libero copy.py
deleted file mode 100644
index 83ccd2fb9..000000000
--- a/src/lerobot/envs/libero copy.py	
+++ /dev/null
@@ -1,326 +0,0 @@
-import math
-import os
-from collections import defaultdict
-from collections.abc import Callable
-from itertools import chain
-from typing import Any
-
-import gymnasium as gym
-import numpy as np
-import torch
-from gymnasium import spaces
-from libero.libero import benchmark, get_libero_path
-from libero.libero.envs import OffScreenRenderEnv
-
-
-def create_libero_envs(
-    task: str,
-    n_envs: int,
-    gym_kwargs: dict[str, Any] = None,
-    camera_name: str = "agentview_image,robot0_eye_in_hand_image",
-    init_states: bool = True,
-    env_cls: Callable = None,
-    multitask_eval: bool = True,
-) -> dict[str, dict[str, Any]]:
-    """
-    Here n_envs is per task and equal to the number of rollouts.
-    Returns:
-        dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs.
-    """
-    print("num envs", n_envs)
-    print("multitask_eval", multitask_eval)
-    print("gym_kwargs", gym_kwargs)
-    if gym_kwargs is None:
-        gym_kwargs = {}
-
-    if not multitask_eval:
-        benchmark_dict = benchmark.get_benchmark_dict()
-        task_suite = benchmark_dict[task]()  # can also choose libero_spatial, libero_object, libero_10 etc.
-        tasks_id = list(range(len(task_suite.tasks)))
-        episode_indices = [0 for i in range(len(tasks_id))]
-        if len(tasks_id) == 1:
-            tasks_id = [tasks_id[0] for _ in range(n_envs)]
-            episode_indices = list(range(n_envs))
-        elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0:
-            n_repeat = n_envs // len(tasks_id)
-            print("n_repeat", n_repeat)
-            episode_indices = []
-            for _ in range(len(tasks_id)):
-                episode_indices.extend(list(range(n_repeat)))
-            tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id]))
-        elif n_envs < len(tasks_id):
-            tasks_id = tasks_id[:n_envs]
-            episode_indices = list(range(n_envs))[:n_envs]
-            print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}")
-        print(f"Creating Libero envs with task ids {tasks_id} from suite {task}")
-        assert n_envs == len(tasks_id), (
-            f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}"
-        )
-        return env_cls(
-            [
-                lambda i=i: LiberoEnv(
-                    task_suite=task_suite,
-                    task_id=tasks_id[i],
-                    task_suite_name=task,
-                    camera_name=camera_name,
-                    init_states=init_states,
-                    episode_index=episode_indices[i],
-                    **gym_kwargs,
-                )
-                for i in range(n_envs)
-            ]
-        )
-    else:
-        envs = defaultdict(dict)
-        benchmark_dict = benchmark.get_benchmark_dict()
-        task = task.split(",")
-        for _task in task:
-            task_suite = benchmark_dict[
-                _task
-            ]()  # can also choose libero_spatial, libero_object, libero_10 etc.
-            tasks_ids = list(range(len(task_suite.tasks)))
-            for tasks_id in tasks_ids:
-                episode_indices = list(range(n_envs))
-                print(
-                    f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}"
-                )
-                envs_list = [
-                    (
-                        lambda i=i,
-                        task_suite=task_suite,
-                        tasks_id=tasks_id,
-                        _task=_task,
-                        episode_indices=episode_indices: LiberoEnv(
-                            task_suite=task_suite,
-                            task_id=tasks_id,
-                            task_suite_name=_task,
-                            camera_name=camera_name,
-                            init_states=init_states,
-                            episode_index=episode_indices[i],
-                            **gym_kwargs,
-                        )
-                    )
-                    for i in range(n_envs)
-                ]
-                envs[_task][tasks_id] = env_cls(envs_list)
-        return envs
-
-
-def quat2axisangle(quat):
-    """
-    Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
-
-    Converts quaternion to axis-angle format.
-    Returns a unit vector direction scaled by its angle in radians.
-
-    Args:
-        quat (np.array): (x,y,z,w) vec4 float angles
-
-    Returns:
-        np.array: (ax,ay,az) axis-angle exponential coordinates
-    """
-    # clip quaternion
-    if quat[3] > 1.0:
-        quat[3] = 1.0
-    elif quat[3] < -1.0:
-        quat[3] = -1.0
-
-    den = np.sqrt(1.0 - quat[3] * quat[3])
-    if math.isclose(den, 0.0):
-        # This is (close to) a zero degree rotation, immediately return
-        return np.zeros(3)
-
-    return (quat[:3] * 2.0 * math.acos(quat[3])) / den
-
-
-def get_task_init_states(task_suite, i):
-    init_states_path = os.path.join(
-        get_libero_path("init_states"),
-        task_suite.tasks[i].problem_folder,
-        task_suite.tasks[i].init_states_file,
-    )
-    init_states = torch.load(init_states_path, weights_only=False)  # nosec B614
-    return init_states
-
-
-def get_libero_dummy_action():
-    """Get dummy/no-op action, used to roll out the simulation while the robot does nothing."""
-    return [0, 0, 0, 0, 0, 0, -1]
-
-
-OBS_STATE_DIM = 8
-ACTION_DIM = 7
-
-
-class LiberoEnv(gym.Env):
-    metadata = {"render_modes": ["rgb_array"], "render_fps": 80}
-
-    def __init__(
-        self,
-        task_suite,
-        task_id,
-        task_suite_name,
-        camera_name="agentview_image,robot0_eye_in_hand_image",
-        obs_type="pixels",
-        render_mode="rgb_array",
-        observation_width=256,
-        observation_height=256,
-        visualization_width=640,
-        visualization_height=480,
-        init_states=True,
-        episode_index=0,
-    ):
-        super().__init__()
-        self.task_id = task_id
-        self.obs_type = obs_type
-        self.render_mode = render_mode
-        self.observation_width = observation_width
-        self.observation_height = observation_height
-        self.visualization_width = visualization_width
-        self.visualization_height = visualization_height
-        self.init_states = init_states
-        self.camera_name = camera_name.split(
-            ","
-        )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)
-
-        # Map raw camera names to "image1" and "image2".
-        # The preprocessing step `preprocess_observation` will then prefix these with `.images.*`,
-        # following the LeRobot convention (e.g., `observation.images.image`, `observation.images.image2`).
-        # This ensures the policy consistently receives observations in the
-        # expected format regardless of the original camera naming.
-        self.camera_name_mapping = {
-            "agentview_image": "image",
-            "robot0_eye_in_hand_image": "image2",
-        }
-
-        self.num_steps_wait = (
-            10  # Do nothing for the first few timesteps to wait for the simulator drops objects
-        )
-        self.episode_index = episode_index
-
-        self._env = self._make_envs_task(task_suite, self.task_id)
-        if task_suite_name == "libero_spatial":
-            max_steps = 220  # longest training demo has 193 steps
-        elif task_suite_name == "libero_object":
-            max_steps = 280  # longest training demo has 254 steps
-        elif task_suite_name == "libero_goal":
-            max_steps = 300  # longest training demo has 270 steps
-        elif task_suite_name == "libero_10":
-            max_steps = 520  # longest training demo has 505 steps
-        elif task_suite_name == "libero_90":
-            max_steps = 400  # longest training demo has 373 steps
-        self._max_episode_steps = max_steps
-
-        images = {}
-        for cam in self.camera_name:
-            images[self.camera_name_mapping[cam]] = spaces.Box(
-                low=0,
-                high=255,
-                shape=(self.observation_height, self.observation_width, 3),
-                dtype=np.uint8,
-            )
-
-        if self.obs_type == "state":
-            raise NotImplementedError()
-        elif self.obs_type == "pixels":
-            self.observation_space = spaces.Dict(
-                {
-                    "pixels": spaces.Dict(images),
-                }
-            )
-        elif self.obs_type == "pixels_agent_pos":
-            self.observation_space = spaces.Dict(
-                {
-                    "pixels": spaces.Dict(images),
-                    "agent_pos": spaces.Box(
-                        low=-1000.0,
-                        high=1000.0,
-                        shape=(OBS_STATE_DIM,),
-                        dtype=np.float64,
-                    ),
-                }
-            )
-
-        self.action_space = spaces.Box(low=-1, high=1, shape=(ACTION_DIM,), dtype=np.float32)
-
-    def render(self):
-        raw_obs = self._env.env._get_observations()
-        image = self._format_raw_obs(raw_obs)["pixels"]["image"]
-        return image
-
-    def _make_envs_task(self, task_suite, task_id: int = 0):
-        task = task_suite.get_task(task_id)
-        self.task = task.name
-        self.task_description = task.language
-        task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
-
-        env_args = {
-            "bddl_file_name": task_bddl_file,
-            "camera_heights": self.observation_height,
-            "camera_widths": self.observation_width,
-        }
-        env = OffScreenRenderEnv(**env_args)
-        env.reset()
-        if self.init_states:
-            init_states = get_task_init_states(
-                task_suite, task_id
-            )  # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()?
-            init_state_id = self.episode_index  # episode index
-            env.set_init_state(init_states[init_state_id])
-
-        return env
-
-    def _format_raw_obs(self, raw_obs):
-        images = {}
-        for camera_name in self.camera_name:
-            image = raw_obs[camera_name]
-            image = image[::-1, ::-1]  # rotate 180 degrees
-            images[self.camera_name_mapping[camera_name]] = image
-        state = np.concatenate(
-            (
-                raw_obs["robot0_eef_pos"],
-                quat2axisangle(raw_obs["robot0_eef_quat"]),
-                raw_obs["robot0_gripper_qpos"],
-            )
-        )
-        agent_pos = state
-        if self.obs_type == "state":
-            raise NotImplementedError()
-        elif self.obs_type == "pixels":
-            obs = {"pixels": images.copy()}
-        elif self.obs_type == "pixels_agent_pos":
-            obs = {
-                "pixels": images.copy(),
-                "agent_pos": agent_pos,
-            }
-        return obs
-
-    def reset(self, seed=None, **kwargs):
-        super().reset(seed=seed)
-
-        self._env.seed(seed)
-        raw_obs = self._env.reset()
-        # Do nothing for the first few timesteps to wait for the simulator drops objects
-        for _ in range(self.num_steps_wait):
-            raw_obs, _, _, _ = self._env.step(get_libero_dummy_action())
-        observation = self._format_raw_obs(raw_obs)
-        info = {"is_success": False}
-        return observation, info
-
-    def step(self, action):
-        assert action.ndim == 1
-        raw_obs, reward, done, info = self._env.step(action)
-
-        is_success = self._env.check_success()
-        terminated = done or is_success
-        info["is_success"] = done  # is_success
-
-        observation = self._format_raw_obs(raw_obs)
-        if done:
-            self.reset()
-            print(self.task, self.task_id, done, is_success)
-        truncated = False
-        return observation, reward, terminated, truncated, info
-
-    def close(self):
-        self._env.close()
diff --git a/src/lerobot/envs/libero2.py b/src/lerobot/envs/libero2.py
deleted file mode 100644
index 1e794072c..000000000
--- a/src/lerobot/envs/libero2.py
+++ /dev/null
@@ -1,308 +0,0 @@
-import math
-import os
-from collections import defaultdict
-from itertools import chain
-from typing import Any, Callable
-
-import gymnasium as gym
-import numpy as np
-import torch
-from gymnasium import spaces
-from libero.libero import benchmark, get_libero_path
-from libero.libero.envs import OffScreenRenderEnv
-
-
-OBS_IMAGE = "observation.image"
-OBS_IMAGE_2 = "observation.image2"
-def create_libero_envs(
-    task: str,
-    n_envs: int,
-    gym_kwargs: dict[str, Any] = None,
-    camera_name: str = "agentview_image,robot0_eye_in_hand_image",
-    init_states: bool = True,
-    env_cls: Callable = None,
-    multitask_eval: bool = True,
-) -> dict[str, dict[str, Any]]:
-    """
-    Here n_envs is per task and equal to the number of rollouts.
-    Returns:
-        dict[str, dict[str, list[LiberoEnv]]]: keys are task_suite and values are list of LiberoEnv envs.
-    """
-    if gym_kwargs is None:
-        gym_kwargs = {}
-
-    if not multitask_eval:
-        benchmark_dict = benchmark.get_benchmark_dict()
-        task_suite = benchmark_dict[task]()  # can also choose libero_spatial, libero_object, libero_10 etc.
-        tasks_id = list(range(len(task_suite.tasks)))
-        episode_indices = [0 for i in range(len(tasks_id))]
-        if len(tasks_id) == 1:
-            tasks_id = [tasks_id[0] for _ in range(n_envs)]
-            episode_indices = list(range(n_envs))
-        elif len(tasks_id) < n_envs and n_envs % len(tasks_id) == 0:
-            n_repeat = n_envs // len(tasks_id)
-            episode_indices = []
-            for i in range(len(tasks_id)):
-                episode_indices.extend(list(range(n_repeat)))
-            tasks_id = list(chain.from_iterable([[item] * n_repeat for item in tasks_id]))
-        elif n_envs < len(tasks_id):
-            tasks_id = tasks_id[:n_envs]
-            episode_indices = list(range(n_envs))[:n_envs]
-            print(f"WARNING: n_envs < len(tasks_id), evaluating only on {tasks_id}")
-        print(f"Creating Libero envs with task ids {tasks_id} from suite {task}")
-        assert n_envs == len(
-            tasks_id
-        ), f"len(n_envs) and tasks_id should be the same, got {n_envs} and {len(tasks_id)}"
-        return env_cls(
-            [
-                lambda i=i: LiberoEnv(
-                    task_suite=task_suite,
-                    task_id=tasks_id[i],
-                    task_suite_name=task,
-                    camera_name=camera_name,
-                    init_states=init_states,
-                    episode_index=episode_indices[i],
-                    **gym_kwargs,
-                )
-                for i in range(n_envs)
-            ]
-        )
-    else:
-        envs = defaultdict(dict)
-        benchmark_dict = benchmark.get_benchmark_dict()
-        task = task.split(",")
-        for _task in task:
-            task_suite = benchmark_dict[
-                _task
-            ]()  # can also choose libero_spatial, libero_object, libero_10 etc.
-            tasks_ids = list(range(len(task_suite.tasks)))
-            # tasks_ids = [0] # FIXME(mshukor): debug
-            for tasks_id in tasks_ids:
-                episode_indices = list(range(n_envs))
-                print(
-                    f"Creating Libero envs with task ids {tasks_id} from suite {_task}, episode_indices: {episode_indices}"
-                )
-                envs_list = [
-                    lambda i=i: LiberoEnv(
-                        task_suite=task_suite,
-                        task_id=tasks_id,
-                        task_suite_name=_task,
-                        camera_name=camera_name,
-                        init_states=init_states,
-                        episode_index=episode_indices[i],
-                        **gym_kwargs,
-                    )
-                    for i in range(n_envs)
-                ]
-                envs[_task][tasks_id] = env_cls(envs_list)
-        return envs
-
-
-def quat2axisangle(quat):
-    """
-    Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
-
-    Converts quaternion to axis-angle format.
-    Returns a unit vector direction scaled by its angle in radians.
-
-    Args:
-        quat (np.array): (x,y,z,w) vec4 float angles
-
-    Returns:
-        np.array: (ax,ay,az) axis-angle exponential coordinates
-    """
-    # clip quaternion
-    if quat[3] > 1.0:
-        quat[3] = 1.0
-    elif quat[3] < -1.0:
-        quat[3] = -1.0
-
-    den = np.sqrt(1.0 - quat[3] * quat[3])
-    if math.isclose(den, 0.0):
-        # This is (close to) a zero degree rotation, immediately return
-        return np.zeros(3)
-
-    return (quat[:3] * 2.0 * math.acos(quat[3])) / den
-
-
-def get_task_init_states(task_suite, i):
-    init_states_path = os.path.join(
-        get_libero_path("init_states"),
-        task_suite.tasks[i].problem_folder,
-        task_suite.tasks[i].init_states_file,
-    )
-    init_states = torch.load(init_states_path, weights_only=False)
-    return init_states
-
-
-def get_libero_dummy_action():
-    """Get dummy/no-op action, used to roll out the simulation while the robot does nothing."""
-    return [0, 0, 0, 0, 0, 0, -1]
-
-
-class LiberoEnv(gym.Env):
-    metadata = {"render_modes": ["rgb_array"], "render_fps": 80}
-
-    def __init__(
-        self,
-        task_suite,
-        task_id,
-        task_suite_name,
-        camera_name="agentview_image,robot0_eye_in_hand_image",
-        obs_type="pixels",
-        render_mode="rgb_array",
-        observation_width=256,
-        observation_height=256,
-        visualization_width=640,
-        visualization_height=480,
-        init_states=True,
-        episode_index=0,
-    ):
-        super().__init__()
-        self.task_id = task_id
-        self.obs_type = obs_type
-        self.render_mode = render_mode
-        self.observation_width = observation_width
-        self.observation_height = observation_height
-        self.visualization_width = visualization_width
-        self.visualization_height = visualization_height
-        self.init_states = init_states
-        self.camera_name = camera_name.split(
-            ","
-        )  # agentview_image (main) or robot0_eye_in_hand_image (wrist)
-        self.camera_name_mapping = {
-            "agentview_image": OBS_IMAGE,
-            "robot0_eye_in_hand_image": OBS_IMAGE_2,
-        }
-        self.num_steps_wait = (
-            10  # Do nothing for the first few timesteps to wait for the simulator drops objects
-        )
-        self.episode_index = episode_index
-
-        self._env = self._make_envs_task(task_suite, self.task_id)
-        if task_suite_name == "libero_spatial":
-            max_steps = 220  # longest training demo has 193 steps
-        elif task_suite_name == "libero_object":
-            max_steps = 280  # longest training demo has 254 steps
-        elif task_suite_name == "libero_goal":
-            max_steps = 300  # longest training demo has 270 steps
-        elif task_suite_name == "libero_10":
-            max_steps = 520  # longest training demo has 505 steps
-        elif task_suite_name == "libero_90":
-            max_steps = 400  # longest training demo has 373 steps
-        self._max_episode_steps = max_steps
-
-        images = {}
-        for cam in self.camera_name:
-            images[self.camera_name_mapping[cam]] = spaces.Box(
-                low=0,
-                high=255,
-                shape=(self.observation_height, self.observation_width, 3),
-                dtype=np.uint8,
-            )
-
-        if self.obs_type == "state":
-            raise NotImplementedError()
-        elif self.obs_type == "pixels":
-            self.observation_space = spaces.Dict(
-                {
-                    "pixels": spaces.Dict(images),
-                }
-            )
-        elif self.obs_type == "pixels_agent_pos":
-            self.observation_space = spaces.Dict(
-                {
-                    "pixels": spaces.Dict(images),
-                    "agent_pos": spaces.Box(
-                        low=-1000.0,
-                        high=1000.0,
-                        shape=(8,),
-                        dtype=np.float64,
-                    ),
-                }
-            )
-
-        self.action_space = spaces.Box(low=-1, high=1, shape=(7,), dtype=np.float32)
-
-    def render(self):
-        raw_obs = self._env.env._get_observations()
-        image = self._format_raw_obs(raw_obs)["pixels"][OBS_IMAGE]
-        return image
-
-    def _make_envs_task(self, task_suite, task_id: int = 0):
-        task = task_suite.get_task(task_id)
-        self.task = task.name
-        self.task_description = task.language
-        task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
-
-        env_args = {
-            "bddl_file_name": task_bddl_file,
-            "camera_heights": self.observation_height,
-            "camera_widths": self.observation_width,
-        }
-        env = OffScreenRenderEnv(**env_args)
-        env.reset()
-        if self.init_states:
-            init_states = get_task_init_states(
-                task_suite, task_id
-            )  # for benchmarking purpose, we fix the a set of initial states FIXME(mshukor): should be in the reset()?
-            init_state_id = self.episode_index  # episode index
-            env.set_init_state(init_states[init_state_id])
-
-        return env
-
-    def _format_raw_obs(self, raw_obs):
-        images = {}
-        for camera_name in self.camera_name:
-            image = raw_obs[camera_name]
-            image = image[::-1, ::-1]  # rotate 180 degrees
-            images[self.camera_name_mapping[camera_name]] = image
-        # images = image if len(images) == 1 else images
-        state = np.concatenate(
-            (
-                raw_obs["robot0_eef_pos"],
-                quat2axisangle(raw_obs["robot0_eef_quat"]),
-                raw_obs["robot0_gripper_qpos"],
-            )
-        )
-        agent_pos = state
-        if self.obs_type == "state":
-            raise NotImplementedError()
-        elif self.obs_type == "pixels":
-            obs = {"pixels": images.copy()}
-        elif self.obs_type == "pixels_agent_pos":
-            obs = {
-                "pixels": images.copy(),
-                "agent_pos": agent_pos,
-            }
-        return obs
-
-    def reset(self, seed=None, **kwargs):
-        super().reset(seed=seed)
-
-        self._env.seed(seed)
-        raw_obs = self._env.reset()
-        # Do nothing for the first few timesteps to wait for the simulator drops objects
-        for _ in range(self.num_steps_wait):
-            raw_obs, _, _, _ = self._env.step(get_libero_dummy_action())
-        observation = self._format_raw_obs(raw_obs)
-        info = {"is_success": False}
-        return observation, info
-
-    def step(self, action):
-        assert action.ndim == 1
-        raw_obs, reward, done, info = self._env.step(action)
-
-        is_success = self._env.check_success()
-        terminated = done or is_success
-        info["is_success"] = done  # is_success
-
-        observation = self._format_raw_obs(raw_obs)
-        if done:
-            self.reset()
-            print(self.task, self.task_id, done, is_success)
-        truncated = False
-        return observation, reward, terminated, truncated, info
-
-    def close(self):
-        self._env.close()
diff --git a/tmux_log.txt b/tmux_log.txt
deleted file mode 100644
index 4936578bc..000000000
--- a/tmux_log.txt
+++ /dev/null
@@ -1,2008 +0,0 @@
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-08 13:23:15 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-08 13:23:15 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-08 13:23:15 ccelerate.py:99 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'HuggingFaceVLA/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.75,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': -1,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
- 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
-.
-WARNING 2025-09-08 13:23:15 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
- 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
-.
-INFO 2025-09-08 13:23:15 celerate.py:149 Creating dataset
-Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 35414.48it/s]
-Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 5660.00it/s]
-Resolving data files: 100%|█████████████████████████████████| 1693/1693 [00:00<00:00, 43760.67it/s]
-Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 5629.72it/s]
-c
-INFO 2025-09-08 13:23:22 celerate.py:160 Creating policy
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-c
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
- UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
-ser.
-  warnings.warn(  # warn only once
-[rank1]:[W908 13:23:22.785516795 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 as device used b
-y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
- can pecify device_id in init_process_group() to force use of a particular device.
-Reducing the number of VLM layers to 16 ...
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
- UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
-ser.
-  warnings.warn(  # warn only once
-[rank0]:[W908 13:23:43.028071493 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 as device used b
-y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
- can pecify device_id in init_process_group() to force use of a particular device.
-INFO 2025-09-08 13:23:43 celerate.py:171 Creating optimizer and scheduler
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-08 13:24:04 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla
-_lr1e-4bs32steps100000
-INFO 2025-09-08 13:24:04 celerate.py:213 cfg.env.task='libero_spatial'
-INFO 2025-09-08 13:24:04 celerate.py:214 cfg.steps=100000 (100K)
-INFO 2025-09-08 13:24:04 celerate.py:215 dataset.num_frames=273465 (273K)
-INFO 2025-09-08 13:24:04 celerate.py:216 dataset.num_episodes=1693
-INFO 2025-09-08 13:24:04 celerate.py:217 num_learnable_params=99880992 (100M)
-INFO 2025-09-08 13:24:04 celerate.py:218 num_total_params=450046220 (450M)
-INFO 2025-09-08 13:24:04 celerate.py:219 Number of processes: 2
-INFO 2025-09-08 13:24:04 celerate.py:220 Device: cuda:0
-INFO 2025-09-08 13:24:04 celerate.py:221 Mixed precision: bf16
-INFO 2025-09-08 13:24:04 celerate.py:243 Start offline training on a fixed dataset
-
-bach:  dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp
-', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima
-ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task'])
-> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train()
--> train_tracker, output_dict = update_policy(
-(Pdb)
-bach:  dict_keys(['observation.images.image', 'observation.images.image2', 'observation.state', 'action', 'timestamp
-', 'frame_index', 'episode_index', 'index', 'task_index', 'observation.images.image_is_pad', 'observation.images.ima
-ge2_is_pad', 'observation.state_is_pad', 'action_is_pad', 'task'])
-> /home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py(263)train()
--> train_tracker, output_dict = update_policy(
-(Pdb) batch.keys()[rank0]:[W908 13:24:43.868440913 reducer.cpp:1430] Warning: find_unused_parameters=True was specif
-ied in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra tr
-aversal of the autograd graph every iteration,  which can adversely affect performance. If your model indeed never h
-as any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false
-positive if your model has flow control causing later iterations to have unused parameters. (function operator())
-policy.config.input_features
-*** SyntaxError: invalid syntax
-(Pdb) policy.config.input_features
-*** AttributeError: 'DistributedDataParallel' object has no attribute 'config'
-(Pdb) policy
-DistributedDataParallel(
-  (module): SmolVLAPolicy(
-    (normalize_inputs): Normalize(
-      (buffer_observation_state): ParameterDict(
-          (mean): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)]
-          (std): Parameter containing: [torch.cuda.FloatTensor of size 8 (cuda:1)]
-      )
-    )
-    (normalize_targets): Normalize(
-      (buffer_action): ParameterDict(
-          (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
-          (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
-      )
-    )
-    (unnormalize_outputs): Unnormalize(
-      (buffer_action): ParameterDict(
-          (mean): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
-          (std): Parameter containing: [torch.cuda.FloatTensor of size 7 (cuda:1)]
-      )
-    )
-    (model): VLAFlowMatching(
-      (vlm_with_expert): SmolVLMWithExpertModel(
-        (vlm): SmolVLMForConditionalGeneration(
-          (model): SmolVLMModel(
-            (vision_model): SmolVLMVisionTransformer(
-              (embeddings): SmolVLMVisionEmbeddings(
-                (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
-                (position_embedding): Embedding(1024, 768)
-              )
-              (encoder): SmolVLMEncoder(
-                (layers): ModuleList(
-                  (0-11): 12 x SmolVLMEncoderLayer(
-                    (self_attn): SmolVLMVisionAttention(
-                      (k_proj): Linear(in_features=768, out_features=768, bias=True)
-                      (v_proj): Linear(in_features=768, out_features=768, bias=True)
-                      (q_proj): Linear(in_features=768, out_features=768, bias=True)
-                      (out_proj): Linear(in_features=768, out_features=768, bias=True)
-                    )
-                    (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
-                    (mlp): SmolVLMVisionMLP(
-                      (activation_fn): PytorchGELUTanh()
-                      (fc1): Linear(in_features=768, out_features=3072, bias=True)
-                      (fc2): Linear(in_features=3072, out_features=768, bias=True)
-                    )
-                    (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
-                  )
-                )
-              )
-              (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
-            )
-            (connector): SmolVLMConnector(
-              (modality_projection): SmolVLMSimpleMLP(
-                (proj): Linear(in_features=12288, out_features=960, bias=False)
-              )
-            )
-            (text_model): LlamaModel(
-              (embed_tokens): Embedding(49280, 960, padding_idx=2)
-              (layers): ModuleList(
-                (0-15): 16 x LlamaDecoderLayer(
-                  (self_attn): LlamaAttention(
-                    (q_proj): Linear(in_features=960, out_features=960, bias=False)
-                    (k_proj): Linear(in_features=960, out_features=320, bias=False)
-                    (v_proj): Linear(in_features=960, out_features=320, bias=False)
-                    (o_proj): Linear(in_features=960, out_features=960, bias=False)
-                  )
-                  (mlp): LlamaMLP(
-                    (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
-                    (up_proj): Linear(in_features=960, out_features=2560, bias=False)
-                    (down_proj): Linear(in_features=2560, out_features=960, bias=False)
-                    (act_fn): SiLU()
-                  )
-                  (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
-                  (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
-                )
-              )
-              (norm): LlamaRMSNorm((960,), eps=1e-05)
-              (rotary_emb): LlamaRotaryEmbedding()
-            )
-          )
-          (lm_head): Linear(in_features=960, out_features=49280, bias=False)
-        )
-        (lm_expert): LlamaModel(
-          (embed_tokens): None
-          (layers): ModuleList(
-            (0): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (1): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (2): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (3): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (4): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (5): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (6): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (7): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (8): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (9): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (10): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (11): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (12): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (13): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (14): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=720, out_features=320, bias=False)
-                (v_proj): Linear(in_features=720, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-            (15): LlamaDecoderLayer(
-              (self_attn): LlamaAttention(
-                (q_proj): Linear(in_features=720, out_features=960, bias=False)
-                (k_proj): Linear(in_features=320, out_features=320, bias=False)
-                (v_proj): Linear(in_features=320, out_features=320, bias=False)
-                (o_proj): Linear(in_features=960, out_features=720, bias=False)
-              )
-              (mlp): LlamaMLP(
-                (gate_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (up_proj): Linear(in_features=720, out_features=2048, bias=False)
-                (down_proj): Linear(in_features=2048, out_features=720, bias=False)
-                (act_fn): SiLU()
-              )
-              (input_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-              (post_attention_layernorm): LlamaRMSNorm((720,), eps=1e-05)
-            )
-          )
-          (norm): LlamaRMSNorm((720,), eps=1e-05)
-          (rotary_emb): LlamaRotaryEmbedding()
-        )
-      )
-      (state_proj): Linear(in_features=32, out_features=960, bias=True)
-      (action_in_proj): Linear(in_features=32, out_features=720, bias=True)
-      (action_out_proj): Linear(in_features=720, out_features=32, bias=True)
-      (action_time_mlp_in): Linear(in_features=1440, out_features=720, bias=True)
-      (action_time_mlp_out): Linear(in_features=720, out_features=720, bias=True)
-    )
-  )
-)
-(Pdb) policy.config
-*** AttributeError: 'DistributedDataParallel' object has no attribute 'config'
-(Pdb) policy.input_features
-*** AttributeError: 'DistributedDataParallel' object has no attribute 'input_features'
-(Pdb) quit()
-[rank1]: Traceback (most recent call last):
-[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 368, in <module>
-[rank1]:     train()
-[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/configs/parser.py", line 225, in wrapper_inner
-[rank1]:     response = fn(cfg, *args, **kwargs)
-[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train
-[rank1]:     train_tracker, output_dict = update_policy(
-[rank1]:   File "/home/jade_choghari/lerobot/src/lerobot/scripts/train_accelerate.py", line 263, in train
-[rank1]:     train_tracker, output_dict = update_policy(
-[rank1]:   File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 90, in trace_dispatch
-[rank1]:     return self.dispatch_line(frame)
-[rank1]:   File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/bdb.py", line 115, in dispatch_line
-[rank1]:     if self.quitting: raise BdbQuit
-[rank1]: bdb.BdbQuit
-W0908 13:25:34.274000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 776
-663 closing signal SIGTERM
-E0908 13:25:34.589000 776579 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1
-) local_rank: 1 (pid: 776664) of binary: /home/jade_choghari/miniconda3/envs/lerobot/bin/python
-Traceback (most recent call last):
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/runpy.py", line 86, in _run_code
-    exec(code, run_globals)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
-e 1245, in <module>
-    main()
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
-e 1241, in main
-    launch_command(args)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
-e 1226, in launch_command
-    multi_gpu_launcher(args)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/commands/launch.py", lin
-e 853, in multi_gpu_launcher
-    distrib_run.run(args)
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/run.py", line 883
-, in run
-    elastic_launch(
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py",
- line 139, in __call__
-    return launch_agent(self._config, self._entrypoint, list(args))
-  File "/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/launcher/api.py",
- line 270, in launch_agent
-    raise ChildFailedError(
-torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
-============================================================
-src/lerobot/scripts/train_accelerate.py FAILED
-------------------------------------------------------------
-Failures:
-  <NO_OTHER_FAILURES>
-------------------------------------------------------------
-Root Cause (first observed failure):
-[0]:
-  time      : 2025-09-08_13:25:34
-  host      : hf-dgx-01
-  rank      : 1 (local_rank: 1)
-  exitcode  : 1 (pid: 776664)
-  error_file: <N/A>
-  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
-============================================================
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ bash examples/7_train_acc.sh
-Training dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000
-The following values were not passed to `accelerate launch` and had defaults used instead:
-                More than one GPU was found, enabling multi-GPU training.
-                If this was unintended please pass in `--num_processes=1`.
-        `--dynamo_backend` was set to a value of `'no'`
-To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/accelerate/utils/launch.py:238: UserWarning
-: Port `29522` is already in use. Accelerate will attempt to launch in a standalone-like mode by finding an open por
-t automatically for this session. If this current attempt fails, or for more control in future runs, please specify
-a different port (e.g., `--main_process_port <your_chosen_port>`) or use `--main_process_port 0` for automatic selec
-tion in your launch command or Accelerate config file.
-  warnings.warn(
-INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'HuggingFaceVLA/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.75,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': -1,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-INFO 2025-09-08 13:33:47 ils/utils.py:48 Cuda backend detected, using cuda.
-WARNING 2025-09-08 13:33:47 /policies.py:81 Device 'None' is not available. Switching to 'cuda'.
-INFO 2025-09-08 13:33:47 ccelerate.py:99 {'batch_size': 32,
- 'dataset': {'episodes': None,
-             'image_transforms': {'enable': False,
-                                  'max_num_transforms': 3,
-                                  'random_order': False,
-                                  'tfs': {'brightness': {'kwargs': {'brightness': [0.8,
-                                                                                   1.2]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'contrast': {'kwargs': {'contrast': [0.8,
-                                                                               1.2]},
-                                                       'type': 'ColorJitter',
-                                                       'weight': 1.0},
-                                          'hue': {'kwargs': {'hue': [-0.05,
-                                                                     0.05]},
-                                                  'type': 'ColorJitter',
-                                                  'weight': 1.0},
-                                          'saturation': {'kwargs': {'saturation': [0.5,
-                                                                                   1.5]},
-                                                         'type': 'ColorJitter',
-                                                         'weight': 1.0},
-                                          'sharpness': {'kwargs': {'sharpness': [0.5,
-                                                                                 1.5]},
-                                                        'type': 'SharpnessJitter',
-                                                        'weight': 1.0}}},
-             'repo_id': 'HuggingFaceVLA/libero',
-             'revision': None,
-             'root': '/raid/jade/.cache/huggingface/datasets',
-             'use_imagenet_stats': True,
-             'video_backend': 'torchcodec'},
- 'env': {'camera_name': 'agentview_image,robot0_eye_in_hand_image',
-         'episode_length': 520,
-         'features': {'action': {'shape': [7],
-                                 'type': <FeatureType.ACTION: 'ACTION'>},
-                      'agent_pos': {'shape': [8],
-                                    'type': <FeatureType.STATE: 'STATE'>},
-                      'pixels/agentview_image': {'shape': [360, 360, 3],
-                                                 'type': <FeatureType.VISUAL: 'VISUAL'>},
-                      'pixels/robot0_eye_in_hand_image': {'shape': [360,
-                                                                    360,
-                                                                    3],
-                                                          'type': <FeatureType.VISUAL: 'VISUAL'>}},
-         'features_map': {'action': 'action',
-                          'agent_pos': 'observation.state',
-                          'pixels/agentview_image': 'observation.images.image',
-                          'pixels/robot0_eye_in_hand_image': 'observation.images.image2'},
-         'fps': 30,
-         'init_states': True,
-         'max_parallel_tasks': 5,
-         'multitask_eval': True,
-         'obs_type': 'pixels_agent_pos',
-         'render_mode': 'rgb_array',
-         'task': 'libero_spatial',
-         'type': 'libero'},
- 'eval': {'batch_size': 1, 'n_episodes': 1, 'use_async_envs': False},
- 'eval_freq': 0,
- 'job_name': 'libero_smolvla',
- 'log_freq': 200,
- 'num_workers': 4,
- 'optimizer': {'betas': [0.9, 0.95],
-               'eps': 1e-08,
-               'grad_clip_norm': 10,
-               'lr': 0.0001,
-               'type': 'adamw',
-               'weight_decay': 1e-10},
- 'output_dir': '/raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla_lr1e-4bs32steps100000',
- 'policy': {'adapt_to_pi_aloha': False,
-            'add_image_special_tokens': False,
-            'attention_mode': 'cross_attn',
-            'chunk_size': 50,
-            'device': 'cuda',
-            'empty_cameras': 0,
-            'expert_width_multiplier': 0.75,
-            'freeze_vision_encoder': True,
-            'gradient_accumulation_steps': 1,
-            'input_features': {},
-            'license': None,
-            'load_vlm_weights': False,
-            'max_action_dim': 32,
-            'max_period': 4.0,
-            'max_state_dim': 32,
-            'min_period': 0.004,
-            'n_action_steps': 1,
-            'n_obs_steps': 1,
-            'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
-                                      'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
-            'num_expert_layers': -1,
-            'num_steps': 10,
-            'num_vlm_layers': 16,
-            'optimizer_betas': [0.9, 0.95],
-            'optimizer_eps': 1e-08,
-            'optimizer_grad_clip_norm': 10,
-            'optimizer_lr': 0.0001,
-            'optimizer_weight_decay': 1e-10,
-            'output_features': {},
-            'pad_language_to': 'longest',
-            'prefix_length': -1,
-            'private': None,
-            'push_to_hub': True,
-            'repo_id': 'None',
-            'resize_imgs_with_padding': [512, 512],
-            'scheduler_decay_lr': 2.5e-06,
-            'scheduler_decay_steps': 30000,
-            'scheduler_warmup_steps': 1000,
-            'self_attn_every_n_layers': 2,
-            'tags': None,
-            'tokenizer_max_length': 48,
-            'train_expert_only': True,
-            'train_state_proj': True,
-            'type': 'smolvla',
-            'use_amp': True,
-            'use_cache': True,
-            'use_delta_joint_actions_aloha': False,
-            'vlm_model_name': 'HuggingFaceTB/SmolVLM2-500M-Instruct'},
- 'resume': False,
- 'save_checkpoint': True,
- 'save_freq': 20000,
- 'scheduler': {'decay_lr': 2.5e-06,
-               'num_decay_steps': 30000,
-               'num_warmup_steps': 1000,
-               'peak_lr': 0.0001,
-               'type': 'cosine_decay_with_warmup'},
- 'seed': 1000,
- 'steps': 100000,
- 'use_policy_training_preset': True,
- 'wandb': {'disable_artifact': False,
-           'enable': False,
-           'entity': None,
-           'mode': None,
-           'notes': None,
-           'project': 'lerobot',
-           'run_id': None}}
-WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
- 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
-.
-WARNING 2025-09-08 13:33:47 ls/other.py:512 Detected kernel version 5.4.0, which is below the recommended minimum of
- 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher
-.
-INFO 2025-09-08 13:33:47 celerate.py:149 Creating dataset
-Resolving data files: 100%|████████████████████████████████| 1693/1693 [00:00<00:00, 103295.66it/s]
-Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 5229.81it/s]
-Resolving data files: 100%|████████████████████████████████| 1693/1693 [00:00<00:00, 360601.09it/s]
-Loading dataset shards: 100%|████████████████████████████████████| 69/69 [00:00<00:00, 4881.54it/s]
-c
-INFO 2025-09-08 13:33:53 celerate.py:160 Creating policy
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-c
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
- UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
-ser.
-  warnings.warn(  # warn only once
-[rank1]:[W908 13:33:54.613597516 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 as device used b
-y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
- can pecify device_id in init_process_group() to force use of a particular device.
-Reducing the number of VLM layers to 16 ...
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
- UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
-ser.
-  warnings.warn(  # warn only once
-[rank0]:[W908 13:34:15.806448425 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 as device used b
-y this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You
- can pecify device_id in init_process_group() to force use of a particular device.
-INFO 2025-09-08 13:34:15 celerate.py:171 Creating optimizer and scheduler
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/transformers/utils/hub.py:111: FutureWarnin
-g: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
-  warnings.warn(
-Reducing the number of VLM layers to 16 ...
-INFO 2025-09-08 13:34:36 celerate.py:211 Output dir: /raid/jade/logs/lerobot/lerobot_2_HuggingFaceVLA_libero_smolvla
-_lr1e-4bs32steps100000
-INFO 2025-09-08 13:34:36 celerate.py:213 cfg.env.task='libero_spatial'
-INFO 2025-09-08 13:34:36 celerate.py:214 cfg.steps=100000 (100K)
-INFO 2025-09-08 13:34:36 celerate.py:215 dataset.num_frames=273465 (273K)
-INFO 2025-09-08 13:34:36 celerate.py:216 dataset.num_episodes=1693
-INFO 2025-09-08 13:34:36 celerate.py:217 num_learnable_params=99880992 (100M)
-INFO 2025-09-08 13:34:36 celerate.py:218 num_total_params=450046220 (450M)
-INFO 2025-09-08 13:34:36 celerate.py:219 Number of processes: 2
-INFO 2025-09-08 13:34:36 celerate.py:220 Device: cuda:0
-INFO 2025-09-08 13:34:36 celerate.py:221 Mixed precision: bf16
-INFO 2025-09-08 13:34:36 celerate.py:243 Start offline training on a fixed dataset
-[rank1]:[W908 13:34:39.454560620 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru
-ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut
-ograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused para
-meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m
-odel has flow control causing later iterations to have unused parameters. (function operator())
-[rank0]:[W908 13:34:40.502702504 reducer.cpp:1430] Warning: find_unused_parameters=True was specified in DDP constru
-ctor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the aut
-ograd graph every iteration,  which can adversely affect performance. If your model indeed never has any unused para
-meters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your m
-odel has flow control causing later iterations to have unused parameters. (function operator())
-INFO 2025-09-08 13:36:23 celerate.py:281 step:200 smpl:13K ep:79 epch:0.05 loss:0.963 grdn:2.699 lr:2.0e-05 updt_s:0
-.506 data_s:0.027
-INFO 2025-09-08 13:38:09 celerate.py:281 step:400 smpl:26K ep:158 epch:0.09 loss:0.389 grdn:3.127 lr:6.0e-05 updt_s:
-0.525 data_s:0.003
-INFO 2025-09-08 13:39:53 celerate.py:281 step:600 smpl:38K ep:238 epch:0.14 loss:0.261 grdn:2.618 lr:9.5e-05 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 13:41:37 celerate.py:281 step:800 smpl:51K ep:317 epch:0.19 loss:0.231 grdn:1.684 lr:9.9e-05 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 13:43:21 celerate.py:281 step:1K smpl:64K ep:396 epch:0.23 loss:0.211 grdn:1.258 lr:9.9e-05 updt_s:0
-.514 data_s:0.003
-INFO 2025-09-08 13:45:05 celerate.py:281 step:1K smpl:77K ep:475 epch:0.28 loss:0.198 grdn:1.032 lr:9.9e-05 updt_s:0
-.517 data_s:0.003
-INFO 2025-09-08 13:46:49 celerate.py:281 step:1K smpl:90K ep:555 epch:0.33 loss:0.182 grdn:0.880 lr:9.8e-05 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 13:48:33 celerate.py:281 step:2K smpl:102K ep:634 epch:0.37 loss:0.167 grdn:0.744 lr:9.8e-05 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 13:50:17 celerate.py:281 step:2K smpl:115K ep:713 epch:0.42 loss:0.157 grdn:0.680 lr:9.7e-05 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 13:52:01 celerate.py:281 step:2K smpl:128K ep:792 epch:0.47 loss:0.147 grdn:0.612 lr:9.6e-05 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 13:53:44 celerate.py:281 step:2K smpl:141K ep:872 epch:0.51 loss:0.142 grdn:0.576 lr:9.5e-05 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-08 13:55:27 celerate.py:281 step:2K smpl:154K ep:951 epch:0.56 loss:0.136 grdn:0.523 lr:9.4e-05 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 13:57:11 celerate.py:281 step:3K smpl:166K ep:1K epch:0.61 loss:0.132 grdn:0.509 lr:9.3e-05 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 13:58:57 celerate.py:281 step:3K smpl:179K ep:1K epch:0.66 loss:0.126 grdn:0.492 lr:9.2e-05 updt_s:0
-.525 data_s:0.003
-INFO 2025-09-08 14:00:43 celerate.py:281 step:3K smpl:192K ep:1K epch:0.70 loss:0.124 grdn:0.467 lr:9.1e-05 updt_s:0
-.525 data_s:0.003
-INFO 2025-09-08 14:02:26 celerate.py:281 step:3K smpl:205K ep:1K epch:0.75 loss:0.119 grdn:0.438 lr:9.0e-05 updt_s:0
-.508 data_s:0.003
-INFO 2025-09-08 14:04:27 celerate.py:281 step:3K smpl:218K ep:1K epch:0.80 loss:0.118 grdn:0.426 lr:8.9e-05 updt_s:0
-.564 data_s:0.039
-INFO 2025-09-08 14:06:10 celerate.py:281 step:4K smpl:230K ep:1K epch:0.84 loss:0.116 grdn:0.422 lr:8.7e-05 updt_s:0
-.511 data_s:0.004
-INFO 2025-09-08 14:07:55 celerate.py:281 step:4K smpl:243K ep:2K epch:0.89 loss:0.113 grdn:0.395 lr:8.6e-05 updt_s:0
-.517 data_s:0.003
-INFO 2025-09-08 14:09:38 celerate.py:281 step:4K smpl:256K ep:2K epch:0.94 loss:0.111 grdn:0.401 lr:8.5e-05 updt_s:0
-.511 data_s:0.003
-INFO 2025-09-08 14:11:21 celerate.py:281 step:4K smpl:269K ep:2K epch:0.98 loss:0.110 grdn:0.380 lr:8.3e-05 updt_s:0
-.511 data_s:0.003
-INFO 2025-09-08 14:13:08 celerate.py:281 step:4K smpl:282K ep:2K epch:1.03 loss:0.109 grdn:0.381 lr:8.2e-05 updt_s:0
-.413 data_s:0.119
-INFO 2025-09-08 14:14:52 celerate.py:281 step:5K smpl:294K ep:2K epch:1.08 loss:0.107 grdn:0.387 lr:8.0e-05 updt_s:0
-.373 data_s:0.146
-INFO 2025-09-08 14:16:36 celerate.py:281 step:5K smpl:307K ep:2K epch:1.12 loss:0.107 grdn:0.366 lr:7.8e-05 updt_s:0
-.446 data_s:0.072
-INFO 2025-09-08 14:18:19 celerate.py:281 step:5K smpl:320K ep:2K epch:1.17 loss:0.105 grdn:0.347 lr:7.6e-05 updt_s:0
-.468 data_s:0.045
-INFO 2025-09-08 14:20:01 celerate.py:281 step:5K smpl:333K ep:2K epch:1.22 loss:0.103 grdn:0.350 lr:7.5e-05 updt_s:0
-.510 data_s:0.003
-INFO 2025-09-08 14:21:46 celerate.py:281 step:5K smpl:346K ep:2K epch:1.26 loss:0.101 grdn:0.336 lr:7.3e-05 updt_s:0
-.512 data_s:0.011
-INFO 2025-09-08 14:23:30 celerate.py:281 step:6K smpl:358K ep:2K epch:1.31 loss:0.102 grdn:0.345 lr:7.1e-05 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 14:25:15 celerate.py:281 step:6K smpl:371K ep:2K epch:1.36 loss:0.100 grdn:0.333 lr:6.9e-05 updt_s:0
-.521 data_s:0.003
-INFO 2025-09-08 14:26:59 celerate.py:281 step:6K smpl:384K ep:2K epch:1.40 loss:0.100 grdn:0.328 lr:6.7e-05 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 14:28:43 celerate.py:281 step:6K smpl:397K ep:2K epch:1.45 loss:0.099 grdn:0.319 lr:6.5e-05 updt_s:0
-.512 data_s:0.003
-INFO 2025-09-08 14:30:26 celerate.py:281 step:6K smpl:410K ep:3K epch:1.50 loss:0.098 grdn:0.313 lr:6.3e-05 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 14:32:11 celerate.py:281 step:7K smpl:422K ep:3K epch:1.54 loss:0.097 grdn:0.319 lr:6.1e-05 updt_s:0
-.519 data_s:0.004
-INFO 2025-09-08 14:33:55 celerate.py:281 step:7K smpl:435K ep:3K epch:1.59 loss:0.097 grdn:0.312 lr:5.9e-05 updt_s:0
-.506 data_s:0.010
-INFO 2025-09-08 14:35:39 celerate.py:281 step:7K smpl:448K ep:3K epch:1.64 loss:0.097 grdn:0.307 lr:5.7e-05 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 14:37:23 celerate.py:281 step:7K smpl:461K ep:3K epch:1.69 loss:0.095 grdn:0.294 lr:5.5e-05 updt_s:0
-.518 data_s:0.003
-INFO 2025-09-08 14:39:07 celerate.py:281 step:7K smpl:474K ep:3K epch:1.73 loss:0.095 grdn:0.299 lr:5.3e-05 updt_s:0
-.507 data_s:0.007
-INFO 2025-09-08 14:40:52 celerate.py:281 step:8K smpl:486K ep:3K epch:1.78 loss:0.094 grdn:0.283 lr:5.1e-05 updt_s:0
-.523 data_s:0.003
-INFO 2025-09-08 14:42:36 celerate.py:281 step:8K smpl:499K ep:3K epch:1.83 loss:0.093 grdn:0.284 lr:4.9e-05 updt_s:0
-.517 data_s:0.003
-INFO 2025-09-08 14:44:22 celerate.py:281 step:8K smpl:512K ep:3K epch:1.87 loss:0.092 grdn:0.284 lr:4.7e-05 updt_s:0
-.465 data_s:0.060
-INFO 2025-09-08 14:46:06 celerate.py:281 step:8K smpl:525K ep:3K epch:1.92 loss:0.093 grdn:0.292 lr:4.5e-05 updt_s:0
-.456 data_s:0.066
-INFO 2025-09-08 14:47:49 celerate.py:281 step:8K smpl:538K ep:3K epch:1.97 loss:0.093 grdn:0.290 lr:4.3e-05 updt_s:0
-.510 data_s:0.003
-INFO 2025-09-08 14:49:37 celerate.py:281 step:9K smpl:550K ep:3K epch:2.01 loss:0.092 grdn:0.283 lr:4.1e-05 updt_s:0
-.419 data_s:0.117
-INFO 2025-09-08 14:51:20 celerate.py:281 step:9K smpl:563K ep:3K epch:2.06 loss:0.092 grdn:0.275 lr:3.9e-05 updt_s:0
-.463 data_s:0.053
-INFO 2025-09-08 14:53:05 celerate.py:281 step:9K smpl:576K ep:4K epch:2.11 loss:0.090 grdn:0.272 lr:3.7e-05 updt_s:0
-.517 data_s:0.003
-INFO 2025-09-08 14:54:49 celerate.py:281 step:9K smpl:589K ep:4K epch:2.15 loss:0.090 grdn:0.268 lr:3.5e-05 updt_s:0
-.506 data_s:0.013
-INFO 2025-09-08 14:56:32 celerate.py:281 step:9K smpl:602K ep:4K epch:2.20 loss:0.090 grdn:0.271 lr:3.3e-05 updt_s:0
-.513 data_s:0.003
-INFO 2025-09-08 14:58:17 celerate.py:281 step:10K smpl:614K ep:4K epch:2.25 loss:0.090 grdn:0.268 lr:3.1e-05 updt_s:
-0.520 data_s:0.003
-INFO 2025-09-08 15:00:02 celerate.py:281 step:10K smpl:627K ep:4K epch:2.29 loss:0.089 grdn:0.261 lr:3.0e-05 updt_s:
-0.519 data_s:0.003
-INFO 2025-09-08 15:01:48 celerate.py:281 step:10K smpl:640K ep:4K epch:2.34 loss:0.090 grdn:0.271 lr:2.8e-05 updt_s:
-0.526 data_s:0.003
-INFO 2025-09-08 15:03:33 celerate.py:281 step:10K smpl:653K ep:4K epch:2.39 loss:0.089 grdn:0.262 lr:2.6e-05 updt_s:
-0.521 data_s:0.003
-INFO 2025-09-08 15:05:18 celerate.py:281 step:10K smpl:666K ep:4K epch:2.43 loss:0.090 grdn:0.264 lr:2.4e-05 updt_s:
-0.519 data_s:0.003
-INFO 2025-09-08 15:07:32 celerate.py:281 step:11K smpl:678K ep:4K epch:2.48 loss:0.089 grdn:0.255 lr:2.3e-05 updt_s:
-0.663 data_s:0.004
-INFO 2025-09-08 15:09:21 celerate.py:281 step:11K smpl:691K ep:4K epch:2.53 loss:0.090 grdn:0.263 lr:2.1e-05 updt_s:
-0.514 data_s:0.030
-INFO 2025-09-08 15:11:06 celerate.py:281 step:11K smpl:704K ep:4K epch:2.57 loss:0.088 grdn:0.254 lr:1.9e-05 updt_s:
-0.517 data_s:0.006
-INFO 2025-09-08 15:12:51 celerate.py:281 step:11K smpl:717K ep:4K epch:2.62 loss:0.088 grdn:0.252 lr:1.8e-05 updt_s:
-0.517 data_s:0.005
-INFO 2025-09-08 15:14:38 celerate.py:281 step:11K smpl:730K ep:5K epch:2.67 loss:0.088 grdn:0.251 lr:1.6e-05 updt_s:
-0.532 data_s:0.003
-INFO 2025-09-08 15:16:23 celerate.py:281 step:12K smpl:742K ep:5K epch:2.71 loss:0.088 grdn:0.253 lr:1.5e-05 updt_s:
-0.520 data_s:0.003
-INFO 2025-09-08 15:18:08 celerate.py:281 step:12K smpl:755K ep:5K epch:2.76 loss:0.087 grdn:0.244 lr:1.4e-05 updt_s:
-0.521 data_s:0.003
-INFO 2025-09-08 15:19:54 celerate.py:281 step:12K smpl:768K ep:5K epch:2.81 loss:0.088 grdn:0.247 lr:1.2e-05 updt_s:
-0.524 data_s:0.003
-INFO 2025-09-08 15:21:39 celerate.py:281 step:12K smpl:781K ep:5K epch:2.86 loss:0.087 grdn:0.242 lr:1.1e-05 updt_s:
-0.520 data_s:0.003
-INFO 2025-09-08 15:23:32 celerate.py:281 step:12K smpl:794K ep:5K epch:2.90 loss:0.088 grdn:0.243 lr:1.0e-05 updt_s:
-0.560 data_s:0.003
-INFO 2025-09-08 15:25:48 celerate.py:281 step:13K smpl:806K ep:5K epch:2.95 loss:0.087 grdn:0.240 lr:9.0e-06 updt_s:
-0.674 data_s:0.005
-INFO 2025-09-08 15:28:02 celerate.py:281 step:13K smpl:819K ep:5K epch:3.00 loss:0.088 grdn:0.245 lr:8.0e-06 updt_s:
-0.662 data_s:0.004
-INFO 2025-09-08 15:31:06 celerate.py:281 step:13K smpl:832K ep:5K epch:3.04 loss:0.086 grdn:0.236 lr:7.1e-06 updt_s:
-0.688 data_s:0.231
-INFO 2025-09-08 15:32:52 celerate.py:281 step:13K smpl:845K ep:5K epch:3.09 loss:0.087 grdn:0.231 lr:6.3e-06 updt_s:
-0.521 data_s:0.003
-INFO 2025-09-08 15:35:46 celerate.py:281 step:13K smpl:858K ep:5K epch:3.14 loss:0.088 grdn:0.235 lr:5.6e-06 updt_s:
-0.637 data_s:0.232
-INFO 2025-09-08 15:37:34 celerate.py:281 step:14K smpl:870K ep:5K epch:3.18 loss:0.087 grdn:0.238 lr:4.9e-06 updt_s:
-0.514 data_s:0.025
-INFO 2025-09-08 15:39:18 celerate.py:281 step:14K smpl:883K ep:5K epch:3.23 loss:0.087 grdn:0.226 lr:4.3e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 15:41:02 celerate.py:281 step:14K smpl:896K ep:6K epch:3.28 loss:0.087 grdn:0.230 lr:3.8e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 15:42:45 celerate.py:281 step:14K smpl:909K ep:6K epch:3.32 loss:0.086 grdn:0.229 lr:3.4e-06 updt_s:
-0.507 data_s:0.008
-INFO 2025-09-08 15:44:29 celerate.py:281 step:14K smpl:922K ep:6K epch:3.37 loss:0.087 grdn:0.229 lr:3.0e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 15:46:12 celerate.py:281 step:15K smpl:934K ep:6K epch:3.42 loss:0.087 grdn:0.228 lr:2.8e-06 updt_s:
-0.502 data_s:0.011
-INFO 2025-09-08 15:47:56 celerate.py:281 step:15K smpl:947K ep:6K epch:3.46 loss:0.086 grdn:0.232 lr:2.6e-06 updt_s:
-0.515 data_s:0.004
-INFO 2025-09-08 15:49:39 celerate.py:281 step:15K smpl:960K ep:6K epch:3.51 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:
-0.365 data_s:0.147
-INFO 2025-09-08 15:51:22 celerate.py:281 step:15K smpl:973K ep:6K epch:3.56 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:
-0.333 data_s:0.179
-INFO 2025-09-08 15:53:07 celerate.py:281 step:15K smpl:986K ep:6K epch:3.60 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:
-0.357 data_s:0.164
-INFO 2025-09-08 15:54:50 celerate.py:281 step:16K smpl:998K ep:6K epch:3.65 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:
-0.365 data_s:0.151
-INFO 2025-09-08 15:56:35 celerate.py:281 step:16K smpl:1M ep:6K epch:3.70 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0.
-450 data_s:0.071
-INFO 2025-09-08 15:58:19 celerate.py:281 step:16K smpl:1M ep:6K epch:3.74 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0.
-495 data_s:0.023
-INFO 2025-09-08 16:00:04 celerate.py:281 step:16K smpl:1M ep:6K epch:3.79 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0.
-393 data_s:0.131
-INFO 2025-09-08 16:01:48 celerate.py:281 step:16K smpl:1M ep:6K epch:3.84 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0.
-515 data_s:0.003
-INFO 2025-09-08 16:03:32 celerate.py:281 step:17K smpl:1M ep:7K epch:3.88 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
-518 data_s:0.003
-INFO 2025-09-08 16:05:17 celerate.py:281 step:17K smpl:1M ep:7K epch:3.93 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
-397 data_s:0.125
-INFO 2025-09-08 16:12:41 celerate.py:281 step:17K smpl:1M ep:7K epch:3.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:1.
-496 data_s:0.719
-INFO 2025-09-08 16:14:28 celerate.py:281 step:17K smpl:1M ep:7K epch:4.03 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0.
-413 data_s:0.124
-INFO 2025-09-08 16:16:13 celerate.py:281 step:17K smpl:1M ep:7K epch:4.07 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0.
-469 data_s:0.050
-INFO 2025-09-08 16:17:57 celerate.py:281 step:18K smpl:1M ep:7K epch:4.12 loss:0.087 grdn:0.228 lr:2.5e-06 updt_s:0.
-375 data_s:0.145
-INFO 2025-09-08 16:19:42 celerate.py:281 step:18K smpl:1M ep:7K epch:4.17 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0.
-333 data_s:0.189
-INFO 2025-09-08 16:21:26 celerate.py:281 step:18K smpl:1M ep:7K epch:4.21 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
-334 data_s:0.184
-INFO 2025-09-08 16:23:09 celerate.py:281 step:18K smpl:1M ep:7K epch:4.26 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0.
-331 data_s:0.185
-INFO 2025-09-08 16:24:53 celerate.py:281 step:18K smpl:1M ep:7K epch:4.31 loss:0.088 grdn:0.236 lr:2.5e-06 updt_s:0.
-333 data_s:0.182
-INFO 2025-09-08 16:26:38 celerate.py:281 step:19K smpl:1M ep:7K epch:4.35 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0.
-337 data_s:0.188
-INFO 2025-09-08 16:28:22 celerate.py:281 step:19K smpl:1M ep:7K epch:4.40 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0.
-420 data_s:0.099
-INFO 2025-09-08 16:30:06 celerate.py:281 step:19K smpl:1M ep:8K epch:4.45 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
-444 data_s:0.075
-INFO 2025-09-08 16:31:49 celerate.py:281 step:19K smpl:1M ep:8K epch:4.49 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0.
-475 data_s:0.036
-INFO 2025-09-08 16:33:33 celerate.py:281 step:19K smpl:1M ep:8K epch:4.54 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0.
-379 data_s:0.139
-INFO 2025-09-08 16:35:17 celerate.py:281 step:20K smpl:1M ep:8K epch:4.59 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0.
-348 data_s:0.171
-INFO 2025-09-08 16:37:01 celerate.py:281 step:20K smpl:1M ep:8K epch:4.63 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0.
-332 data_s:0.185
-/home/jade_choghari/miniconda3/envs/lerobot/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:4631:
- UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the u
-ser.
-  warnings.warn(  # warn only once
-INFO 2025-09-08 16:38:46 celerate.py:281 step:20K smpl:1M ep:8K epch:4.68 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0.
-486 data_s:0.037
-INFO 2025-09-08 16:38:46 celerate.py:295 Checkpoint policy after step 20000
-INFO 2025-09-08 16:40:30 celerate.py:281 step:20K smpl:1M ep:8K epch:4.73 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
-509 data_s:0.003
-INFO 2025-09-08 16:42:16 celerate.py:281 step:20K smpl:1M ep:8K epch:4.77 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0.
-527 data_s:0.003
-INFO 2025-09-08 16:44:01 celerate.py:281 step:21K smpl:1M ep:8K epch:4.82 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0.
-519 data_s:0.003
-INFO 2025-09-08 16:45:45 celerate.py:281 step:21K smpl:1M ep:8K epch:4.87 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0.
-504 data_s:0.013
-INFO 2025-09-08 16:47:29 celerate.py:281 step:21K smpl:1M ep:8K epch:4.91 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0.
-509 data_s:0.011
-INFO 2025-09-08 16:49:19 celerate.py:281 step:21K smpl:1M ep:8K epch:4.96 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0.
-544 data_s:0.003
-INFO 2025-09-08 16:51:04 celerate.py:281 step:21K smpl:1M ep:8K epch:5.01 loss:0.086 grdn:0.225 lr:2.5e-06 updt_s:0.
-488 data_s:0.039
-INFO 2025-09-08 16:52:51 celerate.py:281 step:22K smpl:1M ep:9K epch:5.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
-430 data_s:0.099
-INFO 2025-09-08 16:54:36 celerate.py:281 step:22K smpl:1M ep:9K epch:5.10 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0.
-521 data_s:0.003
-INFO 2025-09-08 16:56:23 celerate.py:281 step:22K smpl:1M ep:9K epch:5.15 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0.
-521 data_s:0.014
-INFO 2025-09-08 16:58:09 celerate.py:281 step:22K smpl:1M ep:9K epch:5.20 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0.
-525 data_s:0.003
-INFO 2025-09-08 17:00:04 celerate.py:281 step:22K smpl:1M ep:9K epch:5.24 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0.
-568 data_s:0.003
-INFO 2025-09-08 17:02:00 celerate.py:281 step:23K smpl:1M ep:9K epch:5.29 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0.
-575 data_s:0.003
-INFO 2025-09-08 17:03:49 celerate.py:281 step:23K smpl:1M ep:9K epch:5.34 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0.
-513 data_s:0.030
-INFO 2025-09-08 17:05:39 celerate.py:281 step:23K smpl:1M ep:9K epch:5.38 loss:0.085 grdn:0.227 lr:2.5e-06 updt_s:0.
-523 data_s:0.027
-INFO 2025-09-08 17:07:26 celerate.py:281 step:23K smpl:1M ep:9K epch:5.43 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0.
-529 data_s:0.003
-INFO 2025-09-08 17:09:12 celerate.py:281 step:23K smpl:1M ep:9K epch:5.48 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0.
-526 data_s:0.003
-INFO 2025-09-08 17:10:55 celerate.py:281 step:24K smpl:2M ep:9K epch:5.52 loss:0.087 grdn:0.230 lr:2.5e-06 updt_s:0.
-443 data_s:0.072
-INFO 2025-09-08 17:12:40 celerate.py:281 step:24K smpl:2M ep:9K epch:5.57 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0.
-518 data_s:0.004
-INFO 2025-09-08 17:14:25 celerate.py:281 step:24K smpl:2M ep:10K epch:5.62 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0
-.521 data_s:0.003
-INFO 2025-09-08 17:16:11 celerate.py:281 step:24K smpl:2M ep:10K epch:5.66 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:0
-.523 data_s:0.003
-INFO 2025-09-08 17:17:55 celerate.py:281 step:24K smpl:2M ep:10K epch:5.71 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0
-.515 data_s:0.005
-INFO 2025-09-08 17:19:39 celerate.py:281 step:25K smpl:2M ep:10K epch:5.76 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0
-.415 data_s:0.106
-INFO 2025-09-08 17:21:24 celerate.py:281 step:25K smpl:2M ep:10K epch:5.80 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.507 data_s:0.016
-INFO 2025-09-08 17:23:08 celerate.py:281 step:25K smpl:2M ep:10K epch:5.85 loss:0.085 grdn:0.229 lr:2.5e-06 updt_s:0
-.514 data_s:0.003
-INFO 2025-09-08 17:24:54 celerate.py:281 step:25K smpl:2M ep:10K epch:5.90 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0
-.518 data_s:0.008
-INFO 2025-09-08 17:26:41 celerate.py:281 step:25K smpl:2M ep:10K epch:5.94 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
-.529 data_s:0.003
-INFO 2025-09-08 17:28:24 celerate.py:281 step:26K smpl:2M ep:10K epch:5.99 loss:0.087 grdn:0.232 lr:2.5e-06 updt_s:0
-.513 data_s:0.003
-INFO 2025-09-08 17:30:11 celerate.py:281 step:26K smpl:2M ep:10K epch:6.04 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0
-.370 data_s:0.164
-INFO 2025-09-08 17:31:55 celerate.py:281 step:26K smpl:2M ep:10K epch:6.08 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
-.385 data_s:0.132
-INFO 2025-09-08 17:33:39 celerate.py:281 step:26K smpl:2M ep:10K epch:6.13 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
-.450 data_s:0.069
-INFO 2025-09-08 17:35:24 celerate.py:281 step:26K smpl:2M ep:10K epch:6.18 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0
-.468 data_s:0.052
-INFO 2025-09-08 17:37:07 celerate.py:281 step:27K smpl:2M ep:11K epch:6.23 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0
-.514 data_s:0.004
-INFO 2025-09-08 17:38:52 celerate.py:281 step:27K smpl:2M ep:11K epch:6.27 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
-.519 data_s:0.003
-INFO 2025-09-08 17:40:40 celerate.py:281 step:27K smpl:2M ep:11K epch:6.32 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.534 data_s:0.003
-INFO 2025-09-08 17:42:57 celerate.py:281 step:27K smpl:2M ep:11K epch:6.37 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
-.678 data_s:0.007
-INFO 2025-09-08 17:46:13 celerate.py:281 step:27K smpl:2M ep:11K epch:6.41 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.968 data_s:0.009
-INFO 2025-09-08 17:49:20 celerate.py:281 step:28K smpl:2M ep:11K epch:6.46 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
-.895 data_s:0.037
-INFO 2025-09-08 17:51:22 celerate.py:281 step:28K smpl:2M ep:11K epch:6.51 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
-.604 data_s:0.003
-INFO 2025-09-08 17:53:07 celerate.py:281 step:28K smpl:2M ep:11K epch:6.55 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
-.521 data_s:0.003
-INFO 2025-09-08 17:54:51 celerate.py:281 step:28K smpl:2M ep:11K epch:6.60 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 17:56:36 celerate.py:281 step:28K smpl:2M ep:11K epch:6.65 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
-.519 data_s:0.003
-INFO 2025-09-08 17:58:21 celerate.py:281 step:29K smpl:2M ep:11K epch:6.69 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:0
-.521 data_s:0.003
-INFO 2025-09-08 18:00:06 celerate.py:281 step:29K smpl:2M ep:11K epch:6.74 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
-.513 data_s:0.011
-INFO 2025-09-08 18:01:50 celerate.py:281 step:29K smpl:2M ep:11K epch:6.79 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
-.476 data_s:0.041
-INFO 2025-09-08 18:03:34 celerate.py:281 step:29K smpl:2M ep:12K epch:6.83 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
-.506 data_s:0.012
-INFO 2025-09-08 18:05:21 celerate.py:281 step:29K smpl:2M ep:12K epch:6.88 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0
-.455 data_s:0.075
-INFO 2025-09-08 18:07:04 celerate.py:281 step:30K smpl:2M ep:12K epch:6.93 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.514 data_s:0.003
-INFO 2025-09-08 18:08:47 celerate.py:281 step:30K smpl:2M ep:12K epch:6.97 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
-.509 data_s:0.003
-INFO 2025-09-08 18:10:33 celerate.py:281 step:30K smpl:2M ep:12K epch:7.02 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
-.422 data_s:0.105
-INFO 2025-09-08 18:12:19 celerate.py:281 step:30K smpl:2M ep:12K epch:7.07 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
-.347 data_s:0.182
-INFO 2025-09-08 18:14:05 celerate.py:281 step:30K smpl:2M ep:12K epch:7.11 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
-.473 data_s:0.053
-INFO 2025-09-08 18:15:52 celerate.py:281 step:31K smpl:2M ep:12K epch:7.16 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0
-.531 data_s:0.005
-INFO 2025-09-08 18:17:37 celerate.py:281 step:31K smpl:2M ep:12K epch:7.21 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
-.520 data_s:0.003
-INFO 2025-09-08 18:19:22 celerate.py:281 step:31K smpl:2M ep:12K epch:7.26 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
-.500 data_s:0.020
-INFO 2025-09-08 18:21:06 celerate.py:281 step:31K smpl:2M ep:12K epch:7.30 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0
-.511 data_s:0.009
-INFO 2025-09-08 18:22:50 celerate.py:281 step:31K smpl:2M ep:12K epch:7.35 loss:0.087 grdn:0.227 lr:2.5e-06 updt_s:0
-.518 data_s:0.003
-INFO 2025-09-08 18:24:33 celerate.py:281 step:32K smpl:2M ep:13K epch:7.40 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
-.507 data_s:0.007
-INFO 2025-09-08 18:26:16 celerate.py:281 step:32K smpl:2M ep:13K epch:7.44 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0
-.463 data_s:0.047
-INFO 2025-09-08 18:27:59 celerate.py:281 step:32K smpl:2M ep:13K epch:7.49 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:0
-.509 data_s:0.007
-INFO 2025-09-08 18:29:43 celerate.py:281 step:32K smpl:2M ep:13K epch:7.54 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
-.514 data_s:0.003
-INFO 2025-09-08 18:31:26 celerate.py:281 step:32K smpl:2M ep:13K epch:7.58 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.511 data_s:0.003
-INFO 2025-09-08 18:33:11 celerate.py:281 step:33K smpl:2M ep:13K epch:7.63 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.518 data_s:0.003
-INFO 2025-09-08 18:34:57 celerate.py:281 step:33K smpl:2M ep:13K epch:7.68 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
-.512 data_s:0.016
-INFO 2025-09-08 18:36:41 celerate.py:281 step:33K smpl:2M ep:13K epch:7.72 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:0
-.517 data_s:0.003
-INFO 2025-09-08 18:38:25 celerate.py:281 step:33K smpl:2M ep:13K epch:7.77 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 18:40:07 celerate.py:281 step:33K smpl:2M ep:13K epch:7.82 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
-.506 data_s:0.003
-INFO 2025-09-08 18:41:51 celerate.py:281 step:34K smpl:2M ep:13K epch:7.86 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
-.509 data_s:0.006
-INFO 2025-09-08 18:43:36 celerate.py:281 step:34K smpl:2M ep:13K epch:7.91 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0
-.521 data_s:0.003
-INFO 2025-09-08 18:45:19 celerate.py:281 step:34K smpl:2M ep:13K epch:7.96 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.511 data_s:0.003
-INFO 2025-09-08 18:47:05 celerate.py:281 step:34K smpl:2M ep:14K epch:8.00 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
-.495 data_s:0.035
-INFO 2025-09-08 18:48:51 celerate.py:281 step:34K smpl:2M ep:14K epch:8.05 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:0
-.413 data_s:0.112
-INFO 2025-09-08 18:50:34 celerate.py:281 step:35K smpl:2M ep:14K epch:8.10 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 18:52:19 celerate.py:281 step:35K smpl:2M ep:14K epch:8.14 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
-.520 data_s:0.003
-INFO 2025-09-08 18:54:03 celerate.py:281 step:35K smpl:2M ep:14K epch:8.19 loss:0.087 grdn:0.231 lr:2.5e-06 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 18:55:48 celerate.py:281 step:35K smpl:2M ep:14K epch:8.24 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
-.420 data_s:0.101
-INFO 2025-09-08 18:57:33 celerate.py:281 step:35K smpl:2M ep:14K epch:8.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
-.506 data_s:0.022
-INFO 2025-09-08 18:59:19 celerate.py:281 step:36K smpl:2M ep:14K epch:8.33 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
-.525 data_s:0.003
-INFO 2025-09-08 19:01:03 celerate.py:281 step:36K smpl:2M ep:14K epch:8.38 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 19:02:48 celerate.py:281 step:36K smpl:2M ep:14K epch:8.43 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 19:04:32 celerate.py:281 step:36K smpl:2M ep:14K epch:8.47 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
-.505 data_s:0.013
-INFO 2025-09-08 19:06:15 celerate.py:281 step:36K smpl:2M ep:14K epch:8.52 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
-.384 data_s:0.130
-INFO 2025-09-08 19:07:58 celerate.py:281 step:37K smpl:2M ep:15K epch:8.57 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:0
-.430 data_s:0.084
-INFO 2025-09-08 19:09:41 celerate.py:281 step:37K smpl:2M ep:15K epch:8.61 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
-.351 data_s:0.162
-INFO 2025-09-08 19:11:25 celerate.py:281 step:37K smpl:2M ep:15K epch:8.66 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0
-.337 data_s:0.181
-INFO 2025-09-08 19:13:08 celerate.py:281 step:37K smpl:2M ep:15K epch:8.71 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
-.336 data_s:0.177
-INFO 2025-09-08 19:14:52 celerate.py:281 step:37K smpl:2M ep:15K epch:8.75 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
-.344 data_s:0.174
-INFO 2025-09-08 19:16:35 celerate.py:281 step:38K smpl:2M ep:15K epch:8.80 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:0
-.332 data_s:0.182
-INFO 2025-09-08 19:18:18 celerate.py:281 step:38K smpl:2M ep:15K epch:8.85 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.332 data_s:0.182
-INFO 2025-09-08 19:20:01 celerate.py:281 step:38K smpl:2M ep:15K epch:8.89 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:0
-.337 data_s:0.177
-INFO 2025-09-08 19:21:45 celerate.py:281 step:38K smpl:2M ep:15K epch:8.94 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.495 data_s:0.022
-INFO 2025-09-08 19:23:29 celerate.py:281 step:38K smpl:2M ep:15K epch:8.99 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
-.513 data_s:0.003
-INFO 2025-09-08 19:25:15 celerate.py:281 step:39K smpl:2M ep:15K epch:9.03 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.436 data_s:0.097
-INFO 2025-09-08 19:26:59 celerate.py:281 step:39K smpl:2M ep:15K epch:9.08 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
-.378 data_s:0.138
-INFO 2025-09-08 19:28:43 celerate.py:281 step:39K smpl:2M ep:15K epch:9.13 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:0
-.497 data_s:0.023
-INFO 2025-09-08 19:30:27 celerate.py:281 step:39K smpl:3M ep:16K epch:9.17 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 19:32:14 celerate.py:281 step:39K smpl:3M ep:16K epch:9.22 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.526 data_s:0.003
-INFO 2025-09-08 19:33:57 celerate.py:281 step:40K smpl:3M ep:16K epch:9.27 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:0
-.506 data_s:0.011
-INFO 2025-09-08 19:35:42 celerate.py:281 step:40K smpl:3M ep:16K epch:9.31 loss:0.087 grdn:0.229 lr:2.5e-06 updt_s:0
-.455 data_s:0.066
-INFO 2025-09-08 19:37:26 celerate.py:281 step:40K smpl:3M ep:16K epch:9.36 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:0
-.493 data_s:0.026
-INFO 2025-09-08 19:37:26 celerate.py:295 Checkpoint policy after step 40000
-INFO 2025-09-08 19:39:10 celerate.py:281 step:40K smpl:3M ep:16K epch:9.41 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:0
-.397 data_s:0.114
-INFO 2025-09-08 19:40:53 celerate.py:281 step:40K smpl:3M ep:16K epch:9.45 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:0
-.344 data_s:0.168
-INFO 2025-09-08 19:42:37 celerate.py:281 step:41K smpl:3M ep:16K epch:9.50 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.480 data_s:0.036
-INFO 2025-09-08 19:44:21 celerate.py:281 step:41K smpl:3M ep:16K epch:9.55 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:0
-.517 data_s:0.003
-INFO 2025-09-08 19:46:05 celerate.py:281 step:41K smpl:3M ep:16K epch:9.60 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.517 data_s:0.003
-INFO 2025-09-08 19:47:49 celerate.py:281 step:41K smpl:3M ep:16K epch:9.64 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:0
-.513 data_s:0.003
-INFO 2025-09-08 19:49:33 celerate.py:281 step:41K smpl:3M ep:16K epch:9.69 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 19:51:17 celerate.py:281 step:42K smpl:3M ep:16K epch:9.74 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:0
-.515 data_s:0.003
-INFO 2025-09-08 19:53:00 celerate.py:281 step:42K smpl:3M ep:17K epch:9.78 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:0
-.513 data_s:0.003
-INFO 2025-09-08 19:54:44 celerate.py:281 step:42K smpl:3M ep:17K epch:9.83 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.516 data_s:0.003
-INFO 2025-09-08 19:56:28 celerate.py:281 step:42K smpl:3M ep:17K epch:9.88 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:0
-.512 data_s:0.003
-INFO 2025-09-08 19:58:11 celerate.py:281 step:42K smpl:3M ep:17K epch:9.92 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:0
-.514 data_s:0.003
-INFO 2025-09-08 19:59:55 celerate.py:281 step:43K smpl:3M ep:17K epch:9.97 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:0
-.514 data_s:0.003
-INFO 2025-09-08 20:01:42 celerate.py:281 step:43K smpl:3M ep:17K epch:10.02 loss:0.087 grdn:0.234 lr:2.5e-06 updt_s:
-0.476 data_s:0.057
-INFO 2025-09-08 20:03:25 celerate.py:281 step:43K smpl:3M ep:17K epch:10.06 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
-0.471 data_s:0.043
-INFO 2025-09-08 20:05:09 celerate.py:281 step:43K smpl:3M ep:17K epch:10.11 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.515 data_s:0.004
-INFO 2025-09-08 20:06:53 celerate.py:281 step:43K smpl:3M ep:17K epch:10.16 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
-0.505 data_s:0.013
-INFO 2025-09-08 20:08:36 celerate.py:281 step:44K smpl:3M ep:17K epch:10.20 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-08 20:10:20 celerate.py:281 step:44K smpl:3M ep:17K epch:10.25 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 20:12:04 celerate.py:281 step:44K smpl:3M ep:17K epch:10.30 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-08 20:13:47 celerate.py:281 step:44K smpl:3M ep:18K epch:10.34 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
-0.503 data_s:0.011
-INFO 2025-09-08 20:15:31 celerate.py:281 step:44K smpl:3M ep:18K epch:10.39 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.416 data_s:0.102
-INFO 2025-09-08 20:17:15 celerate.py:281 step:45K smpl:3M ep:18K epch:10.44 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
-0.502 data_s:0.017
-INFO 2025-09-08 20:18:58 celerate.py:281 step:45K smpl:3M ep:18K epch:10.48 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-08 20:20:41 celerate.py:281 step:45K smpl:3M ep:18K epch:10.53 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.496 data_s:0.017
-INFO 2025-09-08 20:22:24 celerate.py:281 step:45K smpl:3M ep:18K epch:10.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.493 data_s:0.022
-INFO 2025-09-08 20:24:08 celerate.py:281 step:45K smpl:3M ep:18K epch:10.63 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.485 data_s:0.031
-INFO 2025-09-08 20:25:52 celerate.py:281 step:46K smpl:3M ep:18K epch:10.67 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:
-0.518 data_s:0.003
-INFO 2025-09-08 20:27:36 celerate.py:281 step:46K smpl:3M ep:18K epch:10.72 loss:0.085 grdn:0.228 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 20:29:19 celerate.py:281 step:46K smpl:3M ep:18K epch:10.77 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 20:31:04 celerate.py:281 step:46K smpl:3M ep:18K epch:10.81 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 20:32:47 celerate.py:281 step:46K smpl:3M ep:18K epch:10.86 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 20:34:32 celerate.py:281 step:47K smpl:3M ep:18K epch:10.91 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.518 data_s:0.003
-INFO 2025-09-08 20:36:15 celerate.py:281 step:47K smpl:3M ep:19K epch:10.95 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 20:38:01 celerate.py:281 step:47K smpl:3M ep:19K epch:11.00 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.416 data_s:0.114
-INFO 2025-09-08 20:39:45 celerate.py:281 step:47K smpl:3M ep:19K epch:11.05 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.429 data_s:0.086
-INFO 2025-09-08 20:41:28 celerate.py:281 step:47K smpl:3M ep:19K epch:11.09 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:
-0.409 data_s:0.106
-INFO 2025-09-08 20:43:12 celerate.py:281 step:48K smpl:3M ep:19K epch:11.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.470 data_s:0.050
-INFO 2025-09-08 20:44:56 celerate.py:281 step:48K smpl:3M ep:19K epch:11.19 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 20:46:41 celerate.py:281 step:48K smpl:3M ep:19K epch:11.23 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 20:48:25 celerate.py:281 step:48K smpl:3M ep:19K epch:11.28 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.518 data_s:0.003
-INFO 2025-09-08 20:50:08 celerate.py:281 step:48K smpl:3M ep:19K epch:11.33 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.505 data_s:0.009
-INFO 2025-09-08 20:51:52 celerate.py:281 step:49K smpl:3M ep:19K epch:11.37 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
-0.449 data_s:0.067
-INFO 2025-09-08 20:53:35 celerate.py:281 step:49K smpl:3M ep:19K epch:11.42 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.420 data_s:0.094
-INFO 2025-09-08 20:55:19 celerate.py:281 step:49K smpl:3M ep:19K epch:11.47 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 20:57:01 celerate.py:281 step:49K smpl:3M ep:19K epch:11.51 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
-0.505 data_s:0.003
-INFO 2025-09-08 20:58:44 celerate.py:281 step:49K smpl:3M ep:20K epch:11.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-08 21:00:28 celerate.py:281 step:50K smpl:3M ep:20K epch:11.61 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 21:02:11 celerate.py:281 step:50K smpl:3M ep:20K epch:11.65 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.402 data_s:0.110
-INFO 2025-09-08 21:03:54 celerate.py:281 step:50K smpl:3M ep:20K epch:11.70 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.332 data_s:0.184
-INFO 2025-09-08 21:05:37 celerate.py:281 step:50K smpl:3M ep:20K epch:11.75 loss:0.086 grdn:0.229 lr:2.5e-06 updt_s:
-0.332 data_s:0.182
-INFO 2025-09-08 21:07:21 celerate.py:281 step:50K smpl:3M ep:20K epch:11.80 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
-0.466 data_s:0.049
-INFO 2025-09-08 21:09:05 celerate.py:281 step:51K smpl:3M ep:20K epch:11.84 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 21:10:49 celerate.py:281 step:51K smpl:3M ep:20K epch:11.89 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
-0.512 data_s:0.004
-INFO 2025-09-08 21:12:32 celerate.py:281 step:51K smpl:3M ep:20K epch:11.94 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s:
-0.484 data_s:0.032
-INFO 2025-09-08 21:14:17 celerate.py:281 step:51K smpl:3M ep:20K epch:11.98 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
-0.517 data_s:0.004
-INFO 2025-09-08 21:16:03 celerate.py:281 step:51K smpl:3M ep:20K epch:12.03 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.424 data_s:0.105
-INFO 2025-09-08 21:17:46 celerate.py:281 step:52K smpl:3M ep:20K epch:12.08 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.442 data_s:0.073
-INFO 2025-09-08 21:19:30 celerate.py:281 step:52K smpl:3M ep:21K epch:12.12 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:
-0.511 data_s:0.007
-INFO 2025-09-08 21:21:15 celerate.py:281 step:52K smpl:3M ep:21K epch:12.17 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.520 data_s:0.003
-INFO 2025-09-08 21:22:59 celerate.py:281 step:52K smpl:3M ep:21K epch:12.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 21:24:43 celerate.py:281 step:52K smpl:3M ep:21K epch:12.26 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.518 data_s:0.003
-INFO 2025-09-08 21:26:27 celerate.py:281 step:53K smpl:3M ep:21K epch:12.31 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 21:28:11 celerate.py:281 step:53K smpl:3M ep:21K epch:12.36 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 21:29:55 celerate.py:281 step:53K smpl:3M ep:21K epch:12.40 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 21:31:39 celerate.py:281 step:53K smpl:3M ep:21K epch:12.45 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 21:33:23 celerate.py:281 step:53K smpl:3M ep:21K epch:12.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 21:35:08 celerate.py:281 step:54K smpl:3M ep:21K epch:12.54 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.519 data_s:0.003
-INFO 2025-09-08 21:36:51 celerate.py:281 step:54K smpl:3M ep:21K epch:12.59 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-08 21:38:36 celerate.py:281 step:54K smpl:3M ep:21K epch:12.64 loss:0.087 grdn:0.236 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 21:40:18 celerate.py:281 step:54K smpl:3M ep:21K epch:12.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.473 data_s:0.038
-INFO 2025-09-08 21:42:02 celerate.py:281 step:54K smpl:3M ep:22K epch:12.73 loss:0.085 grdn:0.232 lr:2.5e-06 updt_s:
-0.408 data_s:0.109
-INFO 2025-09-08 21:43:45 celerate.py:281 step:55K smpl:3M ep:22K epch:12.78 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.377 data_s:0.136
-INFO 2025-09-08 21:45:29 celerate.py:281 step:55K smpl:4M ep:22K epch:12.83 loss:0.087 grdn:0.233 lr:2.5e-06 updt_s:
-0.497 data_s:0.022
-INFO 2025-09-08 21:47:12 celerate.py:281 step:55K smpl:4M ep:22K epch:12.87 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-08 21:48:56 celerate.py:281 step:55K smpl:4M ep:22K epch:12.92 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.429 data_s:0.086
-INFO 2025-09-08 21:50:39 celerate.py:281 step:55K smpl:4M ep:22K epch:12.97 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
-0.454 data_s:0.059
-INFO 2025-09-08 21:52:25 celerate.py:281 step:56K smpl:4M ep:22K epch:13.01 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.459 data_s:0.072
-INFO 2025-09-08 21:54:08 celerate.py:281 step:56K smpl:4M ep:22K epch:13.06 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
-0.382 data_s:0.132
-INFO 2025-09-08 21:55:51 celerate.py:281 step:56K smpl:4M ep:22K epch:13.11 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.500 data_s:0.016
-INFO 2025-09-08 21:57:36 celerate.py:281 step:56K smpl:4M ep:22K epch:13.15 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 21:59:20 celerate.py:281 step:56K smpl:4M ep:22K epch:13.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s:
-0.518 data_s:0.003
-INFO 2025-09-08 22:01:03 celerate.py:281 step:57K smpl:4M ep:22K epch:13.25 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-08 22:02:48 celerate.py:281 step:57K smpl:4M ep:23K epch:13.29 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 22:04:32 celerate.py:281 step:57K smpl:4M ep:23K epch:13.34 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 22:06:14 celerate.py:281 step:57K smpl:4M ep:23K epch:13.39 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s:
-0.505 data_s:0.005
-bINFO 2025-09-08 22:07:59 celerate.py:281 step:57K smpl:4M ep:23K epch:13.43 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s
-:0.496 data_s:0.026
-INFO 2025-09-08 22:09:43 celerate.py:281 step:58K smpl:4M ep:23K epch:13.48 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
-0.438 data_s:0.080
-INFO 2025-09-08 22:11:27 celerate.py:281 step:58K smpl:4M ep:23K epch:13.53 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
-0.444 data_s:0.073
-INFO 2025-09-08 22:13:11 celerate.py:281 step:58K smpl:4M ep:23K epch:13.57 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 22:14:55 celerate.py:281 step:58K smpl:4M ep:23K epch:13.62 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.518 data_s:0.003
-INFO 2025-09-08 22:16:39 celerate.py:281 step:58K smpl:4M ep:23K epch:13.67 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 22:18:22 celerate.py:281 step:59K smpl:4M ep:23K epch:13.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 22:20:05 celerate.py:281 step:59K smpl:4M ep:23K epch:13.76 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
-0.508 data_s:0.003
-INFO 2025-09-08 22:21:48 celerate.py:281 step:59K smpl:4M ep:23K epch:13.81 loss:0.086 grdn:0.228 lr:2.5e-06 updt_s:
-0.505 data_s:0.008
-INFO 2025-09-08 22:23:31 celerate.py:281 step:59K smpl:4M ep:23K epch:13.85 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.505 data_s:0.008
-INFO 2025-09-08 22:25:15 celerate.py:281 step:59K smpl:4M ep:24K epch:13.90 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 22:26:58 celerate.py:281 step:60K smpl:4M ep:24K epch:13.95 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
-0.493 data_s:0.022
-INFO 2025-09-08 22:28:42 celerate.py:281 step:60K smpl:4M ep:24K epch:14.00 loss:0.086 grdn:0.231 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 22:30:29 celerate.py:281 step:60K smpl:4M ep:24K epch:14.04 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.435 data_s:0.101
-INFO 2025-09-08 22:30:29 celerate.py:295 Checkpoint policy after step 60000
-INFO 2025-09-08 22:32:14 celerate.py:281 step:60K smpl:4M ep:24K epch:14.09 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.508 data_s:0.003
-INFO 2025-09-08 22:33:58 celerate.py:281 step:60K smpl:4M ep:24K epch:14.14 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 22:35:42 celerate.py:281 step:61K smpl:4M ep:24K epch:14.18 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 22:37:25 celerate.py:281 step:61K smpl:4M ep:24K epch:14.23 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 22:39:09 celerate.py:281 step:61K smpl:4M ep:24K epch:14.28 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 22:40:52 celerate.py:281 step:61K smpl:4M ep:24K epch:14.32 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.509 data_s:0.003
-INFO 2025-09-08 22:42:35 celerate.py:281 step:61K smpl:4M ep:24K epch:14.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 22:44:18 celerate.py:281 step:62K smpl:4M ep:24K epch:14.42 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-08 22:46:02 celerate.py:281 step:62K smpl:4M ep:24K epch:14.46 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 22:47:47 celerate.py:281 step:62K smpl:4M ep:25K epch:14.51 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 22:49:30 celerate.py:281 step:62K smpl:4M ep:25K epch:14.56 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-08 22:51:14 celerate.py:281 step:62K smpl:4M ep:25K epch:14.60 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 22:52:58 celerate.py:281 step:63K smpl:4M ep:25K epch:14.65 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
-0.484 data_s:0.033
-INFO 2025-09-08 22:54:41 celerate.py:281 step:63K smpl:4M ep:25K epch:14.70 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.501 data_s:0.016
-INFO 2025-09-08 22:56:25 celerate.py:281 step:63K smpl:4M ep:25K epch:14.74 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.436 data_s:0.081
-INFO 2025-09-08 22:58:08 celerate.py:281 step:63K smpl:4M ep:25K epch:14.79 loss:0.087 grdn:0.235 lr:2.5e-06 updt_s:
-0.436 data_s:0.080
-INFO 2025-09-08 22:59:51 celerate.py:281 step:63K smpl:4M ep:25K epch:14.84 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.344 data_s:0.168
-INFO 2025-09-08 23:01:34 celerate.py:281 step:64K smpl:4M ep:25K epch:14.88 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.479 data_s:0.035
-INFO 2025-09-08 23:03:18 celerate.py:281 step:64K smpl:4M ep:25K epch:14.93 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-08 23:05:02 celerate.py:281 step:64K smpl:4M ep:25K epch:14.98 loss:0.086 grdn:0.230 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 23:06:47 celerate.py:281 step:64K smpl:4M ep:25K epch:15.02 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.405 data_s:0.119
-INFO 2025-09-08 23:08:30 celerate.py:281 step:64K smpl:4M ep:26K epch:15.07 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s:
-0.393 data_s:0.121
-INFO 2025-09-08 23:10:13 celerate.py:281 step:65K smpl:4M ep:26K epch:15.12 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
-0.369 data_s:0.142
-INFO 2025-09-08 23:11:56 celerate.py:281 step:65K smpl:4M ep:26K epch:15.17 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s:
-0.360 data_s:0.156
-INFO 2025-09-08 23:13:40 celerate.py:281 step:65K smpl:4M ep:26K epch:15.21 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.333 data_s:0.182
-INFO 2025-09-08 23:15:23 celerate.py:281 step:65K smpl:4M ep:26K epch:15.26 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
-0.376 data_s:0.136
-INFO 2025-09-08 23:17:05 celerate.py:281 step:65K smpl:4M ep:26K epch:15.31 loss:0.087 grdn:0.239 lr:2.5e-06 updt_s:
-0.439 data_s:0.069
-INFO 2025-09-08 23:18:49 celerate.py:281 step:66K smpl:4M ep:26K epch:15.35 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.512 data_s:0.006
-INFO 2025-09-08 23:20:32 celerate.py:281 step:66K smpl:4M ep:26K epch:15.40 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-08 23:22:15 celerate.py:281 step:66K smpl:4M ep:26K epch:15.45 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.508 data_s:0.004
-INFO 2025-09-08 23:23:59 celerate.py:281 step:66K smpl:4M ep:26K epch:15.49 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.474 data_s:0.046
-INFO 2025-09-08 23:25:42 celerate.py:281 step:66K smpl:4M ep:26K epch:15.54 loss:0.087 grdn:0.238 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-08 23:27:25 celerate.py:281 step:67K smpl:4M ep:26K epch:15.59 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.487 data_s:0.025
-INFO 2025-09-08 23:29:08 celerate.py:281 step:67K smpl:4M ep:26K epch:15.63 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.510 data_s:0.005
-INFO 2025-09-08 23:30:50 celerate.py:281 step:67K smpl:4M ep:27K epch:15.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.506 data_s:0.003
-INFO 2025-09-08 23:32:33 celerate.py:281 step:67K smpl:4M ep:27K epch:15.73 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s:
-0.509 data_s:0.003
-INFO 2025-09-08 23:34:16 celerate.py:281 step:67K smpl:4M ep:27K epch:15.77 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s:
-0.503 data_s:0.014
-INFO 2025-09-08 23:35:59 celerate.py:281 step:68K smpl:4M ep:27K epch:15.82 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-08 23:37:42 celerate.py:281 step:68K smpl:4M ep:27K epch:15.87 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.509 data_s:0.003
-INFO 2025-09-08 23:39:26 celerate.py:281 step:68K smpl:4M ep:27K epch:15.91 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-08 23:41:10 celerate.py:281 step:68K smpl:4M ep:27K epch:15.96 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-08 23:42:56 celerate.py:281 step:68K smpl:4M ep:27K epch:16.01 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.469 data_s:0.061
-INFO 2025-09-08 23:44:40 celerate.py:281 step:69K smpl:4M ep:27K epch:16.05 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.339 data_s:0.179
-INFO 2025-09-08 23:46:22 celerate.py:281 step:69K smpl:4M ep:27K epch:16.10 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.386 data_s:0.124
-INFO 2025-09-08 23:48:06 celerate.py:281 step:69K smpl:4M ep:27K epch:16.15 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-08 23:49:50 celerate.py:281 step:69K smpl:4M ep:27K epch:16.20 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 23:51:33 celerate.py:281 step:69K smpl:4M ep:27K epch:16.24 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s:
-0.501 data_s:0.014
-INFO 2025-09-08 23:53:17 celerate.py:281 step:70K smpl:4M ep:28K epch:16.29 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-08 23:54:59 celerate.py:281 step:70K smpl:4M ep:28K epch:16.34 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
-0.507 data_s:0.005
-INFO 2025-09-08 23:56:43 celerate.py:281 step:70K smpl:4M ep:28K epch:16.38 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-08 23:58:27 celerate.py:281 step:70K smpl:4M ep:28K epch:16.43 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-09 00:00:12 celerate.py:281 step:70K smpl:5M ep:28K epch:16.48 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.518 data_s:0.003
-INFO 2025-09-09 00:01:55 celerate.py:281 step:71K smpl:5M ep:28K epch:16.52 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-09 00:03:37 celerate.py:281 step:71K smpl:5M ep:28K epch:16.57 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
-0.508 data_s:0.003
-INFO 2025-09-09 00:05:20 celerate.py:281 step:71K smpl:5M ep:28K epch:16.62 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.509 data_s:0.003
-INFO 2025-09-09 00:07:04 celerate.py:281 step:71K smpl:5M ep:28K epch:16.66 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.519 data_s:0.003
-INFO 2025-09-09 00:08:47 celerate.py:281 step:71K smpl:5M ep:28K epch:16.71 loss:0.087 grdn:0.240 lr:2.5e-06 updt_s:
-0.509 data_s:0.003
-INFO 2025-09-09 00:10:30 celerate.py:281 step:72K smpl:5M ep:28K epch:16.76 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 00:12:13 celerate.py:281 step:72K smpl:5M ep:28K epch:16.80 loss:0.085 grdn:0.230 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-09 00:13:58 celerate.py:281 step:72K smpl:5M ep:29K epch:16.85 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.522 data_s:0.003
-INFO 2025-09-09 00:15:40 celerate.py:281 step:72K smpl:5M ep:29K epch:16.90 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.506 data_s:0.003
-INFO 2025-09-09 00:17:23 celerate.py:281 step:72K smpl:5M ep:29K epch:16.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.510 data_s:0.004
-INFO 2025-09-09 00:19:05 celerate.py:281 step:73K smpl:5M ep:29K epch:16.99 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.505 data_s:0.003
-INFO 2025-09-09 00:20:51 celerate.py:281 step:73K smpl:5M ep:29K epch:17.04 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.437 data_s:0.092
-INFO 2025-09-09 00:22:34 celerate.py:281 step:73K smpl:5M ep:29K epch:17.08 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
-0.489 data_s:0.025
-INFO 2025-09-09 00:24:17 celerate.py:281 step:73K smpl:5M ep:29K epch:17.13 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 00:26:01 celerate.py:281 step:73K smpl:5M ep:29K epch:17.18 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 00:27:45 celerate.py:281 step:74K smpl:5M ep:29K epch:17.22 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-09 00:29:28 celerate.py:281 step:74K smpl:5M ep:29K epch:17.27 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.509 data_s:0.003
-INFO 2025-09-09 00:31:11 celerate.py:281 step:74K smpl:5M ep:29K epch:17.32 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-09 00:32:54 celerate.py:281 step:74K smpl:5M ep:29K epch:17.37 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-09 00:34:38 celerate.py:281 step:74K smpl:5M ep:29K epch:17.41 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-09 00:36:21 celerate.py:281 step:75K smpl:5M ep:30K epch:17.46 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 00:38:05 celerate.py:281 step:75K smpl:5M ep:30K epch:17.51 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-09 00:39:49 celerate.py:281 step:75K smpl:5M ep:30K epch:17.55 loss:0.086 grdn:0.251 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-09 00:41:32 celerate.py:281 step:75K smpl:5M ep:30K epch:17.60 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.506 data_s:0.005
-INFO 2025-09-09 00:43:14 celerate.py:281 step:75K smpl:5M ep:30K epch:17.65 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.390 data_s:0.122
-INFO 2025-09-09 00:44:58 celerate.py:281 step:76K smpl:5M ep:30K epch:17.69 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
-0.410 data_s:0.107
-INFO 2025-09-09 00:46:41 celerate.py:281 step:76K smpl:5M ep:30K epch:17.74 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s:
-0.427 data_s:0.085
-INFO 2025-09-09 00:48:24 celerate.py:281 step:76K smpl:5M ep:30K epch:17.79 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s:
-0.492 data_s:0.025
-INFO 2025-09-09 00:50:08 celerate.py:281 step:76K smpl:5M ep:30K epch:17.83 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-09 00:51:51 celerate.py:281 step:76K smpl:5M ep:30K epch:17.88 loss:0.087 grdn:0.241 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-09 00:53:33 celerate.py:281 step:77K smpl:5M ep:30K epch:17.93 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-09 00:55:18 celerate.py:281 step:77K smpl:5M ep:30K epch:17.97 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.519 data_s:0.003
-INFO 2025-09-09 00:57:05 celerate.py:281 step:77K smpl:5M ep:31K epch:18.02 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.486 data_s:0.046
-INFO 2025-09-09 00:58:47 celerate.py:281 step:77K smpl:5M ep:31K epch:18.07 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.509 data_s:0.003
-INFO 2025-09-09 01:00:31 celerate.py:281 step:77K smpl:5M ep:31K epch:18.11 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-09 01:02:15 celerate.py:281 step:78K smpl:5M ep:31K epch:18.16 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-09 01:03:56 celerate.py:281 step:78K smpl:5M ep:31K epch:18.21 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.498 data_s:0.008
-INFO 2025-09-09 01:05:40 celerate.py:281 step:78K smpl:5M ep:31K epch:18.25 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.508 data_s:0.010
-INFO 2025-09-09 01:07:24 celerate.py:281 step:78K smpl:5M ep:31K epch:18.30 loss:0.085 grdn:0.240 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-09 01:09:07 celerate.py:281 step:78K smpl:5M ep:31K epch:18.35 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 01:10:51 celerate.py:281 step:79K smpl:5M ep:31K epch:18.40 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-09 01:12:35 celerate.py:281 step:79K smpl:5M ep:31K epch:18.44 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-09 01:14:17 celerate.py:281 step:79K smpl:5M ep:31K epch:18.49 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s:
-0.508 data_s:0.003
-INFO 2025-09-09 01:16:01 celerate.py:281 step:79K smpl:5M ep:31K epch:18.54 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 01:17:43 celerate.py:281 step:79K smpl:5M ep:31K epch:18.58 loss:0.086 grdn:0.233 lr:2.5e-06 updt_s:
-0.507 data_s:0.003
-INFO 2025-09-09 01:19:26 celerate.py:281 step:80K smpl:5M ep:32K epch:18.63 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 01:21:09 celerate.py:281 step:80K smpl:5M ep:32K epch:18.68 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 01:22:52 celerate.py:281 step:80K smpl:5M ep:32K epch:18.72 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.501 data_s:0.011
-INFO 2025-09-09 01:22:52 celerate.py:295 Checkpoint policy after step 80000
-INFO 2025-09-09 01:24:37 celerate.py:281 step:80K smpl:5M ep:32K epch:18.77 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-09 01:26:20 celerate.py:281 step:80K smpl:5M ep:32K epch:18.82 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-09 01:28:05 celerate.py:281 step:81K smpl:5M ep:32K epch:18.86 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-09 01:29:48 celerate.py:281 step:81K smpl:5M ep:32K epch:18.91 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 01:31:32 celerate.py:281 step:81K smpl:5M ep:32K epch:18.96 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.517 data_s:0.003
-INFO 2025-09-09 01:33:18 celerate.py:281 step:81K smpl:5M ep:32K epch:19.00 loss:0.087 grdn:0.244 lr:2.5e-06 updt_s:
-0.487 data_s:0.042
-INFO 2025-09-09 01:35:01 celerate.py:281 step:81K smpl:5M ep:32K epch:19.05 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.461 data_s:0.054
-INFO 2025-09-09 01:36:46 celerate.py:281 step:82K smpl:5M ep:32K epch:19.10 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.450 data_s:0.071
-INFO 2025-09-09 01:38:29 celerate.py:281 step:82K smpl:5M ep:32K epch:19.14 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.451 data_s:0.064
-INFO 2025-09-09 01:40:12 celerate.py:281 step:82K smpl:5M ep:32K epch:19.19 loss:0.086 grdn:0.234 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-09 01:41:54 celerate.py:281 step:82K smpl:5M ep:33K epch:19.24 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s:
-0.480 data_s:0.030
-INFO 2025-09-09 01:43:37 celerate.py:281 step:82K smpl:5M ep:33K epch:19.28 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.434 data_s:0.080
-INFO 2025-09-09 01:45:20 celerate.py:281 step:83K smpl:5M ep:33K epch:19.33 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 01:47:04 celerate.py:281 step:83K smpl:5M ep:33K epch:19.38 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 01:48:49 celerate.py:281 step:83K smpl:5M ep:33K epch:19.42 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.520 data_s:0.003
-INFO 2025-09-09 01:50:32 celerate.py:281 step:83K smpl:5M ep:33K epch:19.47 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 01:52:15 celerate.py:281 step:83K smpl:5M ep:33K epch:19.52 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 01:53:59 celerate.py:281 step:84K smpl:5M ep:33K epch:19.57 loss:0.085 grdn:0.236 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 01:55:41 celerate.py:281 step:84K smpl:5M ep:33K epch:19.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.508 data_s:0.003
-INFO 2025-09-09 01:57:24 celerate.py:281 step:84K smpl:5M ep:33K epch:19.66 loss:0.087 grdn:0.237 lr:2.5e-06 updt_s:
-0.500 data_s:0.013
-INFO 2025-09-09 01:59:07 celerate.py:281 step:84K smpl:5M ep:33K epch:19.71 loss:0.087 grdn:0.245 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-09 02:00:50 celerate.py:281 step:84K smpl:5M ep:33K epch:19.75 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-09 02:02:34 celerate.py:281 step:85K smpl:5M ep:34K epch:19.80 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-09 02:04:17 celerate.py:281 step:85K smpl:5M ep:34K epch:19.85 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.511 data_s:0.003
-INFO 2025-09-09 02:05:59 celerate.py:281 step:85K smpl:5M ep:34K epch:19.89 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.507 data_s:0.004
-INFO 2025-09-09 02:07:43 celerate.py:281 step:85K smpl:5M ep:34K epch:19.94 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.505 data_s:0.011
-INFO 2025-09-09 02:09:26 celerate.py:281 step:85K smpl:5M ep:34K epch:19.99 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.392 data_s:0.124
-INFO 2025-09-09 02:11:14 celerate.py:281 step:86K smpl:5M ep:34K epch:20.03 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.435 data_s:0.100
-INFO 2025-09-09 02:12:57 celerate.py:281 step:86K smpl:5M ep:34K epch:20.08 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-09 02:14:41 celerate.py:281 step:86K smpl:6M ep:34K epch:20.13 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s:
-0.516 data_s:0.003
-INFO 2025-09-09 02:16:25 celerate.py:281 step:86K smpl:6M ep:34K epch:20.17 loss:0.085 grdn:0.234 lr:2.5e-06 updt_s:
-0.514 data_s:0.003
-INFO 2025-09-09 02:18:09 celerate.py:281 step:86K smpl:6M ep:34K epch:20.22 loss:0.085 grdn:0.238 lr:2.5e-06 updt_s:
-0.506 data_s:0.010
-INFO 2025-09-09 02:19:53 celerate.py:281 step:87K smpl:6M ep:34K epch:20.27 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.515 data_s:0.003
-INFO 2025-09-09 02:21:35 celerate.py:281 step:87K smpl:6M ep:34K epch:20.31 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.501 data_s:0.007
-INFO 2025-09-09 02:23:17 celerate.py:281 step:87K smpl:6M ep:34K epch:20.36 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s:
-0.432 data_s:0.080
-INFO 2025-09-09 02:25:01 celerate.py:281 step:87K smpl:6M ep:35K epch:20.41 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.445 data_s:0.073
-INFO 2025-09-09 02:26:43 celerate.py:281 step:87K smpl:6M ep:35K epch:20.45 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
-0.401 data_s:0.107
-INFO 2025-09-09 02:28:26 celerate.py:281 step:88K smpl:6M ep:35K epch:20.50 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.502 data_s:0.009
-INFO 2025-09-09 02:30:09 celerate.py:281 step:88K smpl:6M ep:35K epch:20.55 loss:0.087 grdn:0.248 lr:2.5e-06 updt_s:
-0.491 data_s:0.021
-INFO 2025-09-09 02:31:52 celerate.py:281 step:88K smpl:6M ep:35K epch:20.59 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.391 data_s:0.124
-INFO 2025-09-09 02:33:35 celerate.py:281 step:88K smpl:6M ep:35K epch:20.64 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.332 data_s:0.180
-INFO 2025-09-09 02:35:18 celerate.py:281 step:88K smpl:6M ep:35K epch:20.69 loss:0.087 grdn:0.243 lr:2.5e-06 updt_s:
-0.333 data_s:0.181
-INFO 2025-09-09 02:37:02 celerate.py:281 step:89K smpl:6M ep:35K epch:20.74 loss:0.086 grdn:0.245 lr:2.5e-06 updt_s:
-0.332 data_s:0.185
-INFO 2025-09-09 02:38:47 celerate.py:281 step:89K smpl:6M ep:35K epch:20.78 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.332 data_s:0.190
-INFO 2025-09-09 02:40:30 celerate.py:281 step:89K smpl:6M ep:35K epch:20.83 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.330 data_s:0.187
-INFO 2025-09-09 02:42:12 celerate.py:281 step:89K smpl:6M ep:35K epch:20.88 loss:0.085 grdn:0.243 lr:2.5e-06 updt_s:
-0.331 data_s:0.177
-INFO 2025-09-09 02:43:56 celerate.py:281 step:89K smpl:6M ep:35K epch:20.92 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.330 data_s:0.185
-INFO 2025-09-09 02:45:39 celerate.py:281 step:90K smpl:6M ep:36K epch:20.97 loss:0.087 grdn:0.252 lr:2.5e-06 updt_s:
-0.335 data_s:0.182
-INFO 2025-09-09 02:47:25 celerate.py:281 step:90K smpl:6M ep:36K epch:21.02 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s:
-0.330 data_s:0.197
-INFO 2025-09-09 02:49:08 celerate.py:281 step:90K smpl:6M ep:36K epch:21.06 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.330 data_s:0.182
-INFO 2025-09-09 02:50:51 celerate.py:281 step:90K smpl:6M ep:36K epch:21.11 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.334 data_s:0.181
-INFO 2025-09-09 02:52:34 celerate.py:281 step:90K smpl:6M ep:36K epch:21.16 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.330 data_s:0.183
-INFO 2025-09-09 02:54:17 celerate.py:281 step:91K smpl:6M ep:36K epch:21.20 loss:0.085 grdn:0.235 lr:2.5e-06 updt_s:
-0.342 data_s:0.173
-INFO 2025-09-09 02:56:01 celerate.py:281 step:91K smpl:6M ep:36K epch:21.25 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s:
-0.331 data_s:0.189
-INFO 2025-09-09 02:57:44 celerate.py:281 step:91K smpl:6M ep:36K epch:21.30 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.329 data_s:0.182
-INFO 2025-09-09 02:59:27 celerate.py:281 step:91K smpl:6M ep:36K epch:21.34 loss:0.087 grdn:0.246 lr:2.5e-06 updt_s:
-0.341 data_s:0.175
-INFO 2025-09-09 03:01:11 celerate.py:281 step:91K smpl:6M ep:36K epch:21.39 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.330 data_s:0.188
-INFO 2025-09-09 03:02:54 celerate.py:281 step:92K smpl:6M ep:36K epch:21.44 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.328 data_s:0.186
-INFO 2025-09-09 03:04:39 celerate.py:281 step:92K smpl:6M ep:36K epch:21.48 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.329 data_s:0.195
-INFO 2025-09-09 03:06:22 celerate.py:281 step:92K smpl:6M ep:36K epch:21.53 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.330 data_s:0.183
-INFO 2025-09-09 03:08:04 celerate.py:281 step:92K smpl:6M ep:37K epch:21.58 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.462 data_s:0.051
-INFO 2025-09-09 03:09:48 celerate.py:281 step:92K smpl:6M ep:37K epch:21.62 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s:
-0.407 data_s:0.108
-INFO 2025-09-09 03:11:32 celerate.py:281 step:93K smpl:6M ep:37K epch:21.67 loss:0.086 grdn:0.232 lr:2.5e-06 updt_s:
-0.333 data_s:0.185
-INFO 2025-09-09 03:13:15 celerate.py:281 step:93K smpl:6M ep:37K epch:21.72 loss:0.085 grdn:0.242 lr:2.5e-06 updt_s:
-0.329 data_s:0.187
-INFO 2025-09-09 03:14:58 celerate.py:281 step:93K smpl:6M ep:37K epch:21.77 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.357 data_s:0.156
-INFO 2025-09-09 03:16:41 celerate.py:281 step:93K smpl:6M ep:37K epch:21.81 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.487 data_s:0.027
-INFO 2025-09-09 03:18:25 celerate.py:281 step:93K smpl:6M ep:37K epch:21.86 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-09 03:20:08 celerate.py:281 step:94K smpl:6M ep:37K epch:21.91 loss:0.087 grdn:0.247 lr:2.5e-06 updt_s:
-0.512 data_s:0.003
-INFO 2025-09-09 03:21:51 celerate.py:281 step:94K smpl:6M ep:37K epch:21.95 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.508 data_s:0.004
-INFO 2025-09-09 03:23:38 celerate.py:281 step:94K smpl:6M ep:37K epch:22.00 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
-0.429 data_s:0.104
-INFO 2025-09-09 03:25:20 celerate.py:281 step:94K smpl:6M ep:37K epch:22.05 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.328 data_s:0.183
-INFO 2025-09-09 03:27:04 celerate.py:281 step:94K smpl:6M ep:37K epch:22.09 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s:
-0.329 data_s:0.191
-INFO 2025-09-09 03:28:47 celerate.py:281 step:95K smpl:6M ep:37K epch:22.14 loss:0.086 grdn:0.246 lr:2.5e-06 updt_s:
-0.329 data_s:0.186
-INFO 2025-09-09 03:30:30 celerate.py:281 step:95K smpl:6M ep:38K epch:22.19 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.346 data_s:0.166
-INFO 2025-09-09 03:32:13 celerate.py:281 step:95K smpl:6M ep:38K epch:22.23 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s:
-0.334 data_s:0.181
-INFO 2025-09-09 03:33:56 celerate.py:281 step:95K smpl:6M ep:38K epch:22.28 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.402 data_s:0.110
-INFO 2025-09-09 03:35:39 celerate.py:281 step:95K smpl:6M ep:38K epch:22.33 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s:
-0.353 data_s:0.161
-INFO 2025-09-09 03:37:22 celerate.py:281 step:96K smpl:6M ep:38K epch:22.37 loss:0.086 grdn:0.235 lr:2.5e-06 updt_s:
-0.356 data_s:0.158
-INFO 2025-09-09 03:39:04 celerate.py:281 step:96K smpl:6M ep:38K epch:22.42 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.379 data_s:0.131
-INFO 2025-09-09 03:40:49 celerate.py:281 step:96K smpl:6M ep:38K epch:22.47 loss:0.085 grdn:0.239 lr:2.5e-06 updt_s:
-0.344 data_s:0.175
-INFO 2025-09-09 03:42:32 celerate.py:281 step:96K smpl:6M ep:38K epch:22.51 loss:0.086 grdn:0.236 lr:2.5e-06 updt_s:
-0.331 data_s:0.185
-INFO 2025-09-09 03:44:15 celerate.py:281 step:96K smpl:6M ep:38K epch:22.56 loss:0.086 grdn:0.244 lr:2.5e-06 updt_s:
-0.331 data_s:0.183
-INFO 2025-09-09 03:45:58 celerate.py:281 step:97K smpl:6M ep:38K epch:22.61 loss:0.086 grdn:0.238 lr:2.5e-06 updt_s:
-0.330 data_s:0.184
-INFO 2025-09-09 03:47:43 celerate.py:281 step:97K smpl:6M ep:38K epch:22.65 loss:0.086 grdn:0.248 lr:2.5e-06 updt_s:
-0.331 data_s:0.188
-INFO 2025-09-09 03:49:27 celerate.py:281 step:97K smpl:6M ep:38K epch:22.70 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.333 data_s:0.185
-INFO 2025-09-09 03:51:10 celerate.py:281 step:97K smpl:6M ep:39K epch:22.75 loss:0.085 grdn:0.241 lr:2.5e-06 updt_s:
-0.330 data_s:0.185
-INFO 2025-09-09 03:52:54 celerate.py:281 step:97K smpl:6M ep:39K epch:22.79 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s:
-0.330 data_s:0.192
-INFO 2025-09-09 03:54:37 celerate.py:281 step:98K smpl:6M ep:39K epch:22.84 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.329 data_s:0.185
-INFO 2025-09-09 03:56:21 celerate.py:281 step:98K smpl:6M ep:39K epch:22.89 loss:0.086 grdn:0.237 lr:2.5e-06 updt_s:
-0.329 data_s:0.187
-INFO 2025-09-09 03:58:04 celerate.py:281 step:98K smpl:6M ep:39K epch:22.94 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.329 data_s:0.185
-INFO 2025-09-09 03:59:46 celerate.py:281 step:98K smpl:6M ep:39K epch:22.98 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s:
-0.329 data_s:0.183
-INFO 2025-09-09 04:01:32 celerate.py:281 step:98K smpl:6M ep:39K epch:23.03 loss:0.087 grdn:0.250 lr:2.5e-06 updt_s:
-0.376 data_s:0.151
-INFO 2025-09-09 04:03:16 celerate.py:281 step:99K smpl:6M ep:39K epch:23.08 loss:0.086 grdn:0.241 lr:2.5e-06 updt_s:
-0.329 data_s:0.187
-INFO 2025-09-09 04:04:59 celerate.py:281 step:99K smpl:6M ep:39K epch:23.12 loss:0.086 grdn:0.243 lr:2.5e-06 updt_s:
-0.379 data_s:0.136
-INFO 2025-09-09 04:06:42 celerate.py:281 step:99K smpl:6M ep:39K epch:23.17 loss:0.086 grdn:0.240 lr:2.5e-06 updt_s:
-0.513 data_s:0.003
-INFO 2025-09-09 04:08:25 celerate.py:281 step:99K smpl:6M ep:39K epch:23.22 loss:0.086 grdn:0.242 lr:2.5e-06 updt_s:
-0.510 data_s:0.003
-INFO 2025-09-09 04:10:07 celerate.py:281 step:99K smpl:6M ep:39K epch:23.26 loss:0.087 grdn:0.242 lr:2.5e-06 updt_s:
-0.470 data_s:0.039
-INFO 2025-09-09 04:11:50 celerate.py:281 step:100K smpl:6M ep:39K epch:23.31 loss:0.086 grdn:0.247 lr:2.5e-06 updt_s
-:0.347 data_s:0.169
-INFO 2025-09-09 04:13:34 celerate.py:281 step:100K smpl:6M ep:40K epch:23.36 loss:0.085 grdn:0.237 lr:2.5e-06 updt_s
-:0.330 data_s:0.185
-INFO 2025-09-09 04:15:17 celerate.py:281 step:100K smpl:6M ep:40K epch:23.40 loss:0.086 grdn:0.239 lr:2.5e-06 updt_s
-:0.356 data_s:0.156
-INFO 2025-09-09 04:15:17 celerate.py:295 Checkpoint policy after step 100000
-INFO 2025-09-09 04:15:18 celerate.py:359 End of training
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ clear
-(lerobot) jade_choghari@hf-dgx-01:~/lerobot$ tmux capture-pane -pS - > tmux_log.txt
-
-
-
-
-
-
-
-
-
-

From a19d7fb6bfdcd55b3c8475d1a249b7f815cda092 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 11:51:53 +0000
Subject: [PATCH 06/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 examples/6_evaluate_libero.sh                 |   2 +-
 src/lerobot/envs/factory.py                   |   8 +-
 src/lerobot/envs/libero.py                    |  55 ++--
 src/lerobot/envs/utils.py                     |   7 +-
 src/lerobot/policies/factory.py               |   2 +-
 src/lerobot/policies/normalize.py             |   6 +-
 .../policies/smolpi0/configuration_smolpi0.py |  22 +-
 .../policies/smolpi0/flex_attention.py        |  10 +-
 .../policies/smolpi0/modeling_smolpi0.py      | 211 +++++++++-----
 .../policies/smolpi0/smolvlm_with_expert.py   | 262 ++++++++++++------
 .../policies/smolvla/modeling_smolvla.py      |   3 +-
 src/lerobot/policies/smolvla/saver.txt        |   2 +-
 .../policies/smolvla/smolvlm_with_expert.py   |   2 +-
 src/lerobot/scripts/eval.py                   |  95 ++++---
 src/lerobot/scripts/train.py                  |  17 +-
 src/lerobot/scripts/train_2.py                |  10 +-
 src/lerobot/scripts/train_accelerate.py       |   9 +-
 17 files changed, 469 insertions(+), 254 deletions(-)

diff --git a/examples/6_evaluate_libero.sh b/examples/6_evaluate_libero.sh
index 46355dfa1..a7e235fcd 100644
--- a/examples/6_evaluate_libero.sh
+++ b/examples/6_evaluate_libero.sh
@@ -55,4 +55,4 @@ python src/lerobot/scripts/eval.py \
 #     --num_trials_per_task 10 \
 #     --video_out_path "data/libero/videos" \
 #     --device "cuda" \
-#     --seed 7
\ No newline at end of file
+#     --seed 7
diff --git a/src/lerobot/envs/factory.py b/src/lerobot/envs/factory.py
index d38b2eed3..b74af7276 100644
--- a/src/lerobot/envs/factory.py
+++ b/src/lerobot/envs/factory.py
@@ -61,9 +61,9 @@ def make_env(
 
     env_cls = gym.vector.AsyncVectorEnv if use_async_envs else gym.vector.SyncVectorEnv
 
-
     if "libero" in cfg.type:
         from lerobot.envs.libero import create_libero_envs
+
         return create_libero_envs(
             task=cfg.task,
             n_envs=n_envs,
@@ -74,17 +74,16 @@ def make_env(
             multitask_eval=cfg.multitask_eval,
         )
 
-    
     package_name = f"gym_{cfg.type}"
     try:
         importlib.import_module(package_name)
     except ModuleNotFoundError as e:
         raise ModuleNotFoundError(
-            f"{package_name} is not installed. Install with: pip install \"lerobot[{cfg.type}]\""
+            f'{package_name} is not installed. Install with: pip install "lerobot[{cfg.type}]"'
         ) from e
 
     gym_handle = f"{package_name}/{cfg.task}"
-    
+
     def _make_one():
         return gym.make(gym_handle, disable_env_checker=True, **(cfg.gym_kwargs or {}))
 
@@ -93,4 +92,3 @@ def make_env(
     # normalize to {suite: {task_id: vec_env}} for consistency
     suite_name = cfg.type  # e.g., "pusht", "aloha"
     return {suite_name: {0: vec}}
-
diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py
index ff1574416..0672f583f 100644
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+import logging
 import math
 import os
 from collections import defaultdict
 from collections.abc import Callable
 from itertools import chain
-from typing import Any
+from typing import Any, Dict, List
+from collections.abc import Callable, Iterable, Mapping, Sequence
 
 import gymnasium as gym
 import numpy as np
@@ -14,16 +16,12 @@ from gymnasium import spaces
 from libero.libero import benchmark, get_libero_path
 from libero.libero.envs import OffScreenRenderEnv
 
-import logging
-from collections import defaultdict
-from typing import Any, Callable, Dict, Iterable, List, Mapping, Sequence
-
-
 logger = logging.getLogger(__name__)
 
 # ---- Helpers -----------------------------------------------------------------
 
-def _parse_camera_names(camera_name: str | Sequence[str]) -> List[str]:
+
+def _parse_camera_names(camera_name: str | Sequence[str]) -> list[str]:
     """Normalize camera_name into a non-empty list of strings."""
     if isinstance(camera_name, str):
         cams = [c.strip() for c in camera_name.split(",") if c.strip()]
@@ -47,14 +45,14 @@ def _get_suite(name: str):
     return suite
 
 
-def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> List[int]:
+def _select_task_ids(total_tasks: int, task_ids: Iterable[int] | None) -> list[int]:
     """Validate/normalize task ids. If None → all tasks."""
     if task_ids is None:
         return list(range(total_tasks))
-    ids = sorted(set(int(t) for t in task_ids))
+    ids = sorted({int(t) for t in task_ids})
     for t in ids:
         if t < 0 or t >= total_tasks:
-            raise ValueError(f"task_id {t} out of range [0, {total_tasks-1}].")
+            raise ValueError(f"task_id {t} out of range [0, {total_tasks - 1}].")
     return ids
 
 
@@ -64,16 +62,25 @@ def _make_env_fns(
     suite_name: str,
     task_id: int,
     n_envs: int,
-    camera_names: List[str],
+    camera_names: list[str],
     init_states: bool,
     gym_kwargs: Mapping[str, Any],
     LiberoEnv: type,  # injected to avoid forward ref issues if needed
-) -> List[Callable[[], "LiberoEnv"]]:
+) -> list[Callable[[], LiberoEnv]]:
     """Build n_envs factory callables for a single (suite, task_id)."""
     joined_cams = ",".join(camera_names)  # keep backward-compat: downstream expects a string
-    fns: List[Callable[[], "LiberoEnv"]] = []
+    fns: list[Callable[[], LiberoEnv]] = []
     for i in range(n_envs):
-        def _mk(i=i, suite=suite, task_id=task_id, suite_name=suite_name, joined_cams=joined_cams, init_states=init_states, gym_kwargs=dict(gym_kwargs)):
+
+        def _mk(
+            i=i,
+            suite=suite,
+            task_id=task_id,
+            suite_name=suite_name,
+            joined_cams=joined_cams,
+            init_states=init_states,
+            gym_kwargs=dict(gym_kwargs),
+        ):
             return LiberoEnv(
                 task_suite=suite,
                 task_id=task_id,
@@ -83,11 +90,14 @@ def _make_env_fns(
                 episode_index=i,
                 **gym_kwargs,
             )
+
         fns.append(_mk)
     return fns
 
+
 # ---- Main API ----------------------------------------------------------------
 
+
 def create_libero_envs(
     task: str,
     n_envs: int,
@@ -130,12 +140,15 @@ def create_libero_envs(
 
     logger.info(
         "Creating LIBERO envs | suites=%s | n_envs(per task)=%d | init_states=%s | multitask_eval=%s",
-        suite_names, n_envs, init_states, bool(multitask_eval)
+        suite_names,
+        n_envs,
+        init_states,
+        bool(multitask_eval),
     )
     if task_ids_filter is not None:
         logger.info("Restricting to task_ids=%s", task_ids_filter)
 
-    out: Dict[str, Dict[int, Any]] = defaultdict(dict)
+    out: dict[str, dict[int, Any]] = defaultdict(dict)
 
     for suite_name in suite_names:
         suite = _get_suite(suite_name)
@@ -161,6 +174,8 @@ def create_libero_envs(
 
     # return plain dicts for predictability
     return {suite: dict(task_map) for suite, task_map in out.items()}
+
+
 def quat2axisangle(quat):
     """
     Copied from robosuite: https://github.com/ARISE-Initiative/robosuite/blob/eafb81f54ffc104f905ee48a16bb15f059176ad3/robosuite/utils/transform_utils.py#L490C1-L512C55
@@ -256,10 +271,10 @@ class LiberoEnv(gym.Env):
         self._env = self._make_envs_task(task_suite, self.task_id)
         TASK_SUITE_MAX_STEPS: dict[str, int] = {
             "libero_spatial": 220,  # longest training demo has 193 steps
-            "libero_object": 280,   # longest training demo has 254 steps
-            "libero_goal": 300,     # longest training demo has 270 steps
-            "libero_10": 520,       # longest training demo has 505 steps
-            "libero_90": 400,       # longest training demo has 373 steps
+            "libero_object": 280,  # longest training demo has 254 steps
+            "libero_goal": 300,  # longest training demo has 270 steps
+            "libero_10": 520,  # longest training demo has 505 steps
+            "libero_90": 400,  # longest training demo has 373 steps
         }
         default_steps = 500
         self._max_episode_steps = TASK_SUITE_MAX_STEPS.get(task_suite_name, default_steps)
diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py
index 9490f670e..d65f7f29f 100644
--- a/src/lerobot/envs/utils.py
+++ b/src/lerobot/envs/utils.py
@@ -80,6 +80,7 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten
 
     return return_observations
 
+
 def preprocess_observation1(
     observations: dict[str, np.ndarray], cfg: dict[str, Any] = None
 ) -> dict[str, Tensor]:
@@ -130,6 +131,8 @@ def preprocess_observation1(
     if "task" in observations:
         return_observations["task"] = observations["task"]
     return return_observations
+
+
 def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
     # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is
     # (need to also refactor preprocess_observation and externalize normalization from policies)
@@ -183,6 +186,7 @@ def add_envs_task(env: gym.vector.VectorEnv, observation: dict[str, Any]) -> dic
         observation["task"] = ["" for _ in range(num_envs)]
     return observation
 
+
 def _close_single_env(env: Any) -> None:
     """Try to close a single env object if it exposes .close()."""
     try:
@@ -193,6 +197,7 @@ def _close_single_env(env: Any) -> None:
         # Best-effort close: log but don't raise
         LOG.debug("Exception while closing env %s: %s", env, exc)
 
+
 def close_envs(env_or_collection: Any) -> None:
     """
     Close a single env or any nested structure of envs.
@@ -225,4 +230,4 @@ def close_envs(env_or_collection: Any) -> None:
 
     # Fallback: try to close if possible
     if hasattr(env_or_collection, "close"):
-        _close_single_env(env_or_collection)
\ No newline at end of file
+        _close_single_env(env_or_collection)
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index 03fa44a2a..cc1b0480d 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -31,10 +31,10 @@ from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig
 from lerobot.policies.pretrained import PreTrainedPolicy
 from lerobot.policies.sac.configuration_sac import SACConfig
 from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig
+from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
 from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
 from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
 from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
-from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
 
 
 def get_policy_class(name: str) -> PreTrainedPolicy:
diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py
index 043265b1b..646c330cb 100644
--- a/src/lerobot/policies/normalize.py
+++ b/src/lerobot/policies/normalize.py
@@ -309,8 +309,8 @@ class NormalizePerRobotType(nn.Module):
                 getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types
             ]
             if norm_mode is NormalizationMode.MEAN_STD:
-                mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))],dim=0)
-                std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))],dim=0)
+                mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0)
+                std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0)
                 if batch[key].ndim == 3:
                     mean = mean.unsqueeze(1)
                     std = std.unsqueeze(1)
@@ -332,6 +332,8 @@ class NormalizePerRobotType(nn.Module):
             else:
                 raise ValueError(norm_mode)
         return batch
+
+
 # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization
 #       and remove the `Normalize` and `Unnormalize` classes.
 def _initialize_stats_buffers(
diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py
index c3605cd82..e39d17f15 100644
--- a/src/lerobot/policies/smolpi0/configuration_smolpi0.py
+++ b/src/lerobot/policies/smolpi0/configuration_smolpi0.py
@@ -14,12 +14,12 @@
 
 from dataclasses import dataclass, field
 
+from lerobot.configs.policies import PreTrainedConfig
+from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
 from lerobot.optim.optimizers import AdamWConfig
 from lerobot.optim.schedulers import (
     CosineDecayWithWarmupSchedulerConfig,
 )
-from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
 
 
 @dataclass
@@ -52,7 +52,7 @@ class SMOLPI0Config(PreTrainedConfig):
     max_action_dim: int = 32
 
     # Image preprocessing
-    resize_imgs_with_padding: tuple[int, int] = (512, 512) #(224, 224)
+    resize_imgs_with_padding: tuple[int, int] = (512, 512)  # (224, 224)
 
     # Add empty images. Used by pi0_aloha_sim which adds the empty
     # left and right wrist cameras in addition to the top camera.
@@ -107,14 +107,14 @@ class SMOLPI0Config(PreTrainedConfig):
 
     add_image_special_tokens: bool = False
     add_prompt_template: bool = False
-    prefix_prompt_template: str = f"<|im_start|>User: What action should the robot take to"
-    suffix_prompt_template: str = f"?\nAssistant:"
+    prefix_prompt_template: str = "<|im_start|>User: What action should the robot take to"
+    suffix_prompt_template: str = "?\nAssistant:"
 
     attention_mode: str = "self_attn"
 
-    prefix_length: int = -1 # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length
+    prefix_length: int = -1  # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length
 
-    past_obs_keys: str = f"image"
+    past_obs_keys: str = "image"
 
     add_local_special_image_tokens: bool = False
 
@@ -122,7 +122,7 @@ class SMOLPI0Config(PreTrainedConfig):
 
     state_to_prefix: bool = False
 
-    pad_language_to: str = "longest" # "max_length"
+    pad_language_to: str = "longest"  # "max_length"
 
     num_expert_layers: int = -1
     num_vlm_layers: int = -1
@@ -144,9 +144,9 @@ class SMOLPI0Config(PreTrainedConfig):
 
     shuffle_camera_positions: bool = False
     vlm_img_size: int = -1
-    
+
     regression_loss: bool = False
-    
+
     def __post_init__(self):
         super().__post_init__()
         if self.vlm_img_size > 0:
@@ -198,7 +198,7 @@ class SMOLPI0Config(PreTrainedConfig):
         )
 
     @property
-    def observation_delta_indices(self) -> list: # FIXME(mshukor): support spacing between observations
+    def observation_delta_indices(self) -> list:  # FIXME(mshukor): support spacing between observations
         return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, self.n_obs_gap)][::-1]
 
     @property
diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py
index 13950f743..732920af2 100644
--- a/src/lerobot/policies/smolpi0/flex_attention.py
+++ b/src/lerobot/policies/smolpi0/flex_attention.py
@@ -85,7 +85,7 @@ def flex_attention_forward(
 
     b_mask, h_mask, q_len, kv_len = causal_mask.shape  # The shape of your mask
 
-    block_size = 128 # limitation of flex attention
+    block_size = 128  # limitation of flex attention
     q_len_rounded = _round_up_to_multiple(q_len, block_size)
     kv_len_rounded = _round_up_to_multiple(kv_len, block_size)
 
@@ -95,7 +95,7 @@ def flex_attention_forward(
     pad_k = kv_len_rounded - kv_len
     if pad_q > 0 or pad_k > 0:
         padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0)
-    else: 
+    else:
         padded_causal_mask = causal_mask
     mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask)
 
@@ -107,21 +107,21 @@ def flex_attention_forward(
         KV_LEN=kv_len_rounded,
         device=causal_mask.device,
     )
-    
+
     mask_mod_fn_padded = precomputed_mask_factory(mask_4d)
     # FIXME(mshukor): compile mask torch.compile(create_block_mask)
     create_block_mask_compiled = torch.compile(create_block_mask)
     block_mask = create_block_mask_compiled(
         mask_mod=mask_mod_fn_padded,
         B=b_mask,
-        H=None, # 
+        H=None,  #
         Q_LEN=q_len_rounded,
         KV_LEN=kv_len_rounded,
         BLOCK_SIZE=block_size,
         device=causal_mask.device,
         _compile=False,
     )
-    padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0  else query_states
+    padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states
     padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states
     padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states
     #  mask is applied inside the kernel, ideally more efficiently than score_mod.
diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py
index 9a128f7b6..765a5901a 100644
--- a/src/lerobot/policies/smolpi0/modeling_smolpi0.py
+++ b/src/lerobot/policies/smolpi0/modeling_smolpi0.py
@@ -50,9 +50,10 @@ policy = Pi0Policy.from_pretrained("lerobot/pi0")
 """
 
 import math
-from collections import deque
 import os
 import re
+from collections import deque
+
 import safetensors
 import torch
 import torch.nn.functional as F  # noqa: N812
@@ -66,12 +67,11 @@ from lerobot.policies.normalize import (
     Unnormalize,
     UnnormalizePerRobotType,
 )
-from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
-from lerobot.policies.smolpi0.smolvlm_with_expert import (
-    SmolVLMWithExpertModel
-)
 from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
+from lerobot.policies.smolpi0.smolvlm_with_expert import SmolVLMWithExpertModel
 from lerobot.utils.utils import get_safe_dtype
+
 OBS_IMAGE = "observation.image"
 OBS_IMAGES = "observation.images"
 ACTION = "action"
@@ -86,10 +86,13 @@ IMAGES_ORDER = {
     OBS_IMAGE_3: 2,
     OBS_IMAGE_4: 3,
 }
+import random
+
 from lerobot.policies.utils import (
     populate_queues,
 )
-import random
+
+
 def create_sinusoidal_pos_embedding(
     time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
 ) -> Tensor:
@@ -171,7 +174,10 @@ def resize_with_pad(img, width, height, pad_value=-1):
     padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
     return padded_img
 
+
 _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
+
+
 def canonicalise(k: str) -> str:
     """
     Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a
@@ -179,6 +185,7 @@ def canonicalise(k: str) -> str:
     """
     return _VARIANT_RE.sub(".buffer_", k)
 
+
 def standardise_state_dict(
     checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True
 ) -> tuple[dict[str, torch.Tensor], list[str]]:
@@ -209,6 +216,7 @@ def standardise_state_dict(
     out.update({k: checkpoint[k] for k in unmatched})
     return out, unmatched
 
+
 def load_smolvla(
     model: torch.nn.Module,
     filename: str | os.PathLike,
@@ -237,6 +245,8 @@ def load_smolvla(
         )
 
     return model
+
+
 def pad_vector(vector, new_dim):
     """Can be (batch_size x sequence_length x features_dimension)
     or (batch_size x features_dimension)
@@ -286,6 +296,7 @@ def aloha_gripper_to_angular(value):
     # The values 0.4 and 1.5 were measured on an actual Trossen robot.
     return normalize(value, min_val=0.4, max_val=1.5)
 
+
 def rename_checkpoint_keys(checkpoint: dict, rename_str: str):
     """
     Renames keys in a checkpoint dictionary based on the given rename string.
@@ -307,6 +318,8 @@ def rename_checkpoint_keys(checkpoint: dict, rename_str: str):
                 k = k.replace(old_key, new_key)
         new_checkpoint[k] = v
     return new_checkpoint
+
+
 def aloha_gripper_from_angular(value):
     # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha.
     # Note that the units are still angular but the range is different.
@@ -324,6 +337,7 @@ def aloha_gripper_from_angular_inv(value):
     value = unnormalize(value, min_val=-0.6213, max_val=1.4910)
     return normalize(value, min_val=0.4, max_val=1.5)
 
+
 class SMOLPI0Policy(PreTrainedPolicy):
     """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot."""
 
@@ -374,7 +388,9 @@ class SMOLPI0Policy(PreTrainedPolicy):
 
         self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer
         self.model = VLAFlowMatching(config)
-        self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split(",")
+        self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split(
+            ","
+        )
         self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",")
         self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1
         self.reset()
@@ -389,31 +405,20 @@ class SMOLPI0Policy(PreTrainedPolicy):
             for k in self.config.input_features:
                 if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]):
                     self._queues[k] = deque(maxlen=self.config.n_obs_steps)
-    
+
     def get_optim_params(self) -> dict:
         if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr:
             params = [
+                {"params": [p for n, p in self.named_parameters() if ".vlm." not in n and p.requires_grad]},
                 {
-                    "params": [
-                        p
-                        for n, p in self.named_parameters()
-                        if not ".vlm." in n and p.requires_grad
-                    ]
-                },
-                {
-                    "params": [
-                        p
-                        for n, p in self.named_parameters()
-                        if ".vlm." in n and p.requires_grad
-                    ],
+                    "params": [p for n, p in self.named_parameters() if ".vlm." in n and p.requires_grad],
                     "lr": self.config.optimizer_lr_vlm,
                 },
             ]
             return params
-            
+
         else:
             return self.parameters()
-            
 
     def merge_peft_model_weights(self) -> None:
         if "lora" in self.config.peft_method:
@@ -438,9 +443,7 @@ class SMOLPI0Policy(PreTrainedPolicy):
         state = self.prepare_state(batch)
         lang_tokens, lang_masks = self.prepare_language(batch)
 
-        actions = self.model.sample_actions(
-            images, img_masks, lang_tokens, lang_masks, state, noise=noise
-        )
+        actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise)
         # Unpad actions
         original_action_dim = self.config.action_feature.shape[0]
         actions = actions[:, :, :original_action_dim]
@@ -469,6 +472,7 @@ class SMOLPI0Policy(PreTrainedPolicy):
             device=map_location,
             checkpoint_keys_mapping="model._orig_mod.//model.",
         )
+
     @torch.no_grad
     def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
         """Select a single action given environment observations.
@@ -564,8 +568,12 @@ class SMOLPI0Policy(PreTrainedPolicy):
         present_img_keys = [key for key in self.config.image_features if key in batch]
         missing_img_keys = [key for key in self.config.image_features if key not in batch]
 
-        present_img_keys = sorted(present_img_keys, key=lambda k: IMAGES_ORDER.get(k, float("inf")), reverse=self.config.reverse_images_order)
-        if self.config.shuffle_camera_positions and ACTION in batch: # only during training
+        present_img_keys = sorted(
+            present_img_keys,
+            key=lambda k: IMAGES_ORDER.get(k, float("inf")),
+            reverse=self.config.reverse_images_order,
+        )
+        if self.config.shuffle_camera_positions and ACTION in batch:  # only during training
             present_img_keys = random.sample(present_img_keys, len(present_img_keys))
         if len(present_img_keys) == 0:
             raise ValueError(
@@ -609,7 +617,10 @@ class SMOLPI0Policy(PreTrainedPolicy):
             tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])]
 
         if self.config.add_prompt_template:
-            tasks = [f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}" for task in tasks]
+            tasks = [
+                f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}"
+                for task in tasks
+            ]
         else:
             tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
         tokenized_prompt = self.language_tokenizer.__call__(
@@ -618,7 +629,7 @@ class SMOLPI0Policy(PreTrainedPolicy):
             padding_side="right",
             max_length=self.config.tokenizer_max_length,
             return_tensors="pt",
-            truncation=True, # FIXME(mshukor)
+            truncation=True,  # FIXME(mshukor)
         )
 
         lang_tokens = tokenized_prompt["input_ids"].to(device=device)
@@ -655,7 +666,11 @@ class SMOLPI0Policy(PreTrainedPolicy):
 
     def prepare_state(self, batch):
         """Pad state"""
-        state = batch[OBS_STATE][:, -1, :] if (batch[OBS_STATE].ndim > 2 and not self.include_past_states) else batch[OBS_STATE] # FIXME(mshukor): no state history for now
+        state = (
+            batch[OBS_STATE][:, -1, :]
+            if (batch[OBS_STATE].ndim > 2 and not self.include_past_states)
+            else batch[OBS_STATE]
+        )  # FIXME(mshukor): no state history for now
         state = pad_vector(state, self.config.max_state_dim)
         return state
 
@@ -666,7 +681,9 @@ class SMOLPI0Policy(PreTrainedPolicy):
             if self.config.relative_actions_mode == "first":
                 actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1)
             elif self.config.relative_actions_mode == "state":
-                assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], "Relative action mode 'state' requires the action and state to have the same dimension."
+                assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], (
+                    "Relative action mode 'state' requires the action and state to have the same dimension."
+                )
                 if state.ndim == 2:
                     state = state.unsqueeze(1)
                 actions = actions - state
@@ -674,6 +691,7 @@ class SMOLPI0Policy(PreTrainedPolicy):
                 actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1)
         return actions
 
+
 def pad_tensor(tensor, max_len, pad_value=0):
     """
     Efficiently pads a tensor along sequence dimension to match max_len.
@@ -687,13 +705,16 @@ def pad_tensor(tensor, max_len, pad_value=0):
         torch.Tensor: Shape (B, max_len, ...) or (B, max_len).
     """
     B, L = tensor.shape[:2]
-    
+
     # Create a padded tensor of max_len and copy the existing values
-    padded_tensor = torch.full((B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device)
+    padded_tensor = torch.full(
+        (B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device
+    )
     padded_tensor[:, :L] = tensor  # Efficient in-place copy
 
     return padded_tensor
 
+
 class VLAFlowMatching(nn.Module):
     """
     π0: A Vision-Language-Action Flow Model for General Robot Control
@@ -725,7 +746,8 @@ class VLAFlowMatching(nn.Module):
         super().__init__()
         self.config = config
 
-        self.vlm_with_expert = SmolVLMWithExpertModel(model_id=self.config.vlm_model_name, 
+        self.vlm_with_expert = SmolVLMWithExpertModel(
+            model_id=self.config.vlm_model_name,
             freeze_vision_encoder=self.config.freeze_vision_encoder,
             train_expert_only=self.config.train_expert_only,
             attention_implementation=self.config.attention_implementation,
@@ -736,50 +758,64 @@ class VLAFlowMatching(nn.Module):
             self_attn_every_n_layers=self.config.self_attn_every_n_layers,
             expert_width_multiplier=self.config.expert_width_multiplier,
             self_attn_only_actions=self.config.self_attn_only_actions,
-            )
+        )
         # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert)
         self.vlm_with_expert.configure_peft(config=self.config)
         # Projections are float32
         self.state_to_prefix = self.config.state_to_prefix
         if self.state_to_prefix:
-            self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size)
+            self.state_proj = nn.Linear(
+                self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size
+            )
         else:
             self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size)
         self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size)
         self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim)
 
-        self.action_time_mlp_in = nn.Linear(self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size)
-        self.action_time_mlp_out = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size)
+        self.action_time_mlp_in = nn.Linear(
+            self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size
+        )
+        self.action_time_mlp_out = nn.Linear(
+            self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size
+        )
 
         self.set_requires_grad()
         # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ...
-        if  any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]):
+        if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]):
             if "SmolVLM-Instruct" in self.config.vlm_model_name:
-                self.fake_image_token =  49152
-                self.global_image_token =  [44, 13906, 29, 6266, 46]
-                self.global_image_start_token = torch.tensor([self.fake_image_token] + self.global_image_token, dtype=torch.long)
+                self.fake_image_token = 49152
+                self.global_image_token = [44, 13906, 29, 6266, 46]
+                self.global_image_start_token = torch.tensor(
+                    [self.fake_image_token] + self.global_image_token, dtype=torch.long
+                )
             else:
-                self.fake_image_token =  49189
-                self.global_image_token =  49152
-                self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long)
+                self.fake_image_token = 49189
+                self.global_image_token = 49152
+                self.global_image_start_token = torch.tensor(
+                    [self.fake_image_token, self.global_image_token], dtype=torch.long
+                )
         else:
             self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id
             self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id
-            self.global_image_start_token = torch.tensor([self.fake_image_token, self.global_image_token], dtype=torch.long)
+            self.global_image_start_token = torch.tensor(
+                [self.fake_image_token, self.global_image_token], dtype=torch.long
+            )
 
         self.add_image_special_tokens = self.config.add_image_special_tokens
         self.add_local_special_image_tokens = self.config.add_local_special_image_tokens
-        self.local_image_tokens = [torch.tensor([self.fake_image_token, tok], dtype=torch.long) for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]] # assume 3 x 3 grid
-        
+        self.local_image_tokens = [
+            torch.tensor([self.fake_image_token, tok], dtype=torch.long)
+            for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]
+        ]  # assume 3 x 3 grid
+
         self.local_image_start_token = self.global_image_start_token
         self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long)
         self.prefix_length = self.config.prefix_length
-        self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",")
+        self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(
+            ","
+        )
         self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1
         self.causal_attention_on_history = self.config.causal_attention_on_history
-        
-        
-        
 
     # def configure_peft(self, model):
     #     # return model
@@ -845,14 +881,30 @@ class VLAFlowMatching(nn.Module):
             img,
             img_mask,
         ) in enumerate(zip(images, img_masks, strict=False)):
-            # FIXME(mshukor):  add special tokens for the history each history_steps or not 
+            # FIXME(mshukor):  add special tokens for the history each history_steps or not
             if self.add_image_special_tokens:
                 if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1:
                     local_token_idx = img_idx % num_images
-                    image_start_token = self.vlm_with_expert.embed_language_tokens(self.local_image_tokens[local_token_idx].to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1)
+                    image_start_token = (
+                        self.vlm_with_expert.embed_language_tokens(
+                            self.local_image_tokens[local_token_idx].to(
+                                device=self.vlm_with_expert.vlm.device
+                            )
+                        )
+                        .unsqueeze(0)
+                        .expand(img.shape[0], -1, -1)
+                    )
                 else:
-                    image_start_token = self.vlm_with_expert.embed_language_tokens(self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1)
-                image_start_mask = torch.ones_like(image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device)
+                    image_start_token = (
+                        self.vlm_with_expert.embed_language_tokens(
+                            self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)
+                        )
+                        .unsqueeze(0)
+                        .expand(img.shape[0], -1, -1)
+                    )
+                image_start_mask = torch.ones_like(
+                    image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device
+                )
                 if self.causal_attention_on_history and img_idx % num_images == 0:
                     att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1)
                 else:
@@ -861,7 +913,7 @@ class VLAFlowMatching(nn.Module):
                 pad_masks.append(image_start_mask)
 
             img_emb = self.vlm_with_expert.embed_image(img)
-            img_emb = img_emb #.to(dtype=self.vlm_with_expert.type)
+            img_emb = img_emb  # .to(dtype=self.vlm_with_expert.type)
 
             # Normalize image embeddings
             img_emb_dim = img_emb.shape[-1]
@@ -880,16 +932,26 @@ class VLAFlowMatching(nn.Module):
 
             att_masks += [0] * (num_img_embs)
             if self.add_image_special_tokens:
-                if not self.add_local_special_image_tokens or (self.add_local_special_image_tokens and img_idx % num_images == num_images - 1):
-                    image_end_token = self.vlm_with_expert.embed_language_tokens(self.image_end_token.to(device=self.vlm_with_expert.vlm.device)).unsqueeze(0).expand(img.shape[0], -1, -1)
-                    image_end_mask = torch.ones_like(image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device)
+                if not self.add_local_special_image_tokens or (
+                    self.add_local_special_image_tokens and img_idx % num_images == num_images - 1
+                ):
+                    image_end_token = (
+                        self.vlm_with_expert.embed_language_tokens(
+                            self.image_end_token.to(device=self.vlm_with_expert.vlm.device)
+                        )
+                        .unsqueeze(0)
+                        .expand(img.shape[0], -1, -1)
+                    )
+                    image_end_mask = torch.ones_like(
+                        image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device
+                    )
                     embs.append(image_end_token)
                     pad_masks.append(image_end_mask)
                     att_masks += [0] * (image_end_mask.shape[1])
         lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens)
         # Normalize language embeddings
         lang_emb_dim = lang_emb.shape[-1]
-        lang_emb = lang_emb * math.sqrt(lang_emb_dim) # FIXME(mshukor): is this needed for smolvlm?
+        lang_emb = lang_emb * math.sqrt(lang_emb_dim)  # FIXME(mshukor): is this needed for smolvlm?
 
         embs.append(lang_emb)
         pad_masks.append(lang_masks)
@@ -900,7 +962,9 @@ class VLAFlowMatching(nn.Module):
 
         if state is not None and self.state_to_prefix:
             state_emb = self.state_proj(state)
-            state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type)
+            state_emb = (
+                state_emb[:, None, :] if state_emb.ndim == 2 else state_emb
+            )  # .to(dtype=self.vlm_with_expert.type)
             embs.append(state_emb)
             bsize = state_emb.shape[0]
             dtype = state_emb.dtype
@@ -912,7 +976,7 @@ class VLAFlowMatching(nn.Module):
 
             # Set attention masks so that image and language inputs do not attend to state or actions
             # att_masks += [1] + [0]*(states_seq_len - 1)
-            att_masks += [1]*(states_seq_len)
+            att_masks += [1] * (states_seq_len)
         embs = torch.cat(embs, dim=1)
         pad_masks = torch.cat(pad_masks, dim=1)
         att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
@@ -937,7 +1001,9 @@ class VLAFlowMatching(nn.Module):
         # Embed state
         if not self.state_to_prefix:
             state_emb = self.state_proj(state)
-            state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb #.to(dtype=self.vlm_with_expert.type)
+            state_emb = (
+                state_emb[:, None, :] if state_emb.ndim == 2 else state_emb
+            )  # .to(dtype=self.vlm_with_expert.type)
             embs.append(state_emb)
             bsize = state_emb.shape[0]
             dtype = state_emb.dtype
@@ -948,8 +1014,7 @@ class VLAFlowMatching(nn.Module):
             pad_masks.append(state_mask)
 
             # Set attention masks so that image and language inputs do not attend to state or actions
-            att_masks += [1] + [0]*(states_seq_len - 1)
-
+            att_masks += [1] + [0] * (states_seq_len - 1)
 
         # Fuse timestep + action information using an MLP
         action_emb = self.action_in_proj(noisy_actions)
@@ -1010,7 +1075,7 @@ class VLAFlowMatching(nn.Module):
             images, img_masks, lang_tokens, lang_masks, state=state
         )
         suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time)
-        
+
         pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
         att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
 
@@ -1061,12 +1126,12 @@ class VLAFlowMatching(nn.Module):
             x_t = torch.zeros_like(noise, dtype=torch.float32, device=device)
             expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device)
             x_t = self.denoise_step(
-                                state,
-                                prefix_pad_masks,
-                                past_key_values,
-                                x_t,
-                                expanded_time,
-                            )
+                state,
+                prefix_pad_masks,
+                past_key_values,
+                x_t,
+                expanded_time,
+            )
         else:
             dt = -1.0 / self.config.num_steps
             dt = torch.tensor(dt, dtype=torch.float32, device=device)
diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py
index b910679f1..0ccdcccc8 100644
--- a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py
+++ b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py
@@ -12,31 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import List, Optional, Union
-from functools import partial
 import copy
+from functools import partial
+from typing import List, Optional, Union
 
 import torch
-import torch.version
 import torch.nn.functional as F  # noqa: N812
+import torch.version
 from peft import LoraConfig, TaskType, get_peft_model
 from pytest import Cache
 from torch import nn
 from transformers import (
     AutoConfig,
-    GemmaForCausalLM,
-    AutoModelForImageTextToText,
-    AutoProcessor,
-    PretrainedConfig,
-    PreTrainedModel,
-    SmolVLMForConditionalGeneration,
     AutoModel,
+    AutoModelForImageTextToText,
     AutoModelForVision2Seq,
+    AutoProcessor,
+    SmolVLMForConditionalGeneration,
 )
-from transformers.models.auto import CONFIG_MAPPING
-from transformers import SmolVLMModel, SmolVLMConfig
+
 from lerobot.policies.smolpi0.flex_attention import flex_attention_forward
 
+
 def _round_up_to_multiple(x, multiple):
     return (x + multiple - 1) // multiple * multiple
 
@@ -180,20 +177,31 @@ def apply_rope(x, positions, max_wavelength=10_000):
 #                 f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'."
 #             )
 
+
 def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256):
     hidden_dim = int(2 * hidden_dim / 3)
     hidden_dim = int(ffn_dim_multiplier * hidden_dim)
     hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
     return hidden_dim
-    
+
 
 class SmolVLMWithExpertModel(nn.Module):
     # config_class = PaliGemmaWithExpertConfig
 
-    def __init__(self, model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct", 
-    load_vlm_weights: bool = True, train_expert_only: bool = True, freeze_vision_encoder: bool = False, 
-    attention_implementation: str = "eager", attention_mode: str = "self_attn", num_expert_layers: int = -1, 
-    num_vlm_layers: int = -1, self_attn_every_n_layers: int = -1, expert_width_multiplier: float = 0.5, self_attn_only_actions: bool = False):
+    def __init__(
+        self,
+        model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+        load_vlm_weights: bool = True,
+        train_expert_only: bool = True,
+        freeze_vision_encoder: bool = False,
+        attention_implementation: str = "eager",
+        attention_mode: str = "self_attn",
+        num_expert_layers: int = -1,
+        num_vlm_layers: int = -1,
+        self_attn_every_n_layers: int = -1,
+        expert_width_multiplier: float = 0.5,
+        self_attn_only_actions: bool = False,
+    ):
         super().__init__()
         if load_vlm_weights:
             print(f"Loading  {model_id} weights ...")
@@ -227,15 +235,17 @@ class SmolVLMWithExpertModel(nn.Module):
         # Smaller lm expert
         lm_expert_config = copy.deepcopy(config.text_config)
         hidden_size = lm_expert_config.hidden_size
-        lm_expert_config.hidden_size = int(hidden_size*expert_width_multiplier) #hidden_size // 2
-        lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size*expert_width_multiplier))
+        lm_expert_config.hidden_size = int(hidden_size * expert_width_multiplier)  # hidden_size // 2
+        lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size * expert_width_multiplier))
         lm_expert_config.num_hidden_layers = self.num_vlm_layers
-        if num_expert_layers > 0 :
-            assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}"
+        if num_expert_layers > 0:
+            assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, (
+                f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}"
+            )
             lm_expert_config.num_hidden_layers = num_expert_layers
         # lm_expert_config.head_dim = lm_expert_config.head_dim * 2
         self.lm_expert = AutoModel.from_config(lm_expert_config)
-        
+
         self.num_expert_layers = len(self.lm_expert.layers)
         self.self_attn_every_n_layers = self_attn_every_n_layers
         self.self_attn_only_actions = self_attn_only_actions
@@ -245,10 +255,14 @@ class SmolVLMWithExpertModel(nn.Module):
                 if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0:
                     continue
                 self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear(
-                    config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias
+                    config.text_config.num_key_value_heads * config.text_config.head_dim,
+                    lm_expert_config.num_key_value_heads * lm_expert_config.head_dim,
+                    bias=lm_expert_config.attention_bias,
                 )
                 self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear(
-                    config.text_config.num_key_value_heads * config.text_config.head_dim, lm_expert_config.num_key_value_heads * lm_expert_config.head_dim, bias=lm_expert_config.attention_bias
+                    config.text_config.num_key_value_heads * config.text_config.head_dim,
+                    lm_expert_config.num_key_value_heads * lm_expert_config.head_dim,
+                    bias=lm_expert_config.attention_bias,
                 )
         # Remove unused embed_tokens
         self.lm_expert.embed_tokens = None
@@ -308,8 +322,10 @@ class SmolVLMWithExpertModel(nn.Module):
         else:
             self.vlm = self.vlm.merge_and_unload()
 
-    def get_vlm_model(self,):
-        if hasattr(self.vlm.model, "model"): # When using peft
+    def get_vlm_model(
+        self,
+    ):
+        if hasattr(self.vlm.model, "model"):  # When using peft
             return self.vlm.model.model
         else:
             return self.vlm.model
@@ -326,22 +342,20 @@ class SmolVLMWithExpertModel(nn.Module):
         else:
             # To avoid unused params issue with distributed training
             last_layers = [self.num_vlm_layers - 1]
-            if self.num_vlm_layers != self.num_expert_layers and self.num_vlm_layers % self.num_expert_layers == 0:
+            if (
+                self.num_vlm_layers != self.num_expert_layers
+                and self.num_vlm_layers % self.num_expert_layers == 0
+            ):
                 last_layers.append(self.num_vlm_layers - 2)
             frozen_layers = [
-                            "lm_head",
-                            "text_model.model.norm.weight",
-                        ]
+                "lm_head",
+                "text_model.model.norm.weight",
+            ]
             for layer in last_layers:
                 frozen_layers.append(f"text_model.model.layers.{layer}.")
 
             for name, params in self.vlm.named_parameters():
-                if any(
-                    [
-                        k in name
-                        for k in frozen_layers
-                    ]
-                ):
+                if any([k in name for k in frozen_layers]):
                     params.requires_grad = False
         # To avoid unused params issue with distributed training
         for name, params in self.lm_expert.named_parameters():
@@ -410,19 +424,34 @@ class SmolVLMWithExpertModel(nn.Module):
 
         # FIXME(mshukor): add special image tokens specific to smolvlm
         # Get sequence from the vision encoder
-        image_hidden_states = self.get_vlm_model().vision_model(
-            pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype),
-            patch_attention_mask=patch_attention_mask,
-        ).last_hidden_state
+        image_hidden_states = (
+            self.get_vlm_model()
+            .vision_model(
+                pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype),
+                patch_attention_mask=patch_attention_mask,
+            )
+            .last_hidden_state
+        )
         # Modality projection & resampling
         image_hidden_states = self.get_vlm_model().connector(image_hidden_states)
         return image_hidden_states
-        
+
     def embed_language_tokens(self, tokens: torch.Tensor):
         return self.get_vlm_model().text_model.get_input_embeddings()(tokens)
 
-    def forward_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values=None) -> list[torch.Tensor]:
-
+    def forward_attn_layer(
+        self,
+        model_layers,
+        inputs_embeds,
+        layer_idx,
+        position_ids,
+        attention_mask,
+        batch_size,
+        head_dim,
+        use_cache: bool = True,
+        fill_kv_cache: bool = True,
+        past_key_values=None,
+    ) -> list[torch.Tensor]:
         query_states = []
         key_states = []
         value_states = []
@@ -430,7 +459,7 @@ class SmolVLMWithExpertModel(nn.Module):
             layer = model_layers[i][layer_idx]
             if hidden_states is None or layer is None:
                 continue
-            
+
             # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype)
             # hidden_states = hidden_states * normalizer
             hidden_states = layer.input_layernorm(hidden_states)
@@ -468,12 +497,16 @@ class SmolVLMWithExpertModel(nn.Module):
             if inputs_embeds[1] is not None:
                 suffix_len = inputs_embeds[1].shape[1]
                 attention_mask_[:, -suffix_len:, :-suffix_len] = False
-                position_ids_[:, -suffix_len:] = _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None]
+                position_ids_[:, -suffix_len:] = (
+                    _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None]
+                )
         else:
             attention_mask_ = _attention_mask
             position_ids_ = _position_ids
 
-        query_states = apply_rope(query_states, position_ids_) # FIXME(mshukor): this assumes we have always the vlm features?
+        query_states = apply_rope(
+            query_states, position_ids_
+        )  # FIXME(mshukor): this assumes we have always the vlm features?
         key_states = apply_rope(key_states, position_ids_)
 
         if use_cache and past_key_values is None:
@@ -491,9 +524,7 @@ class SmolVLMWithExpertModel(nn.Module):
                 # the max len, then we (for instance) double the cache size. This implementation already exists
                 # in `transformers`. (molbap)
                 key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
-                value_states = torch.cat(
-                    [past_key_values[layer_idx]["value_states"], value_states], dim=1
-                )
+                value_states = torch.cat([past_key_values[layer_idx]["value_states"], value_states], dim=1)
 
         attention_interface = self.get_attention_interface()
 
@@ -504,20 +535,32 @@ class SmolVLMWithExpertModel(nn.Module):
 
         return [att_output], past_key_values
 
-
-    def forward_cross_attn_layer(self, model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache: bool = True, fill_kv_cache: bool = True, past_key_values = None) -> list[torch.Tensor]:
-
+    def forward_cross_attn_layer(
+        self,
+        model_layers,
+        inputs_embeds,
+        layer_idx,
+        position_ids,
+        attention_mask,
+        batch_size,
+        head_dim,
+        use_cache: bool = True,
+        fill_kv_cache: bool = True,
+        past_key_values=None,
+    ) -> list[torch.Tensor]:
         attention_interface = self.get_attention_interface()
 
         att_outputs = []
-        assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}"
+        assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), (
+            f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}"
+        )
 
         if len(inputs_embeds) == 2 and not past_key_values:
             # Prefix attention
             seq_len = inputs_embeds[0].shape[1]
-            position_id, expert_position_id = position_ids[:, :seq_len],  position_ids[:, seq_len:]
+            position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:]
             prefix_attention_mask = attention_mask[:, :seq_len, :seq_len]
-            
+
             layer = model_layers[0][layer_idx]
 
             hidden_states = layer.input_layernorm(inputs_embeds[0])
@@ -558,7 +601,6 @@ class SmolVLMWithExpertModel(nn.Module):
                 key_states = past_key_values[layer_idx]["key_states"]
                 value_states = past_key_values[layer_idx]["value_states"]
 
-
         # Expert
         expert_layer = model_layers[1][layer_idx]
         if expert_layer is not None:
@@ -570,21 +612,37 @@ class SmolVLMWithExpertModel(nn.Module):
             expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype)
             expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape)
 
+            _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(
+                *key_states.shape[:2], -1
+            )
+            expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(
+                *_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim
+            )  # k_proj should have same dim as kv
 
-            _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(*key_states.shape[:2], -1)
-            expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(*_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim) # k_proj should have same dim as kv
+            _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(
+                *value_states.shape[:2], -1
+            )
+            expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(
+                *_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim
+            )
 
-            _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(*value_states.shape[:2], -1)
-            expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(*_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim)
-
-            expert_position_id = expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values # start from 0
-            expert_attention_mask = attention_mask[:, -inputs_embeds[1].shape[1]:, :expert_key_states.shape[1]:] # take into account kv
+            expert_position_id = (
+                expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values
+            )  # start from 0
+            expert_attention_mask = attention_mask[
+                :, -inputs_embeds[1].shape[1] :, : expert_key_states.shape[1] :
+            ]  # take into account kv
 
             expert_query_states = apply_rope(expert_query_state, expert_position_id)
             # expert_key_states = apply_rope(expert_key_state, expert_position_id)
-            
+
             att_output = attention_interface(
-                expert_attention_mask, batch_size, head_dim, expert_query_states, expert_key_states, expert_value_states
+                expert_attention_mask,
+                batch_size,
+                head_dim,
+                expert_query_states,
+                expert_key_states,
+                expert_value_states,
             )
             att_outputs.append(att_output)
         else:
@@ -592,8 +650,8 @@ class SmolVLMWithExpertModel(nn.Module):
 
         # att_output = att_output.to(dtype=models[i].dtype)
         return att_outputs, past_key_values
-    
-    def get_model_layers(self, models: list) -> list: # FIXME(mshukor): is this efficient?
+
+    def get_model_layers(self, models: list) -> list:  # FIXME(mshukor): is this efficient?
         vlm_layers = []
         expert_layers = []
         multiple_of = self.num_vlm_layers // self.num_expert_layers
@@ -606,15 +664,16 @@ class SmolVLMWithExpertModel(nn.Module):
             vlm_layers.append(models[0].layers[i])
             expert_layers.append(expert_layer)
         return [vlm_layers, expert_layers]
+
     # TODO: break down this huge forward into modules or functions
     def forward(
         self,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
-        inputs_embeds: List[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        fill_kv_cache: Optional[bool] = None,
+        attention_mask: torch.Tensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | Cache | None = None,
+        inputs_embeds: list[torch.FloatTensor] = None,
+        use_cache: bool | None = None,
+        fill_kv_cache: bool | None = None,
     ):
         models = [self.get_vlm_model().text_model, self.lm_expert]
         model_layers = self.get_model_layers(models)
@@ -626,26 +685,58 @@ class SmolVLMWithExpertModel(nn.Module):
                 continue
             batch_size = hidden_states.shape[0]
 
-        # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train 
+        # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train
         if self.attention_implementation == "flex":
-            if inputs_embeds[0] is not None and inputs_embeds[1] is not None and attention_mask.shape[-1] == attention_mask.shape[-2] and past_key_values is None: # Now only during training
+            if (
+                inputs_embeds[0] is not None
+                and inputs_embeds[1] is not None
+                and attention_mask.shape[-1] == attention_mask.shape[-2]
+                and past_key_values is None
+            ):  # Now only during training
                 seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1]
-                padded_seq_len = _round_up_to_multiple(seq_len, 128) # FIXME(mshukor): more efficient to have a fixed seq len?
+                padded_seq_len = _round_up_to_multiple(
+                    seq_len, 128
+                )  # FIXME(mshukor): more efficient to have a fixed seq len?
                 b_mask, q_len, kv_len = attention_mask.shape  # The shape of your mask
                 pad = padded_seq_len - q_len
                 attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True)
                 inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0)
                 position_ids = F.pad(position_ids, (0, pad), value=0)
 
-
         # RMSNorm
         num_layers = self.num_vlm_layers
         head_dim = self.vlm.config.text_config.head_dim
         for layer_idx in range(num_layers):
-            if fill_kv_cache or "cross" not in self.attention_mode or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0):
-                att_outputs, past_key_values = self.forward_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values)
+            if (
+                fill_kv_cache
+                or "cross" not in self.attention_mode
+                or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0)
+            ):
+                att_outputs, past_key_values = self.forward_attn_layer(
+                    model_layers,
+                    inputs_embeds,
+                    layer_idx,
+                    position_ids,
+                    attention_mask,
+                    batch_size,
+                    head_dim,
+                    use_cache=use_cache,
+                    fill_kv_cache=fill_kv_cache,
+                    past_key_values=past_key_values,
+                )
             else:
-                att_outputs, past_key_values = self.forward_cross_attn_layer(model_layers, inputs_embeds, layer_idx, position_ids, attention_mask, batch_size, head_dim, use_cache=use_cache, fill_kv_cache=fill_kv_cache, past_key_values=past_key_values)
+                att_outputs, past_key_values = self.forward_cross_attn_layer(
+                    model_layers,
+                    inputs_embeds,
+                    layer_idx,
+                    position_ids,
+                    attention_mask,
+                    batch_size,
+                    head_dim,
+                    use_cache=use_cache,
+                    fill_kv_cache=fill_kv_cache,
+                    past_key_values=past_key_values,
+                )
             # query_states = []
             # key_states = []
             # value_states = []
@@ -703,7 +794,6 @@ class SmolVLMWithExpertModel(nn.Module):
             #     attention_mask, batch_size, head_dim, query_states, key_states, value_states
             # )
 
-
             # att_output = att_output.to(dtype=models[i].dtype)
 
             # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len])
@@ -712,7 +802,9 @@ class SmolVLMWithExpertModel(nn.Module):
             for i, hidden_states in enumerate(inputs_embeds):
                 # layer = models[i].layers[layer_idx]
                 layer = model_layers[i][layer_idx]
-                att_output = att_outputs[i] if i < len(att_outputs) else att_outputs[0] # in case of self_attn
+                att_output = (
+                    att_outputs[i] if i < len(att_outputs) else att_outputs[0]
+                )  # in case of self_attn
                 if hidden_states is not None:
                     if layer is None:
                         outputs_embeds.append(hidden_states)
@@ -759,7 +851,11 @@ class SmolVLMWithExpertModel(nn.Module):
         if self.attention_implementation == "fa2":
             attention_interface = self.flash_attention_forward
         elif self.attention_implementation == "flex":
-            attention_interface = partial(flex_attention_forward, num_att_heads=self.num_attention_heads, num_key_value_heads=self.num_key_value_heads)
+            attention_interface = partial(
+                flex_attention_forward,
+                num_att_heads=self.num_attention_heads,
+                num_key_value_heads=self.num_key_value_heads,
+            )
         else:
             attention_interface = self.eager_attention_forward
         return attention_interface
@@ -807,7 +903,7 @@ class SmolVLMWithExpertModel(nn.Module):
         att_weights *= head_dim**-0.5
 
         att_weights = att_weights.to(dtype=torch.float32)
-        big_neg = torch.finfo(att_weights.dtype).min #-2.3819763e38  # See gemma/modules.py
+        big_neg = torch.finfo(att_weights.dtype).min  # -2.3819763e38  # See gemma/modules.py
         masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg)
         probs = nn.functional.softmax(masked_att_weights, dim=-1)
         probs = probs.to(dtype=value_states.dtype)
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py
index 1c07d98e5..9bb22d7f7 100644
--- a/src/lerobot/policies/smolvla/modeling_smolvla.py
+++ b/src/lerobot/policies/smolvla/modeling_smolvla.py
@@ -1027,6 +1027,7 @@ from lerobot.policies.utils import (
     populate_queues,
 )
 from lerobot.utils.utils import get_safe_dtype
+
 # OBS_STATE = 'state'
 # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
 _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
@@ -1891,4 +1892,4 @@ class VLAFlowMatching(nn.Module):
         suffix_out = suffix_out[:, -self.config.chunk_size :]
         suffix_out = suffix_out.to(dtype=torch.float32)
         v_t = self.action_out_proj(suffix_out)
-        return v_t
\ No newline at end of file
+        return v_t
diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt
index 3410062ba..f2ad6c76f 100644
--- a/src/lerobot/policies/smolvla/saver.txt
+++ b/src/lerobot/policies/smolvla/saver.txt
@@ -1 +1 @@
-c
\ No newline at end of file
+c
diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
index e4cd7acac..3b78c99e6 100644
--- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py
+++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
@@ -547,4 +547,4 @@ class SmolVLMWithExpertModel(nn.Module):
         # we use -1 because sequence length can change
         att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim)
 
-        return att_output
\ No newline at end of file
+        return att_output
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 795ed2b3c..b96fbb8a3 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -45,17 +45,21 @@ Note that in both examples, the repo/folder should contain at least `config.json
 
 You can learn about the CLI options for this script in the `EvalPipelineConfig` in lerobot/configs/eval.py
 """
-import concurrent
+
+import concurrent.futures as cf
 import json
 import logging
 import threading
 import time
+from collections import defaultdict
 from collections.abc import Callable
 from contextlib import nullcontext
 from copy import deepcopy
 from dataclasses import asdict
 from pathlib import Path
 from pprint import pformat
+from typing import Dict, List, Tuple, TypedDict
+from collections.abc import Iterator
 
 import einops
 import gymnasium as gym
@@ -68,7 +72,11 @@ from tqdm import trange
 from lerobot.configs import parser
 from lerobot.configs.eval import EvalPipelineConfig
 from lerobot.envs.factory import make_env
-from lerobot.envs.utils import add_envs_task, check_env_attributes_and_types, preprocess_observation, preprocess_observation1
+from lerobot.envs.utils import (
+    add_envs_task,
+    check_env_attributes_and_types,
+    preprocess_observation,
+)
 from lerobot.policies.factory import make_policy
 from lerobot.policies.pretrained import PreTrainedPolicy
 from lerobot.policies.utils import get_device_from_parameters
@@ -79,9 +87,6 @@ from lerobot.utils.utils import (
     init_logging,
     inside_slurm,
 )
-from typing import TypedDict, Dict, List, Tuple, Iterator
-from collections import defaultdict
-import concurrent.futures as cf
 
 
 def rollout(
@@ -485,8 +490,12 @@ def _compile_episode_data(
     data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1)
 
     return data_dict
-from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
+
+
 from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
+from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
+
+
 def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
     """Recreate normalization layers with proper stats from the dataset."""
     from lerobot.policies.normalize import Normalize, Unnormalize
@@ -518,7 +527,8 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData
 
 def load_smolvla(cfg, dataset_repo: str, policy):
     from lerobot.datasets.lerobot_dataset import LeRobotDataset
-    dataset = LeRobotDataset(dataset_repo, root='/raid/jade/.cache/huggingface/datasets/')
+
+    dataset = LeRobotDataset(dataset_repo, root="/raid/jade/.cache/huggingface/datasets/")
     _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta)  # only needed if stats are missing
     return policy.to("cuda"), dataset
 
@@ -529,8 +539,8 @@ def eval_main(cfg: EvalPipelineConfig):
 
     # Check device is available
     device = get_safe_torch_device(cfg.policy.device, log=True)
-    #login to hf
-    from huggingface_hub import login
+    # login to hf
+
     # login()
     torch.backends.cudnn.benchmark = True
     torch.backends.cuda.matmul.allow_tf32 = True
@@ -549,7 +559,7 @@ def eval_main(cfg: EvalPipelineConfig):
     breakpoint()
     # policy, _ = load_smolvla(cfg.policy, "physical-intelligence/libero", policy)
     # rename "image" -> "observation.image"
-    
+
     policy.eval()
     with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext():
         info = eval_policy_all(
@@ -584,10 +594,11 @@ def eval_main(cfg: EvalPipelineConfig):
 
 # ---- typed payload returned by one task eval ----
 class TaskMetrics(TypedDict):
-    sum_rewards: List[float]
-    max_rewards: List[float]
-    successes: List[bool]
-    video_paths: List[str]
+    sum_rewards: list[float]
+    max_rewards: list[float]
+    successes: list[bool]
+    video_paths: list[str]
+
 
 ACC_KEYS = ("sum_rewards", "max_rewards", "successes", "video_paths")
 
@@ -610,7 +621,7 @@ def eval_policy_all(
     """
     global_start = time.time()
 
-    # inner: evaluate a single (suite, task) 
+    # inner: evaluate a single (suite, task)
     def eval_one(
         task_group: str,
         task_id: int,
@@ -650,27 +661,36 @@ def eval_policy_all(
             video_paths=task_result.get("video_paths", []),
         )
 
-    # result producer: sequential or threaded, same consumer 
-    def iter_task_results() -> Iterator[Tuple[str, int, TaskMetrics]]:
+    # result producer: sequential or threaded, same consumer
+    def iter_task_results() -> Iterator[tuple[str, int, TaskMetrics]]:
         if max_parallel_tasks == 1:
             for task_group, tasks in envs.items():
                 for task_id, vec in tasks.items():
-                    yield task_group, task_id, eval_one(
-                        task_group, task_id, vec,
-                        policy=policy,
-                        n_episodes=n_episodes,
-                        max_episodes_rendered=max_episodes_rendered,
-                        videos_dir=videos_dir,
-                        return_episode_data=return_episode_data,
-                        start_seed=start_seed,
+                    yield (
+                        task_group,
+                        task_id,
+                        eval_one(
+                            task_group,
+                            task_id,
+                            vec,
+                            policy=policy,
+                            n_episodes=n_episodes,
+                            max_episodes_rendered=max_episodes_rendered,
+                            videos_dir=videos_dir,
+                            return_episode_data=return_episode_data,
+                            start_seed=start_seed,
+                        ),
                     )
         else:
             with cf.ThreadPoolExecutor(max_workers=max_parallel_tasks) as executor:
-                fut2key: Dict[cf.Future, Tuple[str, int]] = {}
+                fut2key: dict[cf.Future, tuple[str, int]] = {}
                 for task_group, tasks in envs.items():
                     for task_id, vec in tasks.items():
                         fut = executor.submit(
-                            eval_one, task_group, task_id, vec,
+                            eval_one,
+                            task_group,
+                            task_id,
+                            vec,
                             policy=policy,
                             n_episodes=n_episodes,
                             max_episodes_rendered=max_episodes_rendered,
@@ -683,9 +703,9 @@ def eval_policy_all(
                     task_group, task_id = fut2key[fut]
                     yield task_group, task_id, fut.result()
 
-    # single accumulator path on the main thread 
-    group_acc: Dict[str, Dict[str, List]] = defaultdict(lambda: {k: [] for k in ACC_KEYS})
-    overall: Dict[str, List] = {k: [] for k in ACC_KEYS}
+    # single accumulator path on the main thread
+    group_acc: dict[str, dict[str, list]] = defaultdict(lambda: {k: [] for k in ACC_KEYS})
+    overall: dict[str, list] = {k: [] for k in ACC_KEYS}
 
     for task_group, task_id, metrics in iter_task_results():
         acc = group_acc[task_group]
@@ -694,7 +714,7 @@ def eval_policy_all(
             overall[k].extend(metrics[k])
 
     # build outputs
-    results: Dict[str, dict] = {}
+    results: dict[str, dict] = {}
     for task_group, data in group_acc.items():
         suite_rewards = data["sum_rewards"]
         suite_max = data["max_rewards"]
@@ -720,9 +740,15 @@ def eval_policy_all(
     global_eval_ep_s = global_eval_s / max(1, len(overall["sum_rewards"]))
     results["overall"] = {
         "aggregated": {
-            "avg_sum_reward": float(np.nanmean(overall["sum_rewards"])) if overall["sum_rewards"] else float("nan"),
-            "avg_max_reward": float(np.nanmean(overall["max_rewards"])) if overall["max_rewards"] else float("nan"),
-            "pc_success": float(np.nanmean(overall["successes"]) * 100) if overall["successes"] else float("nan"),
+            "avg_sum_reward": float(np.nanmean(overall["sum_rewards"]))
+            if overall["sum_rewards"]
+            else float("nan"),
+            "avg_max_reward": float(np.nanmean(overall["max_rewards"]))
+            if overall["max_rewards"]
+            else float("nan"),
+            "pc_success": float(np.nanmean(overall["successes"]) * 100)
+            if overall["successes"]
+            else float("nan"),
             "eval_s": global_eval_s,
             "eval_ep_s": global_eval_ep_s,
         },
@@ -732,7 +758,6 @@ def eval_policy_all(
     return results
 
 
-
 if __name__ == "__main__":
     init_logging()
     eval_main()
diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py
index 3feeb0512..fc8be4ebc 100644
--- a/src/lerobot/scripts/train.py
+++ b/src/lerobot/scripts/train.py
@@ -105,6 +105,7 @@ def update_policy(
     train_metrics.update_s = time.perf_counter() - start_time
     return train_metrics, output_dict
 
+
 # def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
 #     """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
 #     from lerobot.policies.normalize import Normalize, Unnormalize
@@ -132,6 +133,7 @@ def update_policy(
 
 #     print("✅ Normalization layers injected with dataset stats.")
 
+
 @parser.wrap()
 def train(cfg: TrainPipelineConfig):
     cfg.validate()
@@ -271,9 +273,12 @@ def train(cfg: TrainPipelineConfig):
         if cfg.env and is_eval_step:
             step_id = get_step_identifier(step, cfg.steps)
             logging.info(f"Eval policy at step {step}")
-            with torch.no_grad(), (torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext()):
+            with (
+                torch.no_grad(),
+                torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(),
+            ):
                 eval_info = eval_policy_all(
-                    eval_env,                         # dict[suite][task_id] -> vec_env
+                    eval_env,  # dict[suite][task_id] -> vec_env
                     policy,
                     cfg.eval.n_episodes,
                     videos_dir=videos_dir,
@@ -295,15 +300,15 @@ def train(cfg: TrainPipelineConfig):
             # meters/tracker
             eval_metrics = {
                 "avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),
-                "pc_success":     AverageMeter("success", ":.1f"),
-                "eval_s":         AverageMeter("eval_s", ":.3f"),
+                "pc_success": AverageMeter("success", ":.1f"),
+                "eval_s": AverageMeter("eval_s", ":.3f"),
             }
             eval_tracker = MetricsTracker(
                 cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step
             )
-            eval_tracker.eval_s         = aggregated.get("eval_s", 0.0)
+            eval_tracker.eval_s = aggregated.get("eval_s", 0.0)
             eval_tracker.avg_sum_reward = aggregated.get("avg_sum_reward", float("nan"))
-            eval_tracker.pc_success     = aggregated.get("pc_success", float("nan"))
+            eval_tracker.pc_success = aggregated.get("pc_success", float("nan"))
             if wandb_logger:
                 wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
                 wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py
index 26a9e7aea..5b82ef044 100644
--- a/src/lerobot/scripts/train_2.py
+++ b/src/lerobot/scripts/train_2.py
@@ -104,6 +104,7 @@ def update_policy(
     train_metrics.update_s = time.perf_counter() - start_time
     return train_metrics, output_dict
 
+
 def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
     """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
     from lerobot.policies.normalize import Normalize, Unnormalize
@@ -115,15 +116,15 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData
     stats = {}
     for key, stat_dict in dataset_meta.stats.items():
         stats[key] = {
-            stat_type: torch.as_tensor(stat_array)
-            if isinstance(stat_array, np.ndarray)
-            else stat_array
+            stat_type: torch.as_tensor(stat_array) if isinstance(stat_array, np.ndarray) else stat_array
             for stat_type, stat_array in stat_dict.items()
         }
 
     normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
     normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
-    unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats)
+    unnormalize_outputs = Unnormalize(
+        policy.config.output_features, policy.config.normalization_mapping, stats
+    )
 
     policy.normalize_inputs = normalize_inputs
     policy.normalize_targets = normalize_targets
@@ -131,6 +132,7 @@ def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotData
 
     print("✅ Normalization layers injected with dataset stats.")
 
+
 @parser.wrap()
 def train(cfg: TrainPipelineConfig):
     cfg.validate()
diff --git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py
index e205f138f..1e8a59a64 100644
--- a/src/lerobot/scripts/train_accelerate.py
+++ b/src/lerobot/scripts/train_accelerate.py
@@ -24,12 +24,15 @@ from accelerate.utils import set_seed as accelerate_set_seed
 from termcolor import colored
 from torch.optim import Optimizer
 
+from lerobot.configs import parser
+from lerobot.configs.train import TrainPipelineConfig
 from lerobot.datasets.factory import make_dataset
 from lerobot.datasets.sampler import EpisodeAwareSampler
 from lerobot.envs.factory import make_env
 from lerobot.optim.factory import make_optimizer_and_scheduler
 from lerobot.policies.factory import make_policy
 from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.scripts.eval import eval_policy
 from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
 from lerobot.utils.train_utils import (
     get_step_checkpoint_dir,
@@ -43,9 +46,6 @@ from lerobot.utils.utils import (
     has_method,
     init_logging,
 )
-from lerobot.configs import parser
-from lerobot.configs.train import TrainPipelineConfig
-from lerobot.scripts.eval import eval_policy
 
 
 def update_policy(
@@ -100,6 +100,7 @@ def train(cfg: TrainPipelineConfig):
 
     # Initialize accelerator
     from accelerate.utils import DistributedDataParallelKwargs
+
     # added by jade 2 lines
     ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)
     accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs])
@@ -357,7 +358,7 @@ def train(cfg: TrainPipelineConfig):
 
     if accelerator.is_main_process:
         logging.info("End of training")
-        accelerator.end_training() # added by jade
+        accelerator.end_training()  # added by jade
 
 
 if __name__ == "__main__":

From 4c2add41d764ae3e0e05a6702f75a84bae9a2d86 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Thu, 11 Sep 2025 14:18:09 +0200
Subject: [PATCH 07/12] remove files

---
 src/lerobot/configs/policies.py               |    1 -
 src/lerobot/datasets/lerobot_dataset.py       |    1 -
 src/lerobot/datasets/utils.py                 |    3 +-
 src/lerobot/policies/factory.py               |    7 -
 src/lerobot/policies/normalize.py             |  163 ---
 .../policies/smolpi0/configuration_smolpi0.py |  210 ---
 .../policies/smolpi0/flex_attention.py        |  145 --
 .../policies/smolpi0/modeling_smolpi0.py      | 1190 -----------------
 .../policies/smolpi0/smolvlm_with_expert.py   |  920 -------------
 .../policies/smolvla/modeling_smolvla.py      |  956 +------------
 .../policies/smolvla/modeling_smolvla_v2.py   |    0
 src/lerobot/policies/smolvla/saver.txt        |    1 -
 src/lerobot/scripts/eval.py                   |   29 +-
 src/lerobot/scripts/train_2.py                |  345 -----
 src/lerobot/scripts/train_accelerate.py       |  366 -----
 15 files changed, 3 insertions(+), 4334 deletions(-)
 delete mode 100644 src/lerobot/policies/smolpi0/configuration_smolpi0.py
 delete mode 100644 src/lerobot/policies/smolpi0/flex_attention.py
 delete mode 100644 src/lerobot/policies/smolpi0/modeling_smolpi0.py
 delete mode 100644 src/lerobot/policies/smolpi0/smolvlm_with_expert.py
 delete mode 100644 src/lerobot/policies/smolvla/modeling_smolvla_v2.py
 delete mode 100644 src/lerobot/policies/smolvla/saver.txt
 delete mode 100644 src/lerobot/scripts/train_2.py
 delete mode 100644 src/lerobot/scripts/train_accelerate.py

diff --git a/src/lerobot/configs/policies.py b/src/lerobot/configs/policies.py
index 75863d3fc..f5fa727cf 100644
--- a/src/lerobot/configs/policies.py
+++ b/src/lerobot/configs/policies.py
@@ -62,7 +62,6 @@ class PreTrainedConfig(draccus.ChoiceRegistry, HubMixin, abc.ABC):
     # `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
     # automatic gradient scaling is used.
     use_amp: bool = False
-    gradient_accumulation_steps: int = 1
 
     push_to_hub: bool = True
     repo_id: str | None = None
diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index 6509993bb..875608c59 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -597,7 +597,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
         """hf_dataset contains all the observations, states, actions, rewards, etc."""
         if self.episodes is None:
             path = str(self.root / "data")
-            # added by jade
             hf_dataset = load_dataset("parquet", data_dir=path, split="train")
         else:
             files = [str(self.root / self.meta.get_data_file_path(ep_idx)) for ep_idx in self.episodes]
diff --git a/src/lerobot/datasets/utils.py b/src/lerobot/datasets/utils.py
index daa1de163..078c5351d 100644
--- a/src/lerobot/datasets/utils.py
+++ b/src/lerobot/datasets/utils.py
@@ -455,8 +455,7 @@ def dataset_to_policy_features(features: dict[str, dict]) -> dict[str, PolicyFea
                 shape = (shape[2], shape[0], shape[1])
         elif key == "observation.environment_state":
             type = FeatureType.ENV
-        # changed by jade
-        elif key.startswith("observation") or key.startswith("state"):
+        elif key.startswith("observation"):
             type = FeatureType.STATE
         elif key.startswith("action"):
             type = FeatureType.ACTION
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
index cc1b0480d..c3ae9cd54 100644
--- a/src/lerobot/policies/factory.py
+++ b/src/lerobot/policies/factory.py
@@ -31,7 +31,6 @@ from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig
 from lerobot.policies.pretrained import PreTrainedPolicy
 from lerobot.policies.sac.configuration_sac import SACConfig
 from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig
-from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
 from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
 from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
 from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
@@ -75,10 +74,6 @@ def get_policy_class(name: str) -> PreTrainedPolicy:
         from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
 
         return SmolVLAPolicy
-    elif name == "smolpi0":
-        from lerobot.policies.smolpi0.modeling_smolpi0 import SMOLPI0Policy
-
-        return SMOLPI0Policy
     else:
         raise NotImplementedError(f"Policy with name {name} is not implemented.")
 
@@ -102,8 +97,6 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:
         return SmolVLAConfig(**kwargs)
     elif policy_type == "reward_classifier":
         return RewardClassifierConfig(**kwargs)
-    elif policy_type == "smolpi0":
-        return SMOLPI0Config(**kwargs)
     else:
         raise ValueError(f"Policy type '{policy_type}' is not available.")
 
diff --git a/src/lerobot/policies/normalize.py b/src/lerobot/policies/normalize.py
index 646c330cb..119055873 100644
--- a/src/lerobot/policies/normalize.py
+++ b/src/lerobot/policies/normalize.py
@@ -255,85 +255,6 @@ class Unnormalize(nn.Module):
         return batch
 
 
-class NormalizePerRobotType(nn.Module):
-    """Normalizes data (e.g. "observation.image") for more stable and faster convergence during training."""
-
-    def __init__(
-        self,
-        features: dict[str, PolicyFeature],
-        norm_map: dict[str, NormalizationMode],
-        stats: dict[str, dict[str, Tensor]] | None = None,
-    ):
-        """
-        Args:
-            shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values
-            are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing
-            mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape
-            is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format.
-            modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values
-                are their normalization modes among:
-                    - "mean_std": subtract the mean and divide by standard deviation.
-                    - "min_max": map to [-1, 1] range.
-            stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image")
-                and values are dictionaries of statistic types and their values (e.g.
-                `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for
-                training the model for the first time, these statistics will overwrite the default buffers. If
-                not provided, as expected for finetuning or evaluation, the default buffers should to be
-                overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the
-                dataset is not needed to get the stats, since they are already in the policy state_dict.
-        """
-        super().__init__()
-        self.features = features
-        self.norm_map = norm_map
-        for robot_type in stats.keys():
-            stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type])
-            for key, buffer in stats_buffers.items():
-                setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer)
-
-    # TODO(rcadene): should we remove torch.no_grad?
-    @torch.no_grad
-    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
-        batch = dict(batch)  # shallow copy avoids mutating the input batch
-        assert "robot_type" in batch, "robot_type is not in the batch"
-        robot_types = batch["robot_type"]
-
-        for key, ft in self.features.items():
-            if key not in batch:
-                continue
-
-            norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY)
-            if norm_mode is NormalizationMode.IDENTITY:
-                continue
-            # FIXME(mshukor): make it more efficient
-            buffers = [
-                getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types
-            ]
-            if norm_mode is NormalizationMode.MEAN_STD:
-                mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0)
-                std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0)
-                if batch[key].ndim == 3:
-                    mean = mean.unsqueeze(1)
-                    std = std.unsqueeze(1)
-                assert not torch.isinf(mean).any(), _no_stats_error_str("mean")
-                assert not torch.isinf(std).any(), _no_stats_error_str("std")
-                batch[key] = (batch[key] - mean) / (std + 1e-8)
-            elif norm_mode is NormalizationMode.MIN_MAX:
-                min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0)
-                max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0)
-                assert not torch.isinf(min).any(), _no_stats_error_str("min")
-                assert not torch.isinf(max).any(), _no_stats_error_str("max")
-                if batch[key].ndim == 3:
-                    min = min.unsqueeze(1)
-                    max = max.unsqueeze(1)
-                # normalize to [0,1]
-                batch[key] = (batch[key] - min) / (max - min + 1e-8)
-                # normalize to [-1, 1]
-                batch[key] = batch[key] * 2 - 1
-            else:
-                raise ValueError(norm_mode)
-        return batch
-
-
 # TODO (azouitine): We should replace all normalization on the policies with register_buffer normalization
 #       and remove the `Normalize` and `Unnormalize` classes.
 def _initialize_stats_buffers(
@@ -497,87 +418,3 @@ class UnnormalizeBuffer(nn.Module):
             raise ValueError(norm_mode)
 
         return batch
-
-
-class UnnormalizePerRobotType(nn.Module):
-    """
-    Similar to `Normalize` but unnormalizes output data (e.g. `{"action": torch.randn(b,c)}`) in their
-    original range used by the environment.
-    """
-
-    def __init__(
-        self,
-        features: dict[str, PolicyFeature],
-        norm_map: dict[str, NormalizationMode],
-        stats: dict[str, dict[str, Tensor]] | None = None,
-    ):
-        """
-        Args:
-            shapes (dict): A dictionary where keys are input modalities (e.g. "observation.image") and values
-            are their shapes (e.g. `[3,96,96]`]). These shapes are used to create the tensor buffer containing
-            mean, std, min, max statistics. If the provided `shapes` contain keys related to images, the shape
-            is adjusted to be invariant to height and width, assuming a channel-first (c, h, w) format.
-            modes (dict): A dictionary where keys are output modalities (e.g. "observation.image") and values
-                are their normalization modes among:
-                    - "mean_std": subtract the mean and divide by standard deviation.
-                    - "min_max": map to [-1, 1] range.
-            stats (dict, optional): A dictionary where keys are output modalities (e.g. "observation.image")
-                and values are dictionaries of statistic types and their values (e.g.
-                `{"mean": torch.randn(3,1,1)}, "std": torch.randn(3,1,1)}`). If provided, as expected for
-                training the model for the first time, these statistics will overwrite the default buffers. If
-                not provided, as expected for finetuning or evaluation, the default buffers should to be
-                overwritten by a call to `policy.load_state_dict(state_dict)`. That way, initializing the
-                dataset is not needed to get the stats, since they are already in the policy state_dict.
-        """
-        super().__init__()
-        self.features = features
-        self.norm_map = norm_map
-        self.stats = stats
-        # `self.buffer_observation_state["mean"]` contains `torch.tensor(state_dim)`
-        for robot_type in stats.keys():
-            stats_buffers = create_stats_buffers(features, norm_map, stats[robot_type])
-            for key, buffer in stats_buffers.items():
-                setattr(self, f"{robot_type}_buffer_" + key.replace(".", "_"), buffer)
-
-    # TODO(rcadene): should we remove torch.no_grad?
-    @torch.no_grad
-    def forward(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
-        batch = dict(batch)  # shallow copy avoids mutating the input batch
-        assert "robot_type" in batch, "robot_type is not in the batch"
-        robot_types = batch["robot_type"]
-
-        for key, ft in self.features.items():
-            if key not in batch:
-                continue
-
-            norm_mode = self.norm_map.get(ft.type, NormalizationMode.IDENTITY)
-            if norm_mode is NormalizationMode.IDENTITY:
-                continue
-
-            # buffer = getattr(self, "buffer_" + key.replace(".", "_"))
-            buffers = [
-                getattr(self, f"{robot_type}_buffer_" + key.replace(".", "_")) for robot_type in robot_types
-            ]
-
-            if norm_mode is NormalizationMode.MEAN_STD:
-                mean = torch.stack([buffers[i]["mean"] for i in range(len(robot_types))], dim=0)
-                std = torch.stack([buffers[i]["std"] for i in range(len(robot_types))], dim=0)
-                assert not torch.isinf(mean).any(), _no_stats_error_str("mean")
-                assert not torch.isinf(std).any(), _no_stats_error_str("std")
-                if batch[key].ndim == 3:
-                    mean = mean.unsqueeze(1)
-                    std = std.unsqueeze(1)
-                batch[key] = batch[key] * std + mean
-            elif norm_mode is NormalizationMode.MIN_MAX:
-                min = torch.stack([buffers[i]["min"] for i in range(len(robot_types))], dim=0)
-                max = torch.stack([buffers[i]["max"] for i in range(len(robot_types))], dim=0)
-                assert not torch.isinf(min).any(), _no_stats_error_str("min")
-                assert not torch.isinf(max).any(), _no_stats_error_str("max")
-                if batch[key].ndim == 3:
-                    min = min.unsqueeze(1)
-                    max = max.unsqueeze(1)
-                batch[key] = (batch[key] + 1) / 2
-                batch[key] = batch[key] * (max - min) + min
-            else:
-                raise ValueError(norm_mode)
-        return batch
diff --git a/src/lerobot/policies/smolpi0/configuration_smolpi0.py b/src/lerobot/policies/smolpi0/configuration_smolpi0.py
deleted file mode 100644
index e39d17f15..000000000
--- a/src/lerobot/policies/smolpi0/configuration_smolpi0.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass, field
-
-from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
-from lerobot.optim.optimizers import AdamWConfig
-from lerobot.optim.schedulers import (
-    CosineDecayWithWarmupSchedulerConfig,
-)
-
-
-@dataclass
-class PEFTConfig:
-    r: int = 4
-    lora_alpha: int = 16
-    lora_dropout: float = 0.1
-    target_modules: str = "q_proj,v_proj"
-
-
-@PreTrainedConfig.register_subclass("smolpi0")
-@dataclass
-class SMOLPI0Config(PreTrainedConfig):
-    # Input / output structure.
-    n_obs_steps: int = 1
-    chunk_size: int = 50
-    n_action_steps: int = 50
-    n_obs_gap: int = 1
-
-    normalization_mapping: dict[str, NormalizationMode] = field(
-        default_factory=lambda: {
-            "VISUAL": NormalizationMode.IDENTITY,
-            "STATE": NormalizationMode.MEAN_STD,
-            "ACTION": NormalizationMode.MEAN_STD,
-        }
-    )
-
-    # Shorter state and action vectors will be padded
-    max_state_dim: int = 32
-    max_action_dim: int = 32
-
-    # Image preprocessing
-    resize_imgs_with_padding: tuple[int, int] = (512, 512)  # (224, 224)
-
-    # Add empty images. Used by pi0_aloha_sim which adds the empty
-    # left and right wrist cameras in addition to the top camera.
-    empty_cameras: int = 0
-
-    # Converts the joint and gripper values from the standard Aloha space to
-    # the space used by the pi internal runtime which was used to train the base model.
-    adapt_to_pi_aloha: bool = False
-
-    # Converts joint dimensions to deltas with respect to the current state before passing to the model.
-    # Gripper dimensions will remain in absolute values.
-    use_delta_joint_actions_aloha: bool = False
-
-    # Tokenizer
-    tokenizer_max_length: int = 48
-
-    # Projector
-    proj_width: int = 480
-
-    # Decoding
-    num_steps: int = 10
-
-    # Attention utils
-    use_cache: bool = True
-    attention_implementation: str = "eager"  # or fa2, flex
-
-    # Finetuning settings
-    freeze_vision_encoder: bool = True
-    train_expert_only: bool = False
-    train_state_proj: bool = True
-
-    # Training presets
-    optimizer_lr: float = 2.5e-5
-    optimizer_betas: tuple[float, float] = (0.9, 0.95)
-    optimizer_eps: float = 1e-8
-    optimizer_weight_decay: float = 1e-10
-    optimizer_grad_clip_norm: float = 10
-    optimizer_lr_vlm: float = 0
-
-    scheduler_warmup_steps: int = 1_000
-    scheduler_decay_steps: int = 30_000
-    scheduler_decay_lr: float = 2.5e-6
-
-    # TODO: Add EMA
-    vlm_model_name: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
-    checkpoint_path: str = None
-    load_vlm_weights: bool = False
-
-    peft_method: str = ""
-    peft_config: PEFTConfig = PEFTConfig()
-    peft_target_model: str = ""
-
-    add_image_special_tokens: bool = False
-    add_prompt_template: bool = False
-    prefix_prompt_template: str = "<|im_start|>User: What action should the robot take to"
-    suffix_prompt_template: str = "?\nAssistant:"
-
-    attention_mode: str = "self_attn"
-
-    prefix_length: int = -1  # n_obs_steps * num_cameras * num_image_token_per_image + tokenizer_max_length
-
-    past_obs_keys: str = "image"
-
-    add_local_special_image_tokens: bool = False
-
-    reverse_images_order: bool = False
-
-    state_to_prefix: bool = False
-
-    pad_language_to: str = "longest"  # "max_length"
-
-    num_expert_layers: int = -1
-    num_vlm_layers: int = -1
-
-    causal_action_attention_mask: bool = False
-
-    self_attn_every_n_layers: int = -1
-
-    expert_width_multiplier: float = 0.5
-
-    robot_type: str = ""
-
-    self_attn_only_actions: bool = False
-
-    causal_attention_on_history: bool = False
-
-    predict_relative_actions: bool = False
-    relative_actions_mode: str = "first"
-
-    shuffle_camera_positions: bool = False
-    vlm_img_size: int = -1
-
-    regression_loss: bool = False
-
-    def __post_init__(self):
-        super().__post_init__()
-        if self.vlm_img_size > 0:
-            self.resize_imgs_with_padding = (self.vlm_img_size, self.vlm_img_size)
-        """Input validation (not exhaustive)."""
-        if self.n_action_steps > self.chunk_size:
-            raise ValueError(
-                f"The chunk size is the upper bound for the number of action steps per model invocation. Got "
-                f"{self.n_action_steps} for `n_action_steps` and {self.chunk_size} for `chunk_size`."
-            )
-        # if self.n_obs_steps != 1:
-        #     raise ValueError(
-        #         f"Multiple observation steps not handled yet. Got `nobs_steps={self.n_obs_steps}`"
-        #     )
-
-        if self.use_delta_joint_actions_aloha:
-            raise NotImplementedError(
-                "`use_delta_joint_actions_aloha` is used by pi0 for aloha real models. It is not ported yet in LeRobot."
-            )
-
-    def validate_features(self) -> None:
-        # TODO: implement value error
-        # if not self.image_features and not self.env_state_feature:
-        #     raise ValueError("You must provide at least one image or the environment state among the inputs.")
-
-        for i in range(self.empty_cameras):
-            key = f"observation.images.empty_camera_{i}"
-            empty_camera = PolicyFeature(
-                type=FeatureType.VISUAL,
-                shape=(3, 480, 640),
-            )
-            self.input_features[key] = empty_camera
-
-    def get_optimizer_preset(self) -> AdamWConfig:
-        return AdamWConfig(
-            lr=self.optimizer_lr,
-            betas=self.optimizer_betas,
-            eps=self.optimizer_eps,
-            weight_decay=self.optimizer_weight_decay,
-            grad_clip_norm=self.optimizer_grad_clip_norm,
-        )
-
-    def get_scheduler_preset(self):
-        return CosineDecayWithWarmupSchedulerConfig(
-            peak_lr=self.optimizer_lr,
-            decay_lr=self.scheduler_decay_lr,
-            num_warmup_steps=self.scheduler_warmup_steps,
-            num_decay_steps=self.scheduler_decay_steps,
-        )
-
-    @property
-    def observation_delta_indices(self) -> list:  # FIXME(mshukor): support spacing between observations
-        return [-k for k in range(0, self.n_obs_steps * self.n_obs_gap, self.n_obs_gap)][::-1]
-
-    @property
-    def action_delta_indices(self) -> list:
-        return list(range(self.chunk_size))
-
-    @property
-    def reward_delta_indices(self) -> None:
-        return None
diff --git a/src/lerobot/policies/smolpi0/flex_attention.py b/src/lerobot/policies/smolpi0/flex_attention.py
deleted file mode 100644
index 732920af2..000000000
--- a/src/lerobot/policies/smolpi0/flex_attention.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn.functional as F  # noqa: N812
-from packaging.version import Version
-
-if Version(torch.__version__) > Version("2.5.0"):
-    # Ffex attention is only available from torch 2.5 onwards
-    from torch.nn.attention.flex_attention import (
-        _mask_mod_signature,
-        _round_up_to_multiple,
-        create_block_mask,
-        create_mask,
-        flex_attention,
-    )
-
-
-@torch.compile(dynamic=False)
-def flex_attention_forward(
-    attention_mask: torch.Tensor,
-    batch_size: int,
-    head_dim: int,
-    query_states: torch.Tensor,
-    key_states: torch.Tensor,
-    value_states: torch.Tensor,
-    scaling=None,
-    num_att_heads: int = 8,
-    num_key_value_heads: int = 1,
-):
-    """
-    This is defined out of classes to make compile happy.
-    """
-
-    original_dtype = query_states.dtype
-    num_key_value_groups = num_att_heads // num_key_value_heads
-    key_states = key_states[:, :, :, None, :]
-    key_states = key_states.expand(
-        batch_size, key_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim
-    )
-    key_states = key_states.reshape(
-        batch_size, key_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim
-    )
-
-    value_states = value_states[:, :, :, None, :]
-    value_states = value_states.expand(
-        batch_size, value_states.shape[1], num_key_value_heads, num_key_value_groups, head_dim
-    )
-    value_states = value_states.reshape(
-        batch_size, value_states.shape[1], num_key_value_heads * num_key_value_groups, head_dim
-    )
-
-    query_states = query_states.transpose(1, 2)
-    key_states = key_states.transpose(1, 2)
-    value_states = value_states.transpose(1, 2)
-
-    # query_states = query_states.to(torch.float32)
-    # key_states = key_states.to(torch.float32)
-    # value_states = value_states.to(torch.float32)
-
-    causal_mask = attention_mask
-    if causal_mask is not None:
-        causal_mask = causal_mask[:, None, :, : key_states.shape[2]]
-
-        if causal_mask.shape[1] == 1 and query_states.shape[1] > 1:
-            causal_mask = causal_mask.expand(-1, query_states.shape[1], -1, -1)
-
-    def precomputed_mask_factory(precomputed_mask: torch.Tensor) -> _mask_mod_signature:
-        def mask_mod(b, h, q_idx, kv_idx):
-            # Danger zone: if b,h,q_idx,kv_idx exceed the shape, device-side assert occurs.
-            return precomputed_mask[b][h][q_idx][kv_idx]
-
-        return mask_mod
-
-    b_mask, h_mask, q_len, kv_len = causal_mask.shape  # The shape of your mask
-
-    block_size = 128  # limitation of flex attention
-    q_len_rounded = _round_up_to_multiple(q_len, block_size)
-    kv_len_rounded = _round_up_to_multiple(kv_len, block_size)
-
-    # *CRITICAL* we do need to expand here, else we get a CUDA index error
-
-    pad_q = q_len_rounded - q_len
-    pad_k = kv_len_rounded - kv_len
-    if pad_q > 0 or pad_k > 0:
-        padded_causal_mask = F.pad(causal_mask, (0, pad_k, 0, pad_q), value=0.0)
-    else:
-        padded_causal_mask = causal_mask
-    mask_mod_fn_orig = precomputed_mask_factory(padded_causal_mask)
-
-    mask_4d = create_mask(
-        mod_fn=mask_mod_fn_orig,
-        B=b_mask,
-        H=h_mask,
-        Q_LEN=q_len_rounded,
-        KV_LEN=kv_len_rounded,
-        device=causal_mask.device,
-    )
-
-    mask_mod_fn_padded = precomputed_mask_factory(mask_4d)
-    # FIXME(mshukor): compile mask torch.compile(create_block_mask)
-    create_block_mask_compiled = torch.compile(create_block_mask)
-    block_mask = create_block_mask_compiled(
-        mask_mod=mask_mod_fn_padded,
-        B=b_mask,
-        H=None,  #
-        Q_LEN=q_len_rounded,
-        KV_LEN=kv_len_rounded,
-        BLOCK_SIZE=block_size,
-        device=causal_mask.device,
-        _compile=False,
-    )
-    padded_query_states = F.pad(query_states, (0, 0, 0, pad_q), value=0.0) if pad_q > 0 else query_states
-    padded_key_states = F.pad(key_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else key_states
-    padded_value_states = F.pad(value_states, (0, 0, 0, pad_k), value=0.0) if pad_k > 0 else value_states
-    #  mask is applied inside the kernel, ideally more efficiently than score_mod.
-    attn_output, attention_weights = flex_attention(
-        padded_query_states,
-        padded_key_states,
-        padded_value_states,
-        block_mask=block_mask,
-        enable_gqa=True,  # because we shaped query/key states for GQA
-        scale=head_dim**-0.5 if scaling is None else scaling,
-        return_lse=True,
-    )
-
-    attn_output = attn_output.to(dtype=original_dtype)
-    attn_output = attn_output.transpose(1, 2).contiguous()  # [B, Q_LEN, H, head_dim]
-    attn_output = attn_output.reshape(
-        batch_size,
-        -1,
-        attn_output.shape[2] * attn_output.shape[3],  # merges [H, head_dim]
-    )
-    return attn_output[:, :-pad_k, :] if pad_k > 0 else attn_output
diff --git a/src/lerobot/policies/smolpi0/modeling_smolpi0.py b/src/lerobot/policies/smolpi0/modeling_smolpi0.py
deleted file mode 100644
index 765a5901a..000000000
--- a/src/lerobot/policies/smolpi0/modeling_smolpi0.py
+++ /dev/null
@@ -1,1190 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2025 Physical Intelligence and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-π0: A Vision-Language-Action Flow Model for General Robot Control
-
-[Paper](https://www.physicalintelligence.company/download/pi0.pdf)
-[Jax code](https://github.com/Physical-Intelligence/openpi)
-
-Designed by Physical Intelligence. Ported from Jax by Hugging Face.
-
-Install pi0 extra dependencies:
-```bash
-pip install -e ".[pi0]"
-```
-
-Example of finetuning the pi0 pretrained model (`pi0_base` in `openpi`):
-```bash
-python lerobot/scripts/train.py \
---policy.path=lerobot/pi0 \
---dataset.repo_id=danaaubakirova/koch_test
-```
-
-Example of finetuning the pi0 neural network with PaliGemma and expert Gemma
-pretrained with VLM default parameters before pi0 finetuning:
-```bash
-python lerobot/scripts/train.py \
---policy.type=pi0 \
---dataset.repo_id=danaaubakirova/koch_test
-```
-
-Example of using the pi0 pretrained model outside LeRobot training framework:
-```python
-policy = Pi0Policy.from_pretrained("lerobot/pi0")
-```
-
-"""
-
-import math
-import os
-import re
-from collections import deque
-
-import safetensors
-import torch
-import torch.nn.functional as F  # noqa: N812
-from torch import Tensor, nn
-from transformers import AutoProcessor
-
-from lerobot.constants import ACTION, OBS_STATE
-from lerobot.policies.normalize import (
-    Normalize,
-    NormalizePerRobotType,
-    Unnormalize,
-    UnnormalizePerRobotType,
-)
-from lerobot.policies.pretrained import PreTrainedPolicy
-from lerobot.policies.smolpi0.configuration_smolpi0 import SMOLPI0Config
-from lerobot.policies.smolpi0.smolvlm_with_expert import SmolVLMWithExpertModel
-from lerobot.utils.utils import get_safe_dtype
-
-OBS_IMAGE = "observation.image"
-OBS_IMAGES = "observation.images"
-ACTION = "action"
-OBS_IMAGE_2 = "observation.image2"
-OBS_IMAGE_3 = "observation.image3"
-OBS_IMAGE_4 = "observation.image4"
-TASK = "task"
-ROBOT = "robot_type"
-IMAGES_ORDER = {
-    OBS_IMAGE: 0,
-    OBS_IMAGE_2: 1,
-    OBS_IMAGE_3: 2,
-    OBS_IMAGE_4: 3,
-}
-import random
-
-from lerobot.policies.utils import (
-    populate_queues,
-)
-
-
-def create_sinusoidal_pos_embedding(
-    time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
-) -> Tensor:
-    """Computes sine-cosine positional embedding vectors for scalar positions."""
-    if dimension % 2 != 0:
-        raise ValueError(f"dimension ({dimension}) must be divisible by 2")
-
-    if time.ndim != 1:
-        raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
-
-    dtype = get_safe_dtype(torch.float64, device.type)
-    fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
-    period = min_period * (max_period / min_period) ** fraction
-
-    # Compute the outer product
-    scaling_factor = 1.0 / period * 2 * math.pi
-    sin_input = scaling_factor[None, :] * time[:, None]
-    pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
-    return pos_emb
-
-
-def sample_beta(alpha, beta, bsize, device):
-    gamma1 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / alpha)
-    gamma2 = torch.empty((bsize,), device=device).uniform_(0, 1).pow(1 / beta)
-    return gamma1 / (gamma1 + gamma2)
-
-
-def make_att_2d_masks(pad_masks, att_masks):
-    """Copied from big_vision.
-
-    Tokens can attend to valid inputs tokens which have a cumulative mask_ar
-    smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to
-    setup several types of attention, for example:
-
-      [[1 1 1 1 1 1]]: pure causal attention.
-
-      [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
-          themselves and the last 3 tokens have a causal attention. The first
-          entry could also be a 1 without changing behaviour.
-
-      [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
-          block can attend all previous blocks and all tokens on the same block.
-
-    Args:
-      input_mask: bool[B, N] true if its part of the input, false if padding.
-      mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on
-        it and 0 where it shares the same attention mask as the previous token.
-    """
-    if att_masks.ndim != 2:
-        raise ValueError(att_masks.ndim)
-    if pad_masks.ndim != 2:
-        raise ValueError(pad_masks.ndim)
-
-    cumsum = torch.cumsum(att_masks, dim=1)
-    att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
-    pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
-    att_2d_masks = att_2d_masks & pad_2d_masks
-    return att_2d_masks
-
-
-def resize_with_pad(img, width, height, pad_value=-1):
-    # assume no-op when width height fits already
-    if img.ndim != 4:
-        raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
-
-    cur_height, cur_width = img.shape[2:]
-
-    ratio = max(cur_width / width, cur_height / height)
-    resized_height = int(cur_height / ratio)
-    resized_width = int(cur_width / ratio)
-    resized_img = F.interpolate(
-        img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
-    )
-
-    pad_height = max(0, int(height - resized_height))
-    pad_width = max(0, int(width - resized_width))
-
-    # pad on left and top of image
-    padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
-    return padded_img
-
-
-_VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
-
-
-def canonicalise(k: str) -> str:
-    """
-    Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a
-    normalisation-buffer key.
-    """
-    return _VARIANT_RE.sub(".buffer_", k)
-
-
-def standardise_state_dict(
-    checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True
-) -> tuple[dict[str, torch.Tensor], list[str]]:
-    """
-    • Re-keys `checkpoint ` so that every entry matches the *reference* key set.
-    • If several variant keys collapse to the same canonical name we keep the
-      first one and log the collision.
-    • Returns the new dict + a list of entries that could not be matched.
-    """
-    out, collisions, unmatched = {}, {}, []
-
-    for k, v in checkpoint.items():
-        canon = canonicalise(k)
-        if canon in ref_keys:
-            if canon in out:  # duplicate after collapsing
-                collisions.setdefault(canon, []).append(k)
-            else:
-                out[canon] = v
-        else:
-            unmatched.append(k)
-
-    if verbose:
-        for canon, variants in collisions.items():
-            print(f"[standardise_state_dict] '{canon}'  ←  {variants}")
-        if unmatched:
-            print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys")
-
-    out.update({k: checkpoint[k] for k in unmatched})
-    return out, unmatched
-
-
-def load_smolvla(
-    model: torch.nn.Module,
-    filename: str | os.PathLike,
-    *,
-    device: str = "cpu",
-    checkpoint_keys_mapping: str = "",
-) -> torch.nn.Module:
-    state_dict = safetensors.torch.load_file(filename, device=device)
-
-    # Optional user-supplied renames (e.g. "model._orig_mod.//model.")
-    if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping:
-        state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping)
-
-    state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys()))
-
-    # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset
-    norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs")
-    state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)}
-
-    missing, unexpected = model.load_state_dict(state_dict, strict=False)
-    if not all(key.startswith(norm_keys) for key in missing) or unexpected:
-        raise RuntimeError(
-            "SmolVLA %d missing / %d unexpected keys",
-            len(missing),
-            len(unexpected),
-        )
-
-    return model
-
-
-def pad_vector(vector, new_dim):
-    """Can be (batch_size x sequence_length x features_dimension)
-    or (batch_size x features_dimension)
-    """
-    if vector.shape[-1] == new_dim:
-        return vector
-    shape = list(vector.shape)
-    current_dim = shape[-1]
-    shape[-1] = new_dim
-    new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device)
-    new_vector[..., :current_dim] = vector
-    return new_vector
-
-
-def normalize(x, min_val, max_val):
-    return (x - min_val) / (max_val - min_val)
-
-
-def unnormalize(x, min_val, max_val):
-    return x * (max_val - min_val) + min_val
-
-
-def safe_arcsin(value):
-    # This ensures that the input stays within
-    # [−1,1] to avoid invalid values for arcsin
-    return torch.arcsin(torch.clamp(value, -1.0, 1.0))
-
-
-def aloha_gripper_to_angular(value):
-    # Aloha transforms the gripper positions into a linear space. The following code
-    # reverses this transformation to be consistent with pi0 which is pretrained in
-    # angular space.
-    #
-    # These values are coming from the Aloha code:
-    # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED
-    value = unnormalize(value, min_val=0.01844, max_val=0.05800)
-
-    # This is the inverse of the angular to linear transformation inside the Interbotix code.
-    def linear_to_radian(linear_position, arm_length, horn_radius):
-        value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position)
-        return safe_arcsin(value)
-
-    # The constants are taken from the Interbotix code.
-    value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022)
-
-    # Normalize to [0, 1].
-    # The values 0.4 and 1.5 were measured on an actual Trossen robot.
-    return normalize(value, min_val=0.4, max_val=1.5)
-
-
-def rename_checkpoint_keys(checkpoint: dict, rename_str: str):
-    """
-    Renames keys in a checkpoint dictionary based on the given rename string.
-
-    Args:
-        checkpoint (dict): The checkpoint dictionary.
-        rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2".
-
-    Returns:
-        dict: The modified checkpoint with renamed keys.
-    """
-
-    rename_dict = dict(pair.split("//") for pair in rename_str.split(","))
-
-    new_checkpoint = {}
-    for k, v in checkpoint.items():
-        for old_key, new_key in rename_dict.items():
-            if old_key in k:
-                k = k.replace(old_key, new_key)
-        new_checkpoint[k] = v
-    return new_checkpoint
-
-
-def aloha_gripper_from_angular(value):
-    # Convert from the gripper position used by pi0 to the gripper position that is used by Aloha.
-    # Note that the units are still angular but the range is different.
-
-    # The values 0.4 and 1.5 were measured on an actual Trossen robot.
-    value = unnormalize(value, min_val=0.4, max_val=1.5)
-
-    # These values are coming from the Aloha code:
-    # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE
-    return normalize(value, min_val=-0.6213, max_val=1.4910)
-
-
-def aloha_gripper_from_angular_inv(value):
-    # Directly inverts the gripper_from_angular function.
-    value = unnormalize(value, min_val=-0.6213, max_val=1.4910)
-    return normalize(value, min_val=0.4, max_val=1.5)
-
-
-class SMOLPI0Policy(PreTrainedPolicy):
-    """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot."""
-
-    config_class = SMOLPI0Config
-    name = "smolpi0"
-
-    def __init__(
-        self,
-        config: SMOLPI0Config,
-        dataset_stats: dict[str, dict[str, Tensor]] | None = None,
-    ):
-        """
-        Args:
-            config: Policy configuration class instance or None, in which case the default instantiation of
-                    the configuration class is used.
-            dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
-                that they will be passed with a call to `load_state_dict` before the policy is used.
-        """
-
-        super().__init__(config)
-        config.validate_features()
-        self.config = config
-        self.normalize_per_robot_type = getattr(
-            config, "normalize_per_robot_type", False
-        )  # FIXME(mshukor): assert in case of single dataset
-        if self.normalize_per_robot_type:
-            if not dataset_stats:
-                dataset_stats[config.robot_type] = {}
-            self.normalize_inputs = NormalizePerRobotType(
-                config.input_features, config.normalization_mapping, dataset_stats
-            )
-            self.normalize_targets = NormalizePerRobotType(
-                config.output_features, config.normalization_mapping, dataset_stats
-            )
-            self.unnormalize_outputs = UnnormalizePerRobotType(
-                config.output_features, config.normalization_mapping, dataset_stats
-            )
-        else:
-            self.normalize_inputs = Normalize(
-                config.input_features, config.normalization_mapping, dataset_stats
-            )
-            self.normalize_targets = Normalize(
-                config.output_features, config.normalization_mapping, dataset_stats
-            )
-            self.unnormalize_outputs = Unnormalize(
-                config.output_features, config.normalization_mapping, dataset_stats
-            )
-
-        self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer
-        self.model = VLAFlowMatching(config)
-        self.include_past_states = config.n_obs_steps > 1 and OBS_STATE in self.config.past_obs_keys.split(
-            ","
-        )
-        self.include_past_images = config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(",")
-        self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1
-        self.reset()
-
-    def reset(self):
-        """This should be called whenever the environment is reset."""
-        # self._action_queue = deque([], maxlen=self.config.n_action_steps)
-        self._queues = {
-            ACTION: deque(maxlen=self.config.n_action_steps),
-        }
-        if self.config.n_obs_steps > 1:
-            for k in self.config.input_features:
-                if any([past_obs_key in k for past_obs_key in self.config.past_obs_keys.split(",")]):
-                    self._queues[k] = deque(maxlen=self.config.n_obs_steps)
-
-    def get_optim_params(self) -> dict:
-        if self.config.optimizer_lr_vlm > 0 and self.config.optimizer_lr_vlm != self.config.optimizer_lr:
-            params = [
-                {"params": [p for n, p in self.named_parameters() if ".vlm." not in n and p.requires_grad]},
-                {
-                    "params": [p for n, p in self.named_parameters() if ".vlm." in n and p.requires_grad],
-                    "lr": self.config.optimizer_lr_vlm,
-                },
-            ]
-            return params
-
-        else:
-            return self.parameters()
-
-    def merge_peft_model_weights(self) -> None:
-        if "lora" in self.config.peft_method:
-            self.model.vlm_with_expert.merge_lora_weights()
-
-    @torch.no_grad
-    def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
-        """Select a single action given environment observations.
-
-        This method wraps `select_actions` in order to return one action at a time for execution in the
-        environment. It works by managing the actions in a queue and only calling `select_actions` when the
-        queue is empty.
-        """
-        self.eval()
-
-        if self.config.adapt_to_pi_aloha:
-            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
-
-        batch = self.normalize_inputs(batch)
-
-        images, img_masks = self.prepare_images(batch)
-        state = self.prepare_state(batch)
-        lang_tokens, lang_masks = self.prepare_language(batch)
-
-        actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise)
-        # Unpad actions
-        original_action_dim = self.config.action_feature.shape[0]
-        actions = actions[:, :, :original_action_dim]
-
-        actions = self.unnormalize_outputs({"action": actions, "robot_type": batch["robot_type"]})["action"]
-
-        if self.config.adapt_to_pi_aloha:
-            actions = self._pi_aloha_encode_actions(actions)
-
-        return actions
-
-    # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues
-    @classmethod
-    def _load_as_safetensor(
-        cls,
-        model: "SmolVLAPolicy",
-        model_file: str,
-        map_location: str,
-        strict: bool,
-        **kwargs,
-    ):
-        safetensors.torch.load_model(model, model_file, strict=strict, device=map_location)
-        return load_smolvla(
-            model,
-            model_file,
-            device=map_location,
-            checkpoint_keys_mapping="model._orig_mod.//model.",
-        )
-
-    @torch.no_grad
-    def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
-        """Select a single action given environment observations.
-
-        This method wraps `select_actions` in order to return one action at a time for execution in the
-        environment. It works by managing the actions in a queue and only calling `select_actions` when the
-        queue is empty.
-        """
-        self.eval()
-
-        if self.config.adapt_to_pi_aloha:
-            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
-
-        batch = self.normalize_inputs(batch)
-
-        self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION])
-        # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by
-        # querying the policy.
-        if len(self._queues[ACTION]) == 0:
-            for k in batch:
-                if k in self._queues:
-                    batch[k] = torch.stack(list(self._queues[k]), dim=1)
-            images, img_masks = self.prepare_images(batch)
-            state = self.prepare_state(batch)
-            lang_tokens, lang_masks = self.prepare_language(batch)
-            actions = self.model.sample_actions(
-                images, img_masks, lang_tokens, lang_masks, state, noise=noise
-            )
-            if self.config.predict_relative_actions and actions.ndim == 3:
-                # If the model predicts relative actions, we need to unpad the actions
-                # and then convert them to absolute actions.
-                if self.config.relative_actions_mode == "first":
-                    actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :1]), dim=1)
-                elif self.config.relative_actions_mode == "state":
-                    actions = actions + state.unsqueeze(1)
-                else:
-                    actions = torch.cat((actions[:, :1], actions[:, 1:] + actions[:, :-1]), dim=1)
-            # Unpad actions
-            original_action_dim = self.config.action_feature.shape[0]
-            actions = actions[:, :, :original_action_dim]
-
-            actions = self.unnormalize_outputs({"action": actions})["action"]
-
-            if self.config.adapt_to_pi_aloha:
-                actions = self._pi_aloha_encode_actions(actions)
-
-            # `self.model.forward` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue
-            # effectively has shape (n_action_steps, batch_size, *), hence the transpose.
-            self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps])
-        return self._queues[ACTION].popleft()
-
-    def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]:
-        """Do a full training forward pass to compute the loss"""
-        if self.config.adapt_to_pi_aloha:
-            batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
-            batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
-        batch = self.normalize_inputs(batch)
-        batch = self.normalize_targets(batch)
-        images, img_masks = self.prepare_images(
-            batch
-        )  # FIXME(mshukor): adapte it to take into account already padded images in the batch
-        state = self.prepare_state(batch)
-        lang_tokens, lang_masks = self.prepare_language(batch)
-        actions = self.prepare_action(batch, state=state)
-        actions_is_pad = batch.get("actions_id_pad")
-        loss_dict = {}
-        losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
-        loss_dict["losses_after_forward"] = losses.mean().clone()
-
-        if actions_is_pad is not None:
-            in_episode_bound = ~actions_is_pad
-            losses = losses * in_episode_bound.unsqueeze(-1)
-            loss_dict["losses_after_in_ep_bound"] = losses.mean().clone()
-
-        # Remove padding
-        losses = losses[:, :, : self.config.max_action_dim]
-        loss_dict["losses_after_rm_padding"] = losses.mean().clone()
-
-        # For backward pass
-        loss = losses.mean()
-        # For backward pass
-        loss_dict["loss"] = loss
-        # # For logging
-        # loss_dict["l2_loss"] = loss.item() # remove for torch compile
-        return loss_dict
-
-    def prepare_images(self, batch):
-        """Apply Pi0 preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and
-        convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP.
-        """
-        images = []
-        img_masks = []
-        present_img_keys = [key for key in self.config.image_features if key in batch]
-        missing_img_keys = [key for key in self.config.image_features if key not in batch]
-
-        present_img_keys = sorted(
-            present_img_keys,
-            key=lambda k: IMAGES_ORDER.get(k, float("inf")),
-            reverse=self.config.reverse_images_order,
-        )
-        if self.config.shuffle_camera_positions and ACTION in batch:  # only during training
-            present_img_keys = random.sample(present_img_keys, len(present_img_keys))
-        if len(present_img_keys) == 0:
-            raise ValueError(
-                f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})"
-            )
-        for i in range(self.num_past_images):
-            # Preprocess image features present in the batch
-            for key in present_img_keys:
-                img = batch[key][:, i, :, :, :] if batch[key].ndim == 5 else batch[key]
-                if self.config.resize_imgs_with_padding is not None:
-                    img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0)
-
-                # Normalize from range [0,1] to [-1,1] as expacted by siglip
-                img = img * 2.0 - 1.0
-
-                bsize = img.shape[0]
-                device = img.device
-                if f"{key}_padding_mask" in batch:
-                    mask = batch[f"{key}_padding_mask"].bool()
-                else:
-                    mask = torch.ones(bsize, dtype=torch.bool, device=device)
-                images.append(img)
-                img_masks.append(mask)
-
-            # Create image features not present in the batch
-            # as fully 0 padded images.
-            for num_empty_cameras in range(len(missing_img_keys)):
-                if num_empty_cameras >= self.config.empty_cameras:
-                    break
-                img = torch.ones_like(img) * -1
-                mask = torch.zeros_like(mask)
-                images.append(img)
-                img_masks.append(mask)
-        return images, img_masks
-
-    def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
-        """Tokenize the text input"""
-        device = batch[OBS_STATE].device
-        tasks = batch["task"]
-        if len(tasks) == 1:
-            tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])]
-
-        if self.config.add_prompt_template:
-            tasks = [
-                f"{self.config.prefix_prompt_template}{task}{self.config.suffix_prompt_template}"
-                for task in tasks
-            ]
-        else:
-            tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
-        tokenized_prompt = self.language_tokenizer.__call__(
-            tasks,
-            padding=self.config.pad_language_to,
-            padding_side="right",
-            max_length=self.config.tokenizer_max_length,
-            return_tensors="pt",
-            truncation=True,  # FIXME(mshukor)
-        )
-
-        lang_tokens = tokenized_prompt["input_ids"].to(device=device)
-        lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool)
-
-        return lang_tokens, lang_masks
-
-    def _pi_aloha_decode_state(self, state):
-        # Flip the joints.
-        for motor_idx in [1, 2, 8, 9]:
-            state[:, motor_idx] *= -1
-        # Reverse the gripper transformation that is being applied by the Aloha runtime.
-        for motor_idx in [6, 13]:
-            state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx])
-        return state
-
-    def _pi_aloha_encode_actions(self, actions):
-        # Flip the joints.
-        for motor_idx in [1, 2, 8, 9]:
-            actions[:, :, motor_idx] *= -1
-        # Reverse the gripper transformation that is being applied by the Aloha runtime.
-        for motor_idx in [6, 13]:
-            actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx])
-        return actions
-
-    def _pi_aloha_encode_actions_inv(self, actions):
-        # Flip the joints again.
-        for motor_idx in [1, 2, 8, 9]:
-            actions[:, :, motor_idx] *= -1
-        # Reverse the gripper transformation that is being applied by the Aloha runtime.
-        for motor_idx in [6, 13]:
-            actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx])
-        return actions
-
-    def prepare_state(self, batch):
-        """Pad state"""
-        state = (
-            batch[OBS_STATE][:, -1, :]
-            if (batch[OBS_STATE].ndim > 2 and not self.include_past_states)
-            else batch[OBS_STATE]
-        )  # FIXME(mshukor): no state history for now
-        state = pad_vector(state, self.config.max_state_dim)
-        return state
-
-    def prepare_action(self, batch, state=None):
-        """Pad action"""
-        actions = pad_vector(batch[ACTION], self.config.max_action_dim)
-        if self.config.predict_relative_actions and actions.ndim == 3:
-            if self.config.relative_actions_mode == "first":
-                actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :1]), dim=1)
-            elif self.config.relative_actions_mode == "state":
-                assert batch[ACTION].shape[-1] == batch[OBS_STATE].shape[-1], (
-                    "Relative action mode 'state' requires the action and state to have the same dimension."
-                )
-                if state.ndim == 2:
-                    state = state.unsqueeze(1)
-                actions = actions - state
-            else:
-                actions = torch.cat((actions[:, :1], actions[:, 1:] - actions[:, :-1]), dim=1)
-        return actions
-
-
-def pad_tensor(tensor, max_len, pad_value=0):
-    """
-    Efficiently pads a tensor along sequence dimension to match max_len.
-
-    Args:
-        tensor (torch.Tensor): Shape (B, L, ...) or (B, L).
-        max_len (int): Fixed sequence length.
-        pad_value (int/float): Value for padding.
-
-    Returns:
-        torch.Tensor: Shape (B, max_len, ...) or (B, max_len).
-    """
-    B, L = tensor.shape[:2]
-
-    # Create a padded tensor of max_len and copy the existing values
-    padded_tensor = torch.full(
-        (B, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device
-    )
-    padded_tensor[:, :L] = tensor  # Efficient in-place copy
-
-    return padded_tensor
-
-
-class VLAFlowMatching(nn.Module):
-    """
-    π0: A Vision-Language-Action Flow Model for General Robot Control
-
-    [Paper](https://www.physicalintelligence.company/download/pi0.pdf)
-    [Jax code](https://github.com/Physical-Intelligence/openpi)
-
-    Designed by Physical Intelligence. Ported from Jax by Hugging Face.
-    ┌──────────────────────────────┐
-    │               actions        │
-    │               ▲              │
-    │              ┌┴─────┐        │
-    │  kv cache    │Gemma │        │
-    │  ┌──────────►│Expert│        │
-    │  │           │      │        │
-    │ ┌┴────────┐  │x 10  │        │
-    │ │         │  └▲──▲──┘        │
-    │ │   VLM   │   │  │           │
-    │ │         │   │  robot state │
-    │ │         │   noise          │
-    │ └▲──▲─────┘                  │
-    │  │  │                        │
-    │  │  image(s)                 │
-    │  language tokens             │
-    └──────────────────────────────┘
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-
-        self.vlm_with_expert = SmolVLMWithExpertModel(
-            model_id=self.config.vlm_model_name,
-            freeze_vision_encoder=self.config.freeze_vision_encoder,
-            train_expert_only=self.config.train_expert_only,
-            attention_implementation=self.config.attention_implementation,
-            load_vlm_weights=self.config.load_vlm_weights,
-            attention_mode=self.config.attention_mode,
-            num_expert_layers=self.config.num_expert_layers,
-            num_vlm_layers=self.config.num_vlm_layers,
-            self_attn_every_n_layers=self.config.self_attn_every_n_layers,
-            expert_width_multiplier=self.config.expert_width_multiplier,
-            self_attn_only_actions=self.config.self_attn_only_actions,
-        )
-        # self.paligemma_with_expert = self.configure_peft(paligemma_with_expert)
-        self.vlm_with_expert.configure_peft(config=self.config)
-        # Projections are float32
-        self.state_to_prefix = self.config.state_to_prefix
-        if self.state_to_prefix:
-            self.state_proj = nn.Linear(
-                self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size
-            )
-        else:
-            self.state_proj = nn.Linear(self.config.max_state_dim, self.vlm_with_expert.expert_hidden_size)
-        self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size)
-        self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim)
-
-        self.action_time_mlp_in = nn.Linear(
-            self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size
-        )
-        self.action_time_mlp_out = nn.Linear(
-            self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size
-        )
-
-        self.set_requires_grad()
-        # SmolVLM2 has: [fake_tok + crop_tok + crop + fake_tok + crop_tok ... + fake_tok + global_tok + global + fake_tok] + [second image] + ...
-        if any([k in self.config.vlm_model_name for k in ["SmolVLM-", "SmolVLA-"]]):
-            if "SmolVLM-Instruct" in self.config.vlm_model_name:
-                self.fake_image_token = 49152
-                self.global_image_token = [44, 13906, 29, 6266, 46]
-                self.global_image_start_token = torch.tensor(
-                    [self.fake_image_token] + self.global_image_token, dtype=torch.long
-                )
-            else:
-                self.fake_image_token = 49189
-                self.global_image_token = 49152
-                self.global_image_start_token = torch.tensor(
-                    [self.fake_image_token, self.global_image_token], dtype=torch.long
-                )
-        else:
-            self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id
-            self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id
-            self.global_image_start_token = torch.tensor(
-                [self.fake_image_token, self.global_image_token], dtype=torch.long
-            )
-
-        self.add_image_special_tokens = self.config.add_image_special_tokens
-        self.add_local_special_image_tokens = self.config.add_local_special_image_tokens
-        self.local_image_tokens = [
-            torch.tensor([self.fake_image_token, tok], dtype=torch.long)
-            for tok in [49153, 49154, 49155, 49159, 49160, 49161, 49165, 49166, 49167]
-        ]  # assume 3 x 3 grid
-
-        self.local_image_start_token = self.global_image_start_token
-        self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long)
-        self.prefix_length = self.config.prefix_length
-        self.include_past_images = self.config.n_obs_steps > 1 and "image" in self.config.past_obs_keys.split(
-            ","
-        )
-        self.num_past_images = self.config.n_obs_steps if self.include_past_images else 1
-        self.causal_attention_on_history = self.config.causal_attention_on_history
-
-    # def configure_peft(self, model):
-    #     # return model
-    #     self.peft_method = self.config.peft_method
-    #     if "lora" in self.peft_method:
-    #         peft_config = self.config.peft_config
-    #         target_modules = peft_config.target_modules
-    #         if not isinstance(target_modules, list):
-    #             target_modules = target_modules.split(",")
-    #         lora_config = LoraConfig(
-    #             task_type=TaskType.CAUSAL_LM,  # Based on the task type (e.g., language modeling, etc.)
-    #             r=peft_config.r,  # The rank of the low-rank adaptation
-    #             lora_alpha=peft_config.lora_alpha,  # Scaling factor
-    #             lora_dropout=peft_config.lora_dropout,  # Dropout applied to LoRA layers
-    #             target_modules=target_modules,  # The components where LoRA is applied
-    #             exclude_modules=["gemma_expert", "model.gemma_expert.model.layers"], # FIXME(mshukor): this does not work for now
-    #         )
-    #         # LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=1, lora_dropout=0, target_modules=["q_proj"], exclude_modules=["gemma_expert"])
-    #         self.lora_config = lora_config
-    #         # Apply LoRA and ensure only LoRA parameters are trainable
-
-    #         model = get_peft_model(model, lora_config)
-    #         assert self.config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here?
-    #         for name, param in model.named_parameters():
-    #             if (
-    #                 "lora" in name
-    #             ):  # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer
-    #                 param.requires_grad = True
-    #     return model
-
-    def set_requires_grad(self):
-        for params in self.state_proj.parameters():
-            params.requires_grad = self.config.train_state_proj
-
-    def sample_noise(self, shape, device):
-        noise = torch.normal(
-            mean=0.0,
-            std=1.0,
-            size=shape,
-            dtype=torch.float32,
-            device=device,
-        )
-        return noise
-
-    def sample_time(self, bsize, device):
-        time_beta = sample_beta(1.5, 1.0, bsize, device)
-        time = time_beta * 0.999 + 0.001
-        return time.to(dtype=torch.float32, device=device)
-
-    def embed_prefix(
-        self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Embed images with SigLIP and language tokens with embedding layer to prepare
-        for SmolVLM transformer processing.
-        """
-        # TODO: avoid list in python and torch.cat ; prefer pre-allocation with torch.empty
-        embs = []
-        pad_masks = []
-        att_masks = []
-        num_images = len(images) // self.num_past_images
-        # TODO: remove for loop
-        for img_idx, (
-            img,
-            img_mask,
-        ) in enumerate(zip(images, img_masks, strict=False)):
-            # FIXME(mshukor):  add special tokens for the history each history_steps or not
-            if self.add_image_special_tokens:
-                if self.add_local_special_image_tokens and img_idx % num_images != num_images - 1:
-                    local_token_idx = img_idx % num_images
-                    image_start_token = (
-                        self.vlm_with_expert.embed_language_tokens(
-                            self.local_image_tokens[local_token_idx].to(
-                                device=self.vlm_with_expert.vlm.device
-                            )
-                        )
-                        .unsqueeze(0)
-                        .expand(img.shape[0], -1, -1)
-                    )
-                else:
-                    image_start_token = (
-                        self.vlm_with_expert.embed_language_tokens(
-                            self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)
-                        )
-                        .unsqueeze(0)
-                        .expand(img.shape[0], -1, -1)
-                    )
-                image_start_mask = torch.ones_like(
-                    image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device
-                )
-                if self.causal_attention_on_history and img_idx % num_images == 0:
-                    att_masks += [1] + [0] * (image_start_mask.shape[-1] - 1)
-                else:
-                    att_masks += [0] * (image_start_mask.shape[-1])
-                embs.append(image_start_token)
-                pad_masks.append(image_start_mask)
-
-            img_emb = self.vlm_with_expert.embed_image(img)
-            img_emb = img_emb  # .to(dtype=self.vlm_with_expert.type)
-
-            # Normalize image embeddings
-            img_emb_dim = img_emb.shape[-1]
-            img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device)
-
-            bsize, num_img_embs = img_emb.shape[:2]
-            img_mask = img_mask[:, None].expand(bsize, num_img_embs)
-
-            # FIXME(mshukor): add special image tokens. Assume no tiling fake global images fake
-            # template <|im_start|>User: What actions? image tokens \nAssistant: or processor.apply_chat_template?
-            # processor.fake_image_token
-            # processor.global_image_token
-
-            embs.append(img_emb)
-            pad_masks.append(img_mask)
-
-            att_masks += [0] * (num_img_embs)
-            if self.add_image_special_tokens:
-                if not self.add_local_special_image_tokens or (
-                    self.add_local_special_image_tokens and img_idx % num_images == num_images - 1
-                ):
-                    image_end_token = (
-                        self.vlm_with_expert.embed_language_tokens(
-                            self.image_end_token.to(device=self.vlm_with_expert.vlm.device)
-                        )
-                        .unsqueeze(0)
-                        .expand(img.shape[0], -1, -1)
-                    )
-                    image_end_mask = torch.ones_like(
-                        image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device
-                    )
-                    embs.append(image_end_token)
-                    pad_masks.append(image_end_mask)
-                    att_masks += [0] * (image_end_mask.shape[1])
-        lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens)
-        # Normalize language embeddings
-        lang_emb_dim = lang_emb.shape[-1]
-        lang_emb = lang_emb * math.sqrt(lang_emb_dim)  # FIXME(mshukor): is this needed for smolvlm?
-
-        embs.append(lang_emb)
-        pad_masks.append(lang_masks)
-
-        # full attention between image and language inputs
-        num_lang_embs = lang_emb.shape[1]
-        att_masks += [0] * num_lang_embs
-
-        if state is not None and self.state_to_prefix:
-            state_emb = self.state_proj(state)
-            state_emb = (
-                state_emb[:, None, :] if state_emb.ndim == 2 else state_emb
-            )  # .to(dtype=self.vlm_with_expert.type)
-            embs.append(state_emb)
-            bsize = state_emb.shape[0]
-            dtype = state_emb.dtype
-            device = state_emb.device
-
-            states_seq_len = state_emb.shape[1]
-            state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device)
-            pad_masks.append(state_mask)
-
-            # Set attention masks so that image and language inputs do not attend to state or actions
-            # att_masks += [1] + [0]*(states_seq_len - 1)
-            att_masks += [1] * (states_seq_len)
-        embs = torch.cat(embs, dim=1)
-        pad_masks = torch.cat(pad_masks, dim=1)
-        att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
-        att_masks = att_masks[None, :]
-
-        seq_len = pad_masks.shape[1]
-        if seq_len < self.prefix_length:
-            embs = pad_tensor(embs, self.prefix_length, pad_value=0)
-            pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0)
-            att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0)
-
-        att_masks = att_masks.expand(bsize, -1)
-
-        return embs, pad_masks, att_masks
-
-    def embed_suffix(self, state, noisy_actions, timestep):
-        """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing."""
-        embs = []
-        pad_masks = []
-        att_masks = []
-
-        # Embed state
-        if not self.state_to_prefix:
-            state_emb = self.state_proj(state)
-            state_emb = (
-                state_emb[:, None, :] if state_emb.ndim == 2 else state_emb
-            )  # .to(dtype=self.vlm_with_expert.type)
-            embs.append(state_emb)
-            bsize = state_emb.shape[0]
-            dtype = state_emb.dtype
-            device = state_emb.device
-
-            states_seq_len = state_emb.shape[1]
-            state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device)
-            pad_masks.append(state_mask)
-
-            # Set attention masks so that image and language inputs do not attend to state or actions
-            att_masks += [1] + [0] * (states_seq_len - 1)
-
-        # Fuse timestep + action information using an MLP
-        action_emb = self.action_in_proj(noisy_actions)
-        device = action_emb.device
-        bsize = action_emb.shape[0]
-        dtype = action_emb.dtype
-        # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
-        time_emb = create_sinusoidal_pos_embedding(
-            timestep, self.vlm_with_expert.expert_hidden_size, min_period=4e-3, max_period=4.0, device=device
-        )
-        time_emb = time_emb.type(dtype=dtype)
-
-        time_emb = time_emb[:, None, :].expand_as(action_emb)
-        action_time_emb = torch.cat([action_emb, time_emb], dim=2)
-
-        action_time_emb = self.action_time_mlp_in(action_time_emb)
-        action_time_emb = F.silu(action_time_emb)  # swish == silu
-        action_time_emb = self.action_time_mlp_out(action_time_emb)
-
-        # Add to input tokens
-        embs.append(action_time_emb)
-
-        bsize, action_time_dim = action_time_emb.shape[:2]
-        action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device)
-        pad_masks.append(action_time_mask)
-
-        # Set attention masks so that image, language and state inputs do not attend to action tokens
-        if self.config.causal_action_attention_mask:
-            att_masks += [1] * self.config.chunk_size
-        else:
-            att_masks += [1] + ([0] * (self.config.chunk_size - 1))
-        embs = torch.cat(embs, dim=1)
-        pad_masks = torch.cat(pad_masks, dim=1)
-        att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
-        att_masks = att_masks[None, :].expand(bsize, len(att_masks))
-        return embs, pad_masks, att_masks
-
-    def forward(
-        self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
-    ) -> Tensor:
-        """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
-        if noise is None:
-            noise = self.sample_noise(actions.shape, actions.device)
-
-        if time is None:
-            time = self.sample_time(actions.shape[0], actions.device)
-
-        time_expanded = time[:, None, None]
-        if self.config.regression_loss:
-            # Hack to compare regression to flow matching
-            time = torch.zeros_like(time, dtype=time.dtype, device=time.device)
-            x_t = torch.zeros_like(actions, dtype=actions.dtype, device=actions.device)
-            u_t = actions
-        else:
-            x_t = time_expanded * noise + (1 - time_expanded) * actions
-            u_t = noise - actions
-        prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
-            images, img_masks, lang_tokens, lang_masks, state=state
-        )
-        suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, time)
-
-        pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
-        att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
-
-        att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
-        position_ids = torch.cumsum(pad_masks, dim=1) - 1
-        (_, suffix_out), _ = self.vlm_with_expert.forward(
-            attention_mask=att_2d_masks,
-            position_ids=position_ids,
-            past_key_values=None,
-            inputs_embeds=[prefix_embs, suffix_embs],
-            use_cache=False,
-            fill_kv_cache=False,
-        )
-        suffix_out = suffix_out[:, -self.config.chunk_size :]
-        # Original openpi code, upcast attention output
-        suffix_out = suffix_out.to(dtype=torch.float32)
-        v_t = self.action_out_proj(suffix_out)
-        if self.config.regression_loss:
-            losses = F.l1_loss(u_t, v_t, reduction="none")
-        else:
-            losses = F.mse_loss(u_t, v_t, reduction="none")
-        return losses
-
-    def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor:
-        """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
-        bsize = state.shape[0]
-        device = state.device
-
-        if noise is None:
-            actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim)
-            noise = self.sample_noise(actions_shape, device)
-
-        prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
-            images, img_masks, lang_tokens, lang_masks, state=state
-        )
-        prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
-        prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
-        # Compute image and language key value cache
-        _, past_key_values = self.vlm_with_expert.forward(
-            attention_mask=prefix_att_2d_masks,
-            position_ids=prefix_position_ids,
-            past_key_values=None,
-            inputs_embeds=[prefix_embs, None],
-            use_cache=self.config.use_cache,
-            fill_kv_cache=True,
-        )
-        if self.config.regression_loss:
-            x_t = torch.zeros_like(noise, dtype=torch.float32, device=device)
-            expanded_time = torch.zeros(bsize, dtype=torch.float32, device=device)
-            x_t = self.denoise_step(
-                state,
-                prefix_pad_masks,
-                past_key_values,
-                x_t,
-                expanded_time,
-            )
-        else:
-            dt = -1.0 / self.config.num_steps
-            dt = torch.tensor(dt, dtype=torch.float32, device=device)
-
-            x_t = noise
-            time = torch.tensor(1.0, dtype=torch.float32, device=device)
-            while time >= -dt / 2:
-                expanded_time = time.expand(bsize)
-                v_t = self.denoise_step(
-                    state,
-                    prefix_pad_masks,
-                    past_key_values,
-                    x_t,
-                    expanded_time,
-                )
-
-                # Euler step
-                x_t += dt * v_t
-                time += dt
-        return x_t
-
-    def denoise_step(
-        self,
-        state,
-        prefix_pad_masks,
-        past_key_values,
-        x_t,
-        timestep,
-    ):
-        """Apply one denoising step of the noise `x_t` at a given timestep."""
-        suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(state, x_t, timestep)
-
-        suffix_len = suffix_pad_masks.shape[1]
-        batch_size = prefix_pad_masks.shape[0]
-        prefix_len = prefix_pad_masks.shape[1]
-        prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
-
-        suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
-
-        full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
-        prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
-        position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
-
-        outputs_embeds, _ = self.vlm_with_expert.forward(
-            attention_mask=full_att_2d_masks,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=[None, suffix_embs],
-            use_cache=self.config.use_cache,
-            fill_kv_cache=False,
-        )
-        suffix_out = outputs_embeds[1]
-        suffix_out = suffix_out[:, -self.config.chunk_size :]
-        suffix_out = suffix_out.to(dtype=torch.float32)
-        v_t = self.action_out_proj(suffix_out)
-        return v_t
diff --git a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py b/src/lerobot/policies/smolpi0/smolvlm_with_expert.py
deleted file mode 100644
index 0ccdcccc8..000000000
--- a/src/lerobot/policies/smolpi0/smolvlm_with_expert.py
+++ /dev/null
@@ -1,920 +0,0 @@
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-from functools import partial
-from typing import List, Optional, Union
-
-import torch
-import torch.nn.functional as F  # noqa: N812
-import torch.version
-from peft import LoraConfig, TaskType, get_peft_model
-from pytest import Cache
-from torch import nn
-from transformers import (
-    AutoConfig,
-    AutoModel,
-    AutoModelForImageTextToText,
-    AutoModelForVision2Seq,
-    AutoProcessor,
-    SmolVLMForConditionalGeneration,
-)
-
-from lerobot.policies.smolpi0.flex_attention import flex_attention_forward
-
-
-def _round_up_to_multiple(x, multiple):
-    return (x + multiple - 1) // multiple * multiple
-
-
-def apply_rope(x, positions, max_wavelength=10_000):
-    """
-    Applies RoPE positions [B, L] to x [B, L, H, D].
-    """
-    d_half = x.shape[-1] // 2
-    device = x.device
-    dtype = x.dtype
-    x = x.to(torch.float32)
-
-    freq_exponents = (2.0 / x.shape[-1]) * torch.arange(d_half, dtype=torch.float32, device=device)
-    timescale = max_wavelength**freq_exponents
-    radians = positions[..., None].to(torch.float32) / timescale[None, None, :].to(torch.float32)
-
-    radians = radians[..., None, :]
-
-    sin = torch.sin(radians)  # .to(dtype=dtype)
-    cos = torch.cos(radians)  # .to(dtype=dtype)
-
-    x1, x2 = x.split(d_half, dim=-1)
-    res = torch.empty_like(x)
-    res[..., :d_half] = x1 * cos - x2 * sin
-    res[..., d_half:] = x2 * cos + x1 * sin
-
-    return res.to(dtype)
-
-
-# class SmolVLMWithExpertConfig(PretrainedConfig):
-#     model_type = "SmolVLMWithExpertModel"
-#     sub_configs = {"smolvlm_config": AutoConfig, "lm_expert_config": AutoConfig}
-
-#     def __init__(
-#         self,
-#         smolvlm_config: dict | None = None,
-#         lm_expert_config: dict | None = None,
-#         freeze_vision_encoder: bool = True,
-#         train_expert_only: bool = True,
-#         attention_implementation: str = "eager",
-#         load_vlm_weights: bool = False,
-#         **kwargs,
-#     ):
-#         self.load_vlm_weights = load_vlm_weights
-#         self.freeze_vision_encoder = freeze_vision_encoder
-#         self.train_expert_only = train_expert_only
-#         self.attention_implementation = attention_implementation
-
-#         if smolvlm_config is None:
-#             # Default config from Pi0
-#             self.smolvlm_config = CONFIG_MAPPING["smolvlm"](
-#                 transformers_version="4.48.1",
-#                 _vocab_size=257152,
-#                 bos_token_id=2,
-#                 eos_token_id=1,
-#                 hidden_size=2048,
-#                 image_token_index=257152,
-#                 model_type="smolvlm",
-#                 pad_token_id=0,
-#                 projection_dim=2048,
-#                 text_config={
-#                     "hidden_activation": "gelu_pytorch_tanh",
-#                     "hidden_size": 2048,
-#                     "intermediate_size": 16384,
-#                     "model_type": "gemma",
-#                     "num_attention_heads": 8,
-#                     "num_hidden_layers": 18,
-#                     "num_image_tokens": 256,
-#                     "num_key_value_heads": 1,
-#                     "torch_dtype": "float32",
-#                     "vocab_size": 257152,
-#                 },
-#                 vision_config={
-#                     "hidden_size": 1152,
-#                     "intermediate_size": 4304,
-#                     "model_type": "siglip_vision_model",
-#                     "num_attention_heads": 16,
-#                     "num_hidden_layers": 27,
-#                     "num_image_tokens": 256,
-#                     "patch_size": 14,
-#                     "projection_dim": 2048,
-#                     "projector_hidden_act": "gelu_fast",
-#                     "torch_dtype": "float32",
-#                     "vision_use_head": False,
-#                 },
-#             )
-#         elif isinstance(self.paligemma_config, dict):
-#             # Override Pi0 default config for PaliGemma
-#             if "model_type" not in gemma_expert_config:
-#                 paligemma_config["model_type"] = "paligemma"
-
-#             cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]]
-#             self.paligemma_config = cfg_cls(**paligemma_config)
-
-#         if gemma_expert_config is None:
-#             # Default config from Pi0
-#             self.gemma_expert_config = CONFIG_MAPPING["gemma"](
-#                 attention_bias=False,
-#                 attention_dropout=0.0,
-#                 bos_token_id=2,
-#                 eos_token_id=1,
-#                 head_dim=256,
-#                 hidden_act="gelu_pytorch_tanh",
-#                 hidden_activation="gelu_pytorch_tanh",
-#                 hidden_size=1024,
-#                 initializer_range=0.02,
-#                 intermediate_size=4096,
-#                 max_position_embeddings=8192,
-#                 model_type="gemma",
-#                 num_attention_heads=8,
-#                 num_hidden_layers=18,
-#                 num_key_value_heads=1,
-#                 pad_token_id=0,
-#                 rms_norm_eps=1e-06,
-#                 rope_theta=10000.0,
-#                 torch_dtype="float32",
-#                 transformers_version="4.48.1",
-#                 use_cache=True,
-#                 vocab_size=257152,
-#             )
-#         elif isinstance(self.gemma_expert_config, dict):
-#             # Override Pi0 default config for Gemma Expert
-#             if "model_type" not in gemma_expert_config:
-#                 gemma_expert_config["model_type"] = "gemma"
-
-#             cfg_cls = CONFIG_MAPPING[paligemma_config["model_type"]]
-#             self.gemma_expert_config = cfg_cls(**gemma_expert_config)
-
-#         super().__init__(**kwargs)
-
-#     def __post_init__(self):
-#         super().__post_init__()
-#         if self.train_expert_only and not self.freeze_vision_encoder:
-#             raise ValueError(
-#                 "You set `freeze_vision_encoder=False` and `train_expert_only=True` which are not compatible."
-#             )
-
-#         if self.attention_implementation not in ["eager", "fa2", "flex"]:
-#             raise ValueError(
-#                 f"Wrong value provided for `attention_implementation` ({self.attention_implementation}). Expected 'eager', 'fa2' or 'flex'."
-#             )
-
-
-def get_intermediate_size(hidden_dim, ffn_dim_multiplier=4, multiple_of=256):
-    hidden_dim = int(2 * hidden_dim / 3)
-    hidden_dim = int(ffn_dim_multiplier * hidden_dim)
-    hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-    return hidden_dim
-
-
-class SmolVLMWithExpertModel(nn.Module):
-    # config_class = PaliGemmaWithExpertConfig
-
-    def __init__(
-        self,
-        model_id: str = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
-        load_vlm_weights: bool = True,
-        train_expert_only: bool = True,
-        freeze_vision_encoder: bool = False,
-        attention_implementation: str = "eager",
-        attention_mode: str = "self_attn",
-        num_expert_layers: int = -1,
-        num_vlm_layers: int = -1,
-        self_attn_every_n_layers: int = -1,
-        expert_width_multiplier: float = 0.5,
-        self_attn_only_actions: bool = False,
-    ):
-        super().__init__()
-        if load_vlm_weights:
-            print(f"Loading  {model_id} weights ...")
-            if "SmolVLM-" in model_id:
-                self.vlm = AutoModelForVision2Seq.from_pretrained(
-                    model_id,
-                    device_map="cuda",
-                    torch_dtype="bfloat16",
-                    low_cpu_mem_usage=True,
-                )
-            else:
-                # model_id = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
-                self.vlm = AutoModelForImageTextToText.from_pretrained(
-                    model_id,
-                    device_map="cuda",
-                    torch_dtype="bfloat16",
-                    low_cpu_mem_usage=True,
-                    # attn_implementation="eager",
-                    # attn_implementation="flash_attention_2"
-                )
-            config = self.vlm.config
-        else:
-            config = AutoConfig.from_pretrained(model_id)
-            self.vlm = SmolVLMForConditionalGeneration(config=config)
-        self.processor = AutoProcessor.from_pretrained(model_id)
-        if num_vlm_layers > 0:
-            print(f"Reducing the number of VLM layers to {num_vlm_layers} ...")
-            self.get_vlm_model().text_model.layers = self.get_vlm_model().text_model.layers[:num_vlm_layers]
-        self.num_vlm_layers = len(self.get_vlm_model().text_model.layers)
-        self.config = config
-        # Smaller lm expert
-        lm_expert_config = copy.deepcopy(config.text_config)
-        hidden_size = lm_expert_config.hidden_size
-        lm_expert_config.hidden_size = int(hidden_size * expert_width_multiplier)  # hidden_size // 2
-        lm_expert_config.intermediate_size = get_intermediate_size(int(hidden_size * expert_width_multiplier))
-        lm_expert_config.num_hidden_layers = self.num_vlm_layers
-        if num_expert_layers > 0:
-            assert len(self.get_vlm_model().text_model.layers) % num_expert_layers == 0, (
-                f"Number of layers in the VLM {len(self.get_vlm_model().text_model.layers)} are not multiple of num_expert_layers {num_expert_layers}"
-            )
-            lm_expert_config.num_hidden_layers = num_expert_layers
-        # lm_expert_config.head_dim = lm_expert_config.head_dim * 2
-        self.lm_expert = AutoModel.from_config(lm_expert_config)
-
-        self.num_expert_layers = len(self.lm_expert.layers)
-        self.self_attn_every_n_layers = self_attn_every_n_layers
-        self.self_attn_only_actions = self_attn_only_actions
-        if "cross" in attention_mode:
-            # Reshape qkv projections to have the same input dimension as the vlm
-            for layer_idx in range(len(self.lm_expert.layers)):
-                if self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0:
-                    continue
-                self.lm_expert.layers[layer_idx].self_attn.k_proj = nn.Linear(
-                    config.text_config.num_key_value_heads * config.text_config.head_dim,
-                    lm_expert_config.num_key_value_heads * lm_expert_config.head_dim,
-                    bias=lm_expert_config.attention_bias,
-                )
-                self.lm_expert.layers[layer_idx].self_attn.v_proj = nn.Linear(
-                    config.text_config.num_key_value_heads * config.text_config.head_dim,
-                    lm_expert_config.num_key_value_heads * lm_expert_config.head_dim,
-                    bias=lm_expert_config.attention_bias,
-                )
-        # Remove unused embed_tokens
-        self.lm_expert.embed_tokens = None
-
-        self.num_attention_heads = self.config.text_config.num_attention_heads
-        self.num_key_value_heads = self.config.text_config.num_key_value_heads
-
-        self.freeze_vision_encoder = freeze_vision_encoder
-        self.train_expert_only = train_expert_only
-        self.attention_implementation = attention_implementation
-        self.attention_mode = attention_mode
-        self.expert_hidden_size = lm_expert_config.hidden_size
-        # self.to_bfloat16_like_physical_intelligence()
-        self.set_requires_grad()
-
-    def configure_peft(self, config):
-        # return model
-        self.peft_method = config.peft_method
-        self.peft_target_model = config.peft_target_model
-        if "lora" in self.peft_method:
-            peft_config = config.peft_config
-            target_modules = peft_config.target_modules
-            if not isinstance(target_modules, list):
-                target_modules = target_modules.split(",")
-            lora_config = LoraConfig(
-                task_type=TaskType.CAUSAL_LM,  # Based on the task type (e.g., language modeling, etc.)
-                r=peft_config.r,  # The rank of the low-rank adaptation
-                lora_alpha=peft_config.lora_alpha,  # Scaling factor
-                lora_dropout=peft_config.lora_dropout,  # Dropout applied to LoRA layers
-                target_modules=target_modules,  # The components where LoRA is applied
-                exclude_modules=[
-                    "lm_expert",
-                    "model.lm_expert.model.layers",
-                ],  # FIXME(mshukor): this does not work for now
-            )
-            self.lora_config = lora_config
-            # Apply LoRA and ensure only LoRA parameters are trainable
-            if "text" in self.peft_target_model:
-                self.get_vlm_model().text_model = get_peft_model(self.get_vlm_model().text_model, lora_config)
-            else:
-                self.vlm = get_peft_model(self.vlm, lora_config)
-            # assert config.train_expert_only, "Backbone should be frozen and only lora parameters are " # FIXME(mshukor): handle this here?
-            for name, param in self.vlm.named_parameters():
-                if (
-                    "lora" in name and "text_model.model.layers.17" not in name
-                ):  # lm_head is not a parameter in most LLMs becasue it's tied to the embedding layer
-                    param.requires_grad = True
-                else:
-                    param.requires_grad = False
-
-    def merge_lora_weights(self):
-        """
-        Merge LoRA weights into the base model.
-        """
-        if "text" in self.peft_target_model:
-            self.get_vlm_model().text_model = self.get_vlm_model().text_model.merge_and_unload()
-        else:
-            self.vlm = self.vlm.merge_and_unload()
-
-    def get_vlm_model(
-        self,
-    ):
-        if hasattr(self.vlm.model, "model"):  # When using peft
-            return self.vlm.model.model
-        else:
-            return self.vlm.model
-
-    def set_requires_grad(self):
-        if self.freeze_vision_encoder:
-            self.get_vlm_model().vision_model.eval()
-            for params in self.get_vlm_model().vision_model.parameters():
-                params.requires_grad = False
-        if self.train_expert_only:
-            self.vlm.eval()
-            for params in self.vlm.parameters():
-                params.requires_grad = False
-        else:
-            # To avoid unused params issue with distributed training
-            last_layers = [self.num_vlm_layers - 1]
-            if (
-                self.num_vlm_layers != self.num_expert_layers
-                and self.num_vlm_layers % self.num_expert_layers == 0
-            ):
-                last_layers.append(self.num_vlm_layers - 2)
-            frozen_layers = [
-                "lm_head",
-                "text_model.model.norm.weight",
-            ]
-            for layer in last_layers:
-                frozen_layers.append(f"text_model.model.layers.{layer}.")
-
-            for name, params in self.vlm.named_parameters():
-                if any([k in name for k in frozen_layers]):
-                    params.requires_grad = False
-        # To avoid unused params issue with distributed training
-        for name, params in self.lm_expert.named_parameters():
-            if any(
-                [
-                    k in name
-                    for k in [
-                        "lm_head",
-                    ]
-                ]
-            ):
-                params.requires_grad = False
-
-    def train(self, mode: bool = True):
-        super().train(mode)
-
-        if self.freeze_vision_encoder:
-            self.get_vlm_model().vision_model.eval()
-
-        if self.train_expert_only:
-            self.vlm.eval()
-
-    # def to_bfloat16_like_physical_intelligence(self):
-    #     self.vlm = self.vlm.to(dtype=torch.bfloat16)
-
-    #     params_to_change_dtype = [
-    #         "language_model.model.layers",
-    #         "gemma_expert.model.layers",
-    #         "vision_tower",
-    #         "multi_modal",
-    #     ]
-    #     for name, param in self.named_parameters():
-    #         if any(selector in name for selector in params_to_change_dtype):
-    #             param.data = param.data.to(dtype=torch.bfloat16)
-
-    def embed_image(self, image: torch.Tensor):
-        patch_attention_mask = None
-        # # FIXME(mshukor): probably not needed as we don't have padded images here
-        # pixel_values = image.unsqueeze(1)
-        # batch_size, num_images, num_channels, height, width = pixel_values.shape
-        # pixel_values = pixel_values
-        # pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
-
-        # # Remove padding images - padding images are full 0.
-        # nb_values_per_image = pixel_values.shape[1:].numel()
-        # real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
-
-        # if not any(real_images_inds):
-        #     # no images, leave one empty image.
-        #     real_images_inds[0] = True
-
-        # pixel_values = pixel_values[real_images_inds].contiguous()
-
-        # # Handle the vision attention mask
-
-        # pixel_attention_mask = torch.ones(
-        #     size=[pixel_values.shape[i] for i in (0, 2, 3)],
-        #     dtype=torch.bool,
-        #     device=pixel_values.device,
-        # )
-
-        # patch_size = self.vlm.config.vision_config.patch_size
-        # patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
-        # patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
-        # patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
-
-        # FIXME(mshukor): add special image tokens specific to smolvlm
-        # Get sequence from the vision encoder
-        image_hidden_states = (
-            self.get_vlm_model()
-            .vision_model(
-                pixel_values=image.to(dtype=self.get_vlm_model().vision_model.dtype),
-                patch_attention_mask=patch_attention_mask,
-            )
-            .last_hidden_state
-        )
-        # Modality projection & resampling
-        image_hidden_states = self.get_vlm_model().connector(image_hidden_states)
-        return image_hidden_states
-
-    def embed_language_tokens(self, tokens: torch.Tensor):
-        return self.get_vlm_model().text_model.get_input_embeddings()(tokens)
-
-    def forward_attn_layer(
-        self,
-        model_layers,
-        inputs_embeds,
-        layer_idx,
-        position_ids,
-        attention_mask,
-        batch_size,
-        head_dim,
-        use_cache: bool = True,
-        fill_kv_cache: bool = True,
-        past_key_values=None,
-    ) -> list[torch.Tensor]:
-        query_states = []
-        key_states = []
-        value_states = []
-        for i, hidden_states in enumerate(inputs_embeds):
-            layer = model_layers[i][layer_idx]
-            if hidden_states is None or layer is None:
-                continue
-
-            # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype)
-            # hidden_states = hidden_states * normalizer
-            hidden_states = layer.input_layernorm(hidden_states)
-
-            input_shape = hidden_states.shape[:-1]
-            hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
-
-            hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype)
-            query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
-            key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
-            value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
-
-            query_states.append(query_state)
-            key_states.append(key_state)
-            value_states.append(value_state)
-
-        # FIXME(mshukor): self attention always when having only the prefix
-        # B,L,H,D with L sequence length, H number of heads, D head dim
-        # concatenate on the number of embeddings/tokens
-        query_states = torch.cat(query_states, dim=1)
-        key_states = torch.cat(key_states, dim=1)
-        value_states = torch.cat(value_states, dim=1)
-        # FIXME(mshukor): seq should be B, H, L, D ?
-        seq_len = query_states.shape[1]
-        if seq_len < position_ids.shape[1]:
-            _position_ids = position_ids[:, :seq_len]
-            _attention_mask = attention_mask[:, :seq_len, :seq_len]
-        else:
-            _position_ids = position_ids
-            _attention_mask = attention_mask
-
-        if self.self_attn_only_actions:
-            attention_mask_ = _attention_mask.clone()
-            position_ids_ = _position_ids.clone()
-            if inputs_embeds[1] is not None:
-                suffix_len = inputs_embeds[1].shape[1]
-                attention_mask_[:, -suffix_len:, :-suffix_len] = False
-                position_ids_[:, -suffix_len:] = (
-                    _position_ids[:, -suffix_len:] - _position_ids[:, -suffix_len][:, None]
-                )
-        else:
-            attention_mask_ = _attention_mask
-            position_ids_ = _position_ids
-
-        query_states = apply_rope(
-            query_states, position_ids_
-        )  # FIXME(mshukor): this assumes we have always the vlm features?
-        key_states = apply_rope(key_states, position_ids_)
-
-        if use_cache and past_key_values is None:
-            past_key_values = {}
-
-        if use_cache:
-            if fill_kv_cache:
-                past_key_values[layer_idx] = {
-                    "key_states": key_states,
-                    "value_states": value_states,
-                }
-            else:
-                # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
-                # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
-                # the max len, then we (for instance) double the cache size. This implementation already exists
-                # in `transformers`. (molbap)
-                key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
-                value_states = torch.cat([past_key_values[layer_idx]["value_states"], value_states], dim=1)
-
-        attention_interface = self.get_attention_interface()
-
-        att_output = attention_interface(
-            attention_mask_, batch_size, head_dim, query_states, key_states, value_states
-        )
-        # att_output = att_output.to(dtype=models[i].dtype)
-
-        return [att_output], past_key_values
-
-    def forward_cross_attn_layer(
-        self,
-        model_layers,
-        inputs_embeds,
-        layer_idx,
-        position_ids,
-        attention_mask,
-        batch_size,
-        head_dim,
-        use_cache: bool = True,
-        fill_kv_cache: bool = True,
-        past_key_values=None,
-    ) -> list[torch.Tensor]:
-        attention_interface = self.get_attention_interface()
-
-        att_outputs = []
-        assert len(inputs_embeds) == 2 or (use_cache and past_key_values is not None and not fill_kv_cache), (
-            f"Both len(inputs_embeds) == {len(inputs_embeds)} and past_key_values is {past_key_values}"
-        )
-
-        if len(inputs_embeds) == 2 and not past_key_values:
-            # Prefix attention
-            seq_len = inputs_embeds[0].shape[1]
-            position_id, expert_position_id = position_ids[:, :seq_len], position_ids[:, seq_len:]
-            prefix_attention_mask = attention_mask[:, :seq_len, :seq_len]
-
-            layer = model_layers[0][layer_idx]
-
-            hidden_states = layer.input_layernorm(inputs_embeds[0])
-
-            input_shape = hidden_states.shape[:-1]
-            hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
-
-            hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype)
-            query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
-            key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
-            value_states = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
-
-            # B,L,H,D with L sequence length, H number of heads, D head dim
-            query_states = apply_rope(query_state, position_id)
-            key_states = apply_rope(key_state, position_id)
-
-            att_output = attention_interface(
-                prefix_attention_mask, batch_size, head_dim, query_states, key_states, value_states
-            )
-            att_outputs.append(att_output)
-        else:
-            expert_position_id = position_ids
-
-        if use_cache and past_key_values is None:
-            past_key_values = {}
-
-        if use_cache:
-            if fill_kv_cache:
-                past_key_values[layer_idx] = {
-                    "key_states": key_states,
-                    "value_states": value_states,
-                }
-            else:
-                # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
-                # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
-                # the max len, then we (for instance) double the cache size. This implementation already exists
-                # in `transformers`. (molbap)
-                key_states = past_key_values[layer_idx]["key_states"]
-                value_states = past_key_values[layer_idx]["value_states"]
-
-        # Expert
-        expert_layer = model_layers[1][layer_idx]
-        if expert_layer is not None:
-            expert_hidden_states = expert_layer.input_layernorm(inputs_embeds[1])
-
-            expert_input_shape = expert_hidden_states.shape[:-1]
-            expert_hidden_shape = (*expert_input_shape, -1, expert_layer.self_attn.head_dim)
-
-            expert_hidden_states = expert_hidden_states.to(dtype=expert_layer.self_attn.q_proj.weight.dtype)
-            expert_query_state = expert_layer.self_attn.q_proj(expert_hidden_states).view(expert_hidden_shape)
-
-            _key_states = key_states.to(dtype=expert_layer.self_attn.k_proj.weight.dtype).view(
-                *key_states.shape[:2], -1
-            )
-            expert_key_states = expert_layer.self_attn.k_proj(_key_states).view(
-                *_key_states.shape[:-1], -1, expert_layer.self_attn.head_dim
-            )  # k_proj should have same dim as kv
-
-            _value_states = value_states.to(dtype=expert_layer.self_attn.v_proj.weight.dtype).view(
-                *value_states.shape[:2], -1
-            )
-            expert_value_states = expert_layer.self_attn.v_proj(_value_states).view(
-                *_value_states.shape[:-1], -1, expert_layer.self_attn.head_dim
-            )
-
-            expert_position_id = (
-                expert_position_id - torch.min(expert_position_id, dim=1, keepdim=True).values
-            )  # start from 0
-            expert_attention_mask = attention_mask[
-                :, -inputs_embeds[1].shape[1] :, : expert_key_states.shape[1] :
-            ]  # take into account kv
-
-            expert_query_states = apply_rope(expert_query_state, expert_position_id)
-            # expert_key_states = apply_rope(expert_key_state, expert_position_id)
-
-            att_output = attention_interface(
-                expert_attention_mask,
-                batch_size,
-                head_dim,
-                expert_query_states,
-                expert_key_states,
-                expert_value_states,
-            )
-            att_outputs.append(att_output)
-        else:
-            att_outputs.append(None)
-
-        # att_output = att_output.to(dtype=models[i].dtype)
-        return att_outputs, past_key_values
-
-    def get_model_layers(self, models: list) -> list:  # FIXME(mshukor): is this efficient?
-        vlm_layers = []
-        expert_layers = []
-        multiple_of = self.num_vlm_layers // self.num_expert_layers
-        for i in range(self.num_vlm_layers):
-            if multiple_of > 0 and i > 0 and i % multiple_of != 0:
-                expert_layer = None
-            else:
-                expert_layer_index = i // multiple_of if multiple_of > 0 else i
-                expert_layer = models[1].layers[expert_layer_index]
-            vlm_layers.append(models[0].layers[i])
-            expert_layers.append(expert_layer)
-        return [vlm_layers, expert_layers]
-
-    # TODO: break down this huge forward into modules or functions
-    def forward(
-        self,
-        attention_mask: torch.Tensor | None = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: list[torch.FloatTensor] | Cache | None = None,
-        inputs_embeds: list[torch.FloatTensor] = None,
-        use_cache: bool | None = None,
-        fill_kv_cache: bool | None = None,
-    ):
-        models = [self.get_vlm_model().text_model, self.lm_expert]
-        model_layers = self.get_model_layers(models)
-        for hidden_states in inputs_embeds:
-            # TODO this is very inefficient
-            # dtype is always the same, batch size too (if > 1 len)
-            # device could be trickier in multi gpu edge cases but that's it
-            if hidden_states is None:
-                continue
-            batch_size = hidden_states.shape[0]
-
-        # # Pad prefix embds so that prefix_embs + prefix_embs len are multiple of 128, pad left or right depending on the gen or train
-        if self.attention_implementation == "flex":
-            if (
-                inputs_embeds[0] is not None
-                and inputs_embeds[1] is not None
-                and attention_mask.shape[-1] == attention_mask.shape[-2]
-                and past_key_values is None
-            ):  # Now only during training
-                seq_len = inputs_embeds[0].shape[1] + inputs_embeds[1].shape[1]
-                padded_seq_len = _round_up_to_multiple(
-                    seq_len, 128
-                )  # FIXME(mshukor): more efficient to have a fixed seq len?
-                b_mask, q_len, kv_len = attention_mask.shape  # The shape of your mask
-                pad = padded_seq_len - q_len
-                attention_mask = F.pad(attention_mask, (0, pad, 0, pad), value=True)
-                inputs_embeds[0] = F.pad(inputs_embeds[0], (0, 0, 0, pad), value=0.0)
-                position_ids = F.pad(position_ids, (0, pad), value=0)
-
-        # RMSNorm
-        num_layers = self.num_vlm_layers
-        head_dim = self.vlm.config.text_config.head_dim
-        for layer_idx in range(num_layers):
-            if (
-                fill_kv_cache
-                or "cross" not in self.attention_mode
-                or (self.self_attn_every_n_layers > 0 and layer_idx % self.self_attn_every_n_layers == 0)
-            ):
-                att_outputs, past_key_values = self.forward_attn_layer(
-                    model_layers,
-                    inputs_embeds,
-                    layer_idx,
-                    position_ids,
-                    attention_mask,
-                    batch_size,
-                    head_dim,
-                    use_cache=use_cache,
-                    fill_kv_cache=fill_kv_cache,
-                    past_key_values=past_key_values,
-                )
-            else:
-                att_outputs, past_key_values = self.forward_cross_attn_layer(
-                    model_layers,
-                    inputs_embeds,
-                    layer_idx,
-                    position_ids,
-                    attention_mask,
-                    batch_size,
-                    head_dim,
-                    use_cache=use_cache,
-                    fill_kv_cache=fill_kv_cache,
-                    past_key_values=past_key_values,
-                )
-            # query_states = []
-            # key_states = []
-            # value_states = []
-            # for i, hidden_states in enumerate(inputs_embeds):
-            #     if hidden_states is None:
-            #         continue
-            #     layer = models[i].layers[layer_idx]
-            #     # normalizer = torch.tensor(models[i].config.hidden_size**0.5, dtype=hidden_states.dtype)
-            #     # hidden_states = hidden_states * normalizer
-            #     hidden_states = layer.input_layernorm(hidden_states)
-
-            #     input_shape = hidden_states.shape[:-1]
-            #     hidden_shape = (*input_shape, -1, layer.self_attn.head_dim)
-
-            #     hidden_states = hidden_states.to(dtype=layer.self_attn.q_proj.weight.dtype)
-            #     query_state = layer.self_attn.q_proj(hidden_states).view(hidden_shape)
-            #     key_state = layer.self_attn.k_proj(hidden_states).view(hidden_shape)
-            #     value_state = layer.self_attn.v_proj(hidden_states).view(hidden_shape)
-
-            #     query_states.append(query_state)
-            #     key_states.append(key_state)
-            #     value_states.append(value_state)
-
-            # # FIXME(mshukor): self attention always when having only the prefix
-            # # B,L,H,D with L sequence length, H number of heads, D head dim
-            # # concatenate on the number of embeddings/tokens
-            # query_states = torch.cat(query_states, dim=1)
-            # key_states = torch.cat(key_states, dim=1)
-            # value_states = torch.cat(value_states, dim=1)
-            # # FIXME(mshukor): seq should be B, H, L, D ?
-            # query_states = apply_rope(query_states, position_ids)
-            # key_states = apply_rope(key_states, position_ids)
-
-            # if use_cache and past_key_values is None:
-            #     past_key_values = {}
-
-            # if use_cache:
-            #     if fill_kv_cache:
-            #         past_key_values[layer_idx] = {
-            #             "key_states": key_states,
-            #             "value_states": value_states,
-            #         }
-            #     else:
-            #         # TODO here, some optimization can be done - similar to a `StaticCache` we can declare the `max_len` before.
-            #         # so we create an empty cache, with just one cuda malloc, and if (in autoregressive case) we reach
-            #         # the max len, then we (for instance) double the cache size. This implementation already exists
-            #         # in `transformers`. (molbap)
-            #         key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
-            #         value_states = torch.cat(
-            #             [past_key_values[layer_idx]["value_states"], value_states], dim=1
-            #         )
-
-            # attention_interface = self.get_attention_interface()
-            # att_output = attention_interface(
-            #     attention_mask, batch_size, head_dim, query_states, key_states, value_states
-            # )
-
-            # att_output = att_output.to(dtype=models[i].dtype)
-
-            # first part of att_output is prefix (up to sequence length, [:, 0:prefix_seq_len])
-            outputs_embeds = []
-            start = 0
-            for i, hidden_states in enumerate(inputs_embeds):
-                # layer = models[i].layers[layer_idx]
-                layer = model_layers[i][layer_idx]
-                att_output = (
-                    att_outputs[i] if i < len(att_outputs) else att_outputs[0]
-                )  # in case of self_attn
-                if hidden_states is not None:
-                    if layer is None:
-                        outputs_embeds.append(hidden_states)
-                        continue
-                    end = start + hidden_states.shape[1]
-
-                    if att_output.dtype != layer.self_attn.o_proj.weight.dtype:
-                        att_output = att_output.to(layer.self_attn.o_proj.weight.dtype)
-                    att_out = att_output[:, start:end]
-                    out_emb = layer.self_attn.o_proj(att_out)
-
-                    # TODO: first dropout (by default 0.0)
-                    # first residual
-                    out_emb += hidden_states
-                    after_first_residual = out_emb.clone()
-
-                    out_emb = layer.post_attention_layernorm(out_emb)
-                    out_emb = layer.mlp(out_emb)
-
-                    # TODO: second dropout (by default 0.0)
-
-                    # second residual
-                    out_emb += after_first_residual
-
-                    outputs_embeds.append(out_emb)
-
-                    start = end if len(att_outputs) == 1 else 0
-                else:
-                    outputs_embeds.append(None)
-
-            inputs_embeds = outputs_embeds
-
-        # final norm
-        outputs_embeds = []
-        for i, hidden_states in enumerate(inputs_embeds):
-            if hidden_states is not None:
-                out_emb = models[i].norm(hidden_states)
-                outputs_embeds.append(out_emb)
-            else:
-                outputs_embeds.append(None)
-        return outputs_embeds, past_key_values
-
-    def get_attention_interface(self):
-        if self.attention_implementation == "fa2":
-            attention_interface = self.flash_attention_forward
-        elif self.attention_implementation == "flex":
-            attention_interface = partial(
-                flex_attention_forward,
-                num_att_heads=self.num_attention_heads,
-                num_key_value_heads=self.num_key_value_heads,
-            )
-        else:
-            attention_interface = self.eager_attention_forward
-        return attention_interface
-
-    def flash_attention_forward(
-        self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
-    ):
-        raise NotImplementedError("FA2 is not implemented (yet)")
-
-    def eager_attention_forward(
-        self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
-    ):
-        num_att_heads = self.num_attention_heads
-        num_key_value_heads = self.num_key_value_heads
-        num_key_value_groups = num_att_heads // num_key_value_heads
-
-        # query_states: batch_size, sequence_length, num_att_head, head_dim
-        # key_states: batch_size, sequence_length, num_key_value_head, head_dim
-        # value_states: batch_size, sequence_length, num_key_value_head, head_dim
-        sequence_length = key_states.shape[1]
-
-        key_states = key_states[:, :, :, None, :].expand(
-            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
-        )
-        key_states = key_states.reshape(
-            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
-        )
-
-        value_states = value_states[:, :, :, None, :].expand(
-            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
-        )
-        value_states = value_states.reshape(
-            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
-        )
-
-        # Attention here is upcasted to float32 to match the original eager implementation.
-
-        query_states = query_states.to(dtype=torch.float32)
-        key_states = key_states.to(dtype=torch.float32)
-
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-
-        att_weights = torch.matmul(query_states, key_states.transpose(2, 3))
-        att_weights *= head_dim**-0.5
-
-        att_weights = att_weights.to(dtype=torch.float32)
-        big_neg = torch.finfo(att_weights.dtype).min  # -2.3819763e38  # See gemma/modules.py
-        masked_att_weights = torch.where(attention_mask[:, None, :, :], att_weights, big_neg)
-        probs = nn.functional.softmax(masked_att_weights, dim=-1)
-        probs = probs.to(dtype=value_states.dtype)
-
-        # probs: batch_size, num_key_value_head, num_att_head, sequence_length, sequence_length
-        # value_states: batch_size, sequence_length, num_att_heads, head_dim
-
-        att_output = torch.matmul(probs, value_states.permute(0, 2, 1, 3))
-
-        att_output = att_output.permute(0, 2, 1, 3)
-        # we use -1 because sequence length can change
-        att_output = att_output.reshape(batch_size, -1, num_key_value_heads * num_key_value_groups * head_dim)
-
-        return att_output
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py
index 9bb22d7f7..6bf956aa3 100644
--- a/src/lerobot/policies/smolvla/modeling_smolvla.py
+++ b/src/lerobot/policies/smolvla/modeling_smolvla.py
@@ -1,955 +1,3 @@
-# #!/usr/bin/env python
-
-# # Copyright 2025 HuggingFace Inc. team. All rights reserved.
-# #
-# # Licensed under the Apache License, Version 2.0 (the "License");
-# # you may not use this file except in compliance with the License.
-# # You may obtain a copy of the License at
-# #
-# #     http://www.apache.org/licenses/LICENSE-2.0
-# #
-# # Unless required by applicable law or agreed to in writing, software
-# # distributed under the License is distributed on an "AS IS" BASIS,
-# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# # See the License for the specific language governing permissions and
-# # limitations under the License.
-
-# """
-# SmolVLA:
-
-# [Paper](https://huggingface.co/papers/2506.01844)
-
-# Designed by Hugging Face.
-
-# Install smolvla extra dependencies:
-# ```bash
-# pip install -e ".[smolvla]"
-# ```
-
-# Example of finetuning the smolvla pretrained model (`smolvla_base`):
-# ```bash
-# lerobot-train \
-# --policy.path=lerobot/smolvla_base \
-# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \
-# --batch_size=64 \
-# --steps=200000
-# ```
-
-# Example of finetuning a smolVLA. SmolVLA is composed of a pretrained VLM,
-# and an action expert.
-# ```bash
-# lerobot-train \
-# --policy.type=smolvla \
-# --dataset.repo_id=danaaubakirova/svla_so100_task1_v3 \
-# --batch_size=64 \
-# --steps=200000
-# ```
-
-# Example of using the smolvla pretrained model outside LeRobot training framework:
-# ```python
-# policy = SmolVLAPolicy.from_pretrained("lerobot/smolvla_base")
-# ```
-
-# """
-
-# import math
-# import os
-# import re
-# from collections import deque
-
-# import safetensors
-# import torch
-# import torch.nn.functional as F  # noqa: N812
-# from torch import Tensor, nn
-# from transformers import AutoProcessor
-
-# from lerobot.constants import ACTION
-# from lerobot.policies.normalize import (
-#     Normalize,
-#     Unnormalize,
-# )
-# from lerobot.policies.pretrained import PreTrainedPolicy
-# from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
-# from lerobot.policies.smolvla.smolvlm_with_expert import SmolVLMWithExpertModel
-# from lerobot.policies.utils import (
-#     populate_queues,
-# )
-# from lerobot.utils.utils import get_safe_dtype
-# OBS_STATE = 'state'
-# ACTION = 'actions'
-# # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
-# _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
-
-
-# def canonicalise(k: str) -> str:
-#     """
-#     Remove dataset-variant markers like '.so100-blue_' or '.so100_' from a
-#     normalisation-buffer key.
-#     """
-#     return _VARIANT_RE.sub(".buffer_", k)
-
-
-# def standardise_state_dict(
-#     checkpoint: dict[str, torch.Tensor], ref_keys: set[str], *, verbose: bool = True
-# ) -> tuple[dict[str, torch.Tensor], list[str]]:
-#     """
-#     • Re-keys `checkpoint ` so that every entry matches the *reference* key set.
-#     • If several variant keys collapse to the same canonical name we keep the
-#       first one and log the collision.
-#     • Returns the new dict + a list of entries that could not be matched.
-#     """
-#     out, collisions, unmatched = {}, {}, []
-
-#     for k, v in checkpoint.items():
-#         canon = canonicalise(k)
-#         if canon in ref_keys:
-#             if canon in out:  # duplicate after collapsing
-#                 collisions.setdefault(canon, []).append(k)
-#             else:
-#                 out[canon] = v
-#         else:
-#             unmatched.append(k)
-
-#     if verbose:
-#         for canon, variants in collisions.items():
-#             print(f"[standardise_state_dict] '{canon}'  ←  {variants}")
-#         if unmatched:
-#             print(f"[standardise_state_dict] kept {len(unmatched)} unmatched keys")
-
-#     out.update({k: checkpoint[k] for k in unmatched})
-#     return out, unmatched
-
-
-# def rename_checkpoint_keys(checkpoint: dict, rename_str: str):
-#     """
-#     Renames keys in a checkpoint dictionary based on the given rename string.
-
-#     Args:
-#         checkpoint (dict): The checkpoint dictionary.
-#         rename_str (str): A string specifying key mappings in the format "old1//new1,old2//new2".
-
-#     Returns:
-#         dict: The modified checkpoint with renamed keys.
-#     """
-
-#     rename_dict = dict(pair.split("//") for pair in rename_str.split(","))
-
-#     new_checkpoint = {}
-#     for k, v in checkpoint.items():
-#         for old_key, new_key in rename_dict.items():
-#             if old_key in k:
-#                 k = k.replace(old_key, new_key)
-#         new_checkpoint[k] = v
-#     return new_checkpoint
-
-
-# def load_smolvla(
-#     model: torch.nn.Module,
-#     filename: str | os.PathLike,
-#     *,
-#     device: str = "cpu",
-#     checkpoint_keys_mapping: str = "",
-# ) -> torch.nn.Module:
-#     state_dict = safetensors.torch.load_file(filename, device=device)
-
-#     # Optional user-supplied renames (e.g. "model._orig_mod.//model.")
-#     if checkpoint_keys_mapping and "//" in checkpoint_keys_mapping:
-#         state_dict = rename_checkpoint_keys(state_dict, checkpoint_keys_mapping)
-
-#     state_dict, _ = standardise_state_dict(state_dict, set(model.state_dict().keys()))
-
-#     # HACK(aliberts): to not overwrite normalization parameters as they should come from the dataset
-#     norm_keys = ("normalize_inputs", "normalize_targets", "unnormalize_outputs")
-#     state_dict = {k: v for k, v in state_dict.items() if not k.startswith(norm_keys)}
-
-#     missing, unexpected = model.load_state_dict(state_dict, strict=False)
-
-#     if not all(key.startswith(norm_keys) for key in missing) or unexpected:
-#         raise RuntimeError(
-#             "SmolVLA %d missing / %d unexpected keys",
-#             len(missing),
-#             len(unexpected),
-#         )
-
-#     return model
-
-
-# def create_sinusoidal_pos_embedding(
-#     time: torch.tensor, dimension: int, min_period: float, max_period: float, device="cpu"
-# ) -> Tensor:
-#     """Computes sine-cosine positional embedding vectors for scalar positions."""
-#     if dimension % 2 != 0:
-#         raise ValueError(f"dimension ({dimension}) must be divisible by 2")
-
-#     if time.ndim != 1:
-#         raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
-
-#     dtype = get_safe_dtype(torch.float64, device.type)
-#     fraction = torch.linspace(0.0, 1.0, dimension // 2, dtype=dtype, device=device)
-#     period = min_period * (max_period / min_period) ** fraction
-
-#     # Compute the outer product
-#     scaling_factor = 1.0 / period * 2 * math.pi
-#     sin_input = scaling_factor[None, :] * time[:, None]
-#     pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
-#     return pos_emb
-
-
-# def make_att_2d_masks(pad_masks, att_masks):
-#     """Copied from big_vision.
-
-#     Tokens can attend to valid inputs tokens which have a cumulative mask_ar
-#     smaller or equal to theirs. This way `mask_ar` int[B, N] can be used to
-#     setup several types of attention, for example:
-
-#       [[1 1 1 1 1 1]]: pure causal attention.
-
-#       [[0 0 0 1 1 1]]: prefix-lm attention. The first 3 tokens can attend between
-#           themselves and the last 3 tokens have a causal attention. The first
-#           entry could also be a 1 without changing behaviour.
-
-#       [[1 0 1 0 1 0 0 1 0 0]]: causal attention between 4 blocks. Tokens of a
-#           block can attend all previous blocks and all tokens on the same block.
-
-#     Args:
-#       input_mask: bool[B, N] true if its part of the input, false if padding.
-#       mask_ar: int32[B, N] mask that's 1 where previous tokens cannot depend on
-#         it and 0 where it shares the same attention mask as the previous token.
-#     """
-#     if att_masks.ndim != 2:
-#         raise ValueError(att_masks.ndim)
-#     if pad_masks.ndim != 2:
-#         raise ValueError(pad_masks.ndim)
-
-#     cumsum = torch.cumsum(att_masks, dim=1)
-#     att_2d_masks = cumsum[:, None, :] <= cumsum[:, :, None]
-#     pad_2d_masks = pad_masks[:, None, :] * pad_masks[:, :, None]
-#     att_2d_masks = att_2d_masks & pad_2d_masks
-#     return att_2d_masks
-
-
-# def resize_with_pad(img, width, height, pad_value=-1):
-#     # assume no-op when width height fits already
-#     if img.ndim != 4:
-#         raise ValueError(f"(b,c,h,w) expected, but {img.shape}")
-
-#     cur_height, cur_width = img.shape[2:]
-
-#     ratio = max(cur_width / width, cur_height / height)
-#     resized_height = int(cur_height / ratio)
-#     resized_width = int(cur_width / ratio)
-#     resized_img = F.interpolate(
-#         img, size=(resized_height, resized_width), mode="bilinear", align_corners=False
-#     )
-
-#     pad_height = max(0, int(height - resized_height))
-#     pad_width = max(0, int(width - resized_width))
-
-#     # pad on left and top of image
-#     padded_img = F.pad(resized_img, (pad_width, 0, pad_height, 0), value=pad_value)
-#     return padded_img
-
-
-# def pad_vector(vector, new_dim):
-#     """Can be (batch_size x sequence_length x features_dimension)
-#     or (batch_size x features_dimension)
-#     """
-#     if vector.shape[-1] == new_dim:
-#         return vector
-#     shape = list(vector.shape)
-#     current_dim = shape[-1]
-#     shape[-1] = new_dim
-#     new_vector = torch.zeros(*shape, dtype=vector.dtype, device=vector.device)
-#     new_vector[..., :current_dim] = vector
-#     return new_vector
-
-
-# def normalize(x, min_val, max_val):
-#     return (x - min_val) / (max_val - min_val)
-
-
-# def unnormalize(x, min_val, max_val):
-#     return x * (max_val - min_val) + min_val
-
-
-# def safe_arcsin(value):
-#     # This ensures that the input stays within
-#     # [−1,1] to avoid invalid values for arcsin
-#     return torch.arcsin(torch.clamp(value, -1.0, 1.0))
-
-
-# def aloha_gripper_to_angular(value):
-#     # Aloha transforms the gripper positions into a linear space. The following code
-#     # reverses this transformation to be consistent with smolvla which is pretrained in
-#     # angular space.
-#     #
-#     # These values are coming from the Aloha code:
-#     # PUPPET_GRIPPER_POSITION_OPEN, PUPPET_GRIPPER_POSITION_CLOSED
-#     value = unnormalize(value, min_val=0.01844, max_val=0.05800)
-
-#     # This is the inverse of the angular to linear transformation inside the Interbotix code.
-#     def linear_to_radian(linear_position, arm_length, horn_radius):
-#         value = (horn_radius**2 + linear_position**2 - arm_length**2) / (2 * horn_radius * linear_position)
-#         return safe_arcsin(value)
-
-#     # The constants are taken from the Interbotix code.
-#     value = linear_to_radian(value, arm_length=0.036, horn_radius=0.022)
-
-#     # Normalize to [0, 1].
-#     # The values 0.4 and 1.5 were measured on an actual Trossen robot.
-#     return normalize(value, min_val=0.4, max_val=1.5)
-
-
-# def aloha_gripper_from_angular(value):
-#     # Convert from the gripper position used by smolvla to the gripper position that is used by Aloha.
-#     # Note that the units are still angular but the range is different.
-
-#     # The values 0.4 and 1.5 were measured on an actual Trossen robot.
-#     value = unnormalize(value, min_val=0.4, max_val=1.5)
-
-#     # These values are coming from the Aloha code:
-#     # PUPPET_GRIPPER_JOINT_OPEN, PUPPET_GRIPPER_JOINT_CLOSE
-#     return normalize(value, min_val=-0.6213, max_val=1.4910)
-
-
-# def aloha_gripper_from_angular_inv(value):
-#     # Directly inverts the gripper_from_angular function.
-#     value = unnormalize(value, min_val=-0.6213, max_val=1.4910)
-#     return normalize(value, min_val=0.4, max_val=1.5)
-
-
-# class SmolVLAPolicy(PreTrainedPolicy):
-#     """Wrapper class around VLAFlowMatching model to train and run inference within LeRobot."""
-
-#     config_class = SmolVLAConfig
-#     name = "smolvla"
-
-#     def __init__(
-#         self,
-#         config: SmolVLAConfig,
-#         dataset_stats: dict[str, dict[str, Tensor]] | None = None,
-#     ):
-#         """
-#         Args:
-#             config: Policy configuration class instance or None, in which case the default instantiation of
-#                     the configuration class is used.
-#             dataset_stats: Dataset statistics to be used for normalization. If not passed here, it is expected
-#                 that they will be passed with a call to `load_state_dict` before the policy is used.
-#         """
-
-#         super().__init__(config)
-#         config.validate_features()
-#         self.config = config
-#         self.normalize_inputs = Normalize(config.input_features, config.normalization_mapping, dataset_stats)
-#         self.normalize_targets = Normalize(
-#             config.output_features, config.normalization_mapping, dataset_stats
-#         )
-#         self.unnormalize_outputs = Unnormalize(
-#             config.output_features, config.normalization_mapping, dataset_stats
-#         )
-
-#         self.language_tokenizer = AutoProcessor.from_pretrained(self.config.vlm_model_name).tokenizer
-#         self.model = VLAFlowMatching(config)
-#         self.reset()
-
-#     def reset(self):
-#         """This should be called whenever the environment is reset."""
-#         self._queues = {
-#             ACTION: deque(maxlen=self.config.n_action_steps),
-#         }
-
-#     # HACK(aliberts, danaaubakirova): we overwrite this classmethod here to fix smolVLA-specific issues
-#     @classmethod
-#     def _load_as_safetensor(
-#         cls,
-#         model: "SmolVLAPolicy",
-#         model_file: str,
-#         map_location: str,
-#         strict: bool,
-#     ):
-#         safetensors.torch.load_model(model, model_file, strict=strict, device=map_location)
-#         return load_smolvla(
-#             model,
-#             model_file,
-#             device=map_location,
-#             checkpoint_keys_mapping="model._orig_mod.//model.",
-#         )
-
-#     def get_optim_params(self) -> dict:
-#         return self.parameters()
-
-#     def _get_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
-#         # TODO: Check if this for loop is needed.
-#         # Context: In fact, self.queues contains only ACTION field, and in inference, we don't have action in the batch
-#         # In the case of offline inference, we have the action in the batch
-#         # that why without the k != ACTION check, it will raise an error because we are trying to stack
-#         # on an empty container.
-#         for k in batch:
-#             if k in self._queues and k != ACTION:
-#                 batch[k] = torch.stack(list(self._queues[k]), dim=1)
-
-#         images, img_masks = self.prepare_images(batch)
-#         state = self.prepare_state(batch)
-#         lang_tokens, lang_masks = self.prepare_language(batch)
-
-#         actions = self.model.sample_actions(images, img_masks, lang_tokens, lang_masks, state, noise=noise)
-
-#         # Unpad actions
-#         original_action_dim = self.config.action_feature.shape[0]
-#         actions = actions[:, :, :original_action_dim]
-
-#         actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
-
-#         if self.config.adapt_to_pi_aloha:
-#             actions = self._pi_aloha_encode_actions(actions)
-
-#         return actions
-
-#     def _prepare_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
-#         if self.config.adapt_to_pi_aloha:
-#             batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
-
-#         batch = self.normalize_inputs(batch)
-
-#         return batch
-
-#     @torch.no_grad()
-#     def predict_action_chunk(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
-#         self.eval()
-
-#         batch = self._prepare_batch(batch)
-#         self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION])
-
-#         actions = self._get_action_chunk(batch, noise)
-#         return actions
-
-#     @torch.no_grad()
-#     def select_action(self, batch: dict[str, Tensor], noise: Tensor | None = None) -> Tensor:
-#         """Select a single action given environment observations.
-
-#         This method wraps `select_actions` in order to return one action at a time for execution in the
-#         environment. It works by managing the actions in a queue and only calling `select_actions` when the
-#         queue is empty.
-#         """
-#         self.eval()
-#         batch = self._prepare_batch(batch)
-#         self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION])
-
-#         # Action queue logic for n_action_steps > 1. When the action_queue is depleted, populate it by
-#         # querying the policy.
-#         if len(self._queues[ACTION]) == 0:
-#             actions = self._get_action_chunk(batch, noise)
-
-#             # `self.predict_action_chunk` returns a (batch_size, n_action_steps, action_dim) tensor, but the queue
-#             # effectively has shape (n_action_steps, batch_size, *), hence the transpose.
-#             self._queues[ACTION].extend(actions.transpose(0, 1)[: self.config.n_action_steps])
-
-#         return self._queues[ACTION].popleft()
-
-#     def forward(self, batch: dict[str, Tensor], noise=None, time=None) -> dict[str, Tensor]:
-#         """Do a full training forward pass to compute the loss"""
-#         if self.config.adapt_to_pi_aloha:
-#             batch[OBS_STATE] = self._pi_aloha_decode_state(batch[OBS_STATE])
-#             batch[ACTION] = self._pi_aloha_encode_actions_inv(batch[ACTION])
-#         batch = self.normalize_inputs(batch)
-#         batch = self.normalize_targets(batch)
-#         images, img_masks = self.prepare_images(batch)
-#         state = self.prepare_state(batch)
-#         lang_tokens, lang_masks = self.prepare_language(batch)
-#         actions = self.prepare_action(batch)
-#         actions_is_pad = batch.get("actions_id_pad")
-#         loss_dict = {}
-#         losses = self.model.forward(images, img_masks, lang_tokens, lang_masks, state, actions, noise, time)
-#         loss_dict["losses_after_forward"] = losses.clone()
-
-#         if actions_is_pad is not None:
-#             in_episode_bound = ~actions_is_pad
-#             losses = losses * in_episode_bound.unsqueeze(-1)
-#             loss_dict["losses_after_in_ep_bound"] = losses.clone()
-
-#         # Remove padding
-#         losses = losses[:, :, : self.config.max_action_dim]
-#         loss_dict["losses_after_rm_padding"] = losses.clone()
-
-#         # For backward pass
-#         loss = losses.mean()
-#         # For backward pass
-#         loss_dict["loss"] = loss.item()
-#         return loss, loss_dict
-
-#     def prepare_images(self, batch):
-#         """Apply SmolVLA preprocessing to the images, like resizing to 224x224 and padding to keep aspect ratio, and
-#         convert pixel range from [0.0, 1.0] to [-1.0, 1.0] as requested by SigLIP.
-#         """
-#         images = []
-#         img_masks = []
-#         present_img_keys = [key for key in self.config.image_features if key in batch]
-#         missing_img_keys = [key for key in self.config.image_features if key not in batch]
-
-#         if len(present_img_keys) == 0:
-#             raise ValueError(
-#                 f"All image features are missing from the batch. At least one expected. (batch: {batch.keys()}) (image_features:{self.config.image_features})"
-#             )
-#         # Preprocess image features present in the batch
-#         for key in present_img_keys:
-#             img = batch[key][:, -1, :, :, :] if batch[key].ndim == 5 else batch[key]
-#             if self.config.resize_imgs_with_padding is not None:
-#                 img = resize_with_pad(img, *self.config.resize_imgs_with_padding, pad_value=0)
-
-#             # Normalize from range [0,1] to [-1,1] as expacted by siglip
-#             img = img * 2.0 - 1.0
-
-#             bsize = img.shape[0]
-#             device = img.device
-#             if f"{key}_padding_mask" in batch:
-#                 mask = batch[f"{key}_padding_mask"].bool()
-#             else:
-#                 mask = torch.ones(bsize, dtype=torch.bool, device=device)
-#             images.append(img)
-#             img_masks.append(mask)
-
-#         # Create image features not present in the batch
-#         # as fully 0 padded images.
-#         for num_empty_cameras in range(len(missing_img_keys)):
-#             if num_empty_cameras >= self.config.empty_cameras:
-#                 break
-#             img = torch.ones_like(img) * -1
-#             mask = torch.zeros_like(mask)
-#             images.append(img)
-#             img_masks.append(mask)
-#         return images, img_masks
-
-#     def prepare_language(self, batch) -> tuple[Tensor, Tensor]:
-#         """Tokenize the text input"""
-#         device = batch[OBS_STATE].device
-#         tasks = batch["task"]
-#         if isinstance(tasks, str):
-#             tasks = [tasks]
-
-#         if len(tasks) == 1:
-#             tasks = [tasks[0] for _ in range(batch[OBS_STATE].shape[0])]
-
-#         tasks = [task if task.endswith("\n") else f"{task}\n" for task in tasks]
-
-#         tokenized_prompt = self.language_tokenizer.__call__(
-#             tasks,
-#             padding=self.config.pad_language_to,
-#             padding_side="right",
-#             max_length=self.config.tokenizer_max_length,
-#             return_tensors="pt",
-#         )
-#         lang_tokens = tokenized_prompt["input_ids"].to(device=device)
-#         lang_masks = tokenized_prompt["attention_mask"].to(device=device, dtype=torch.bool)
-
-#         return lang_tokens, lang_masks
-
-#     def _pi_aloha_decode_state(self, state):
-#         # Flip the joints.
-#         for motor_idx in [1, 2, 8, 9]:
-#             state[:, motor_idx] *= -1
-#         # Reverse the gripper transformation that is being applied by the Aloha runtime.
-#         for motor_idx in [6, 13]:
-#             state[:, motor_idx] = aloha_gripper_to_angular(state[:, motor_idx])
-#         return state
-
-#     def _pi_aloha_encode_actions(self, actions):
-#         # Flip the joints.
-#         for motor_idx in [1, 2, 8, 9]:
-#             actions[:, :, motor_idx] *= -1
-#         # Reverse the gripper transformation that is being applied by the Aloha runtime.
-#         for motor_idx in [6, 13]:
-#             actions[:, :, motor_idx] = aloha_gripper_from_angular(actions[:, :, motor_idx])
-#         return actions
-
-#     def _pi_aloha_encode_actions_inv(self, actions):
-#         # Flip the joints again.
-#         for motor_idx in [1, 2, 8, 9]:
-#             actions[:, :, motor_idx] *= -1
-#         # Reverse the gripper transformation that is being applied by the Aloha runtime.
-#         for motor_idx in [6, 13]:
-#             actions[:, :, motor_idx] = aloha_gripper_from_angular_inv(actions[:, :, motor_idx])
-#         return actions
-
-#     def prepare_state(self, batch):
-#         """Pad state"""
-#         state = batch[OBS_STATE][:, -1, :] if batch[OBS_STATE].ndim > 2 else batch[OBS_STATE]
-#         state = pad_vector(state, self.config.max_state_dim)
-#         return state
-
-#     def prepare_action(self, batch):
-#         """Pad action"""
-#         actions = pad_vector(batch[ACTION], self.config.max_action_dim)
-#         return actions
-
-
-# def pad_tensor(tensor, max_len, pad_value=0):
-#     """
-#     Efficiently pads a tensor along sequence dimension to match max_len.
-
-#     Args:
-#         tensor (torch.Tensor): Shape (B, L, ...) or (B, L).
-#         max_len (int): Fixed sequence length.
-#         pad_value (int/float): Value for padding.
-
-#     Returns:
-#         torch.Tensor: Shape (B, max_len, ...) or (B, max_len).
-#     """
-#     b, d = tensor.shape[:2]
-
-#     # Create a padded tensor of max_len and copy the existing values
-#     padded_tensor = torch.full(
-#         (b, max_len, *tensor.shape[2:]), pad_value, dtype=tensor.dtype, device=tensor.device
-#     )
-#     padded_tensor[:, :d] = tensor  # Efficient in-place copy
-
-#     return padded_tensor
-
-
-# class VLAFlowMatching(nn.Module):
-#     """
-#     SmolVLA
-
-#     [Paper]()
-
-#     Designed by Hugging Face.
-#     ┌──────────────────────────────┐
-#     │                 actions      │
-#     │                    ▲         │
-#     │ ┌─────────┐      ┌─|────┐    │
-#     │ |         │────► │      │    │
-#     │ |         │ kv   │      │    │
-#     │ |         │────► │Action│    │
-#     │ |   VLM   │cache │Expert│    |
-#     │ │         │────► |      │    │
-#     │ │         │      │      │    │
-#     │ └▲──▲───▲─┘      └───▲──┘    |
-#     │  │  |   |            │       |
-#     │  |  |   |          noise     │
-#     │  │  │ state                  │
-#     │  │ language tokens           │
-#     │  image(s)                    │
-#     └──────────────────────────────┘
-#     """
-
-#     def __init__(self, config: SmolVLAConfig):
-#         super().__init__()
-#         self.config = config
-
-#         self.vlm_with_expert = SmolVLMWithExpertModel(
-#             model_id=self.config.vlm_model_name,
-#             freeze_vision_encoder=self.config.freeze_vision_encoder,
-#             train_expert_only=self.config.train_expert_only,
-#             load_vlm_weights=self.config.load_vlm_weights,
-#             attention_mode=self.config.attention_mode,
-#             num_expert_layers=self.config.num_expert_layers,
-#             num_vlm_layers=self.config.num_vlm_layers,
-#             self_attn_every_n_layers=self.config.self_attn_every_n_layers,
-#             expert_width_multiplier=self.config.expert_width_multiplier,
-#         )
-#         self.state_proj = nn.Linear(
-#             self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size
-#         )
-#         self.action_in_proj = nn.Linear(self.config.max_action_dim, self.vlm_with_expert.expert_hidden_size)
-#         self.action_out_proj = nn.Linear(self.vlm_with_expert.expert_hidden_size, self.config.max_action_dim)
-
-#         self.action_time_mlp_in = nn.Linear(
-#             self.vlm_with_expert.expert_hidden_size * 2, self.vlm_with_expert.expert_hidden_size
-#         )
-#         self.action_time_mlp_out = nn.Linear(
-#             self.vlm_with_expert.expert_hidden_size, self.vlm_with_expert.expert_hidden_size
-#         )
-
-#         self.set_requires_grad()
-#         self.fake_image_token = self.vlm_with_expert.processor.tokenizer.fake_image_token_id
-#         self.global_image_token = self.vlm_with_expert.processor.tokenizer.global_image_token_id
-#         self.global_image_start_token = torch.tensor(
-#             [self.fake_image_token, self.global_image_token], dtype=torch.long
-#         )
-
-#         self.add_image_special_tokens = self.config.add_image_special_tokens
-#         self.image_end_token = torch.tensor([self.fake_image_token], dtype=torch.long)
-#         self.prefix_length = self.config.prefix_length
-
-#     def set_requires_grad(self):
-#         for params in self.state_proj.parameters():
-#             params.requires_grad = self.config.train_state_proj
-
-#     def sample_noise(self, shape, device):
-#         noise = torch.normal(
-#             mean=0.0,
-#             std=1.0,
-#             size=shape,
-#             dtype=torch.float32,
-#             device=device,
-#         )
-#         return noise
-
-#     def sample_time(self, bsize, device):
-#         beta_dist = torch.distributions.Beta(concentration1=1.5, concentration0=1.0)
-#         time_beta = beta_dist.sample((bsize,)).to(device=device, dtype=torch.float32)
-#         time = time_beta * 0.999 + 0.001
-#         return time
-
-#     def embed_prefix(
-#         self, images, img_masks, lang_tokens, lang_masks, state: torch.Tensor = None
-#     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-#         """Embed images with SigLIP and language tokens with embedding layer to prepare
-#         for SmolVLM transformer processing.
-#         """
-#         embs = []
-#         pad_masks = []
-#         att_masks = []
-#         for _img_idx, (
-#             img,
-#             img_mask,
-#         ) in enumerate(zip(images, img_masks, strict=False)):
-#             if self.add_image_special_tokens:
-#                 image_start_token = (
-#                     self.vlm_with_expert.embed_language_tokens(
-#                         self.global_image_start_token.to(device=self.vlm_with_expert.vlm.device)
-#                     )
-#                     .unsqueeze(0)
-#                     .expand(img.shape[0], -1, -1)
-#                 )
-#                 image_start_mask = torch.ones_like(
-#                     image_start_token[:, :, 0], dtype=torch.bool, device=image_start_token.device
-#                 )
-#                 att_masks += [0] * (image_start_mask.shape[-1])
-#                 embs.append(image_start_token)
-#                 pad_masks.append(image_start_mask)
-
-#             img_emb = self.vlm_with_expert.embed_image(img)
-#             img_emb = img_emb
-
-#             # Normalize image embeddings
-#             img_emb_dim = img_emb.shape[-1]
-#             img_emb = img_emb * torch.tensor(img_emb_dim**0.5, dtype=img_emb.dtype, device=img_emb.device)
-
-#             bsize, num_img_embs = img_emb.shape[:2]
-#             img_mask = img_mask[:, None].expand(bsize, num_img_embs)
-
-#             embs.append(img_emb)
-#             pad_masks.append(img_mask)
-
-#             att_masks += [0] * (num_img_embs)
-#             if self.add_image_special_tokens:
-#                 image_end_token = (
-#                     self.vlm_with_expert.embed_language_tokens(
-#                         self.image_end_token.to(device=self.vlm_with_expert.vlm.device)
-#                     )
-#                     .unsqueeze(0)
-#                     .expand(img.shape[0], -1, -1)
-#                 )
-#                 image_end_mask = torch.ones_like(
-#                     image_end_token[:, :, 0], dtype=torch.bool, device=image_end_token.device
-#                 )
-#                 embs.append(image_end_token)
-#                 pad_masks.append(image_end_mask)
-#                 att_masks += [0] * (image_end_mask.shape[1])
-#         lang_emb = self.vlm_with_expert.embed_language_tokens(lang_tokens)
-#         # Normalize language embeddings
-#         lang_emb_dim = lang_emb.shape[-1]
-#         lang_emb = lang_emb * math.sqrt(lang_emb_dim)
-
-#         embs.append(lang_emb)
-#         pad_masks.append(lang_masks)
-
-#         num_lang_embs = lang_emb.shape[1]
-#         att_masks += [0] * num_lang_embs
-
-#         state_emb = self.state_proj(state)
-#         state_emb = state_emb[:, None, :] if state_emb.ndim == 2 else state_emb
-#         embs.append(state_emb)
-#         bsize = state_emb.shape[0]
-#         device = state_emb.device
-
-#         states_seq_len = state_emb.shape[1]
-#         state_mask = torch.ones(bsize, states_seq_len, dtype=torch.bool, device=device)
-#         pad_masks.append(state_mask)
-
-#         # Set attention masks so that image and language inputs do not attend to state or actions
-#         att_masks += [1] * (states_seq_len)
-#         embs = torch.cat(embs, dim=1)
-#         pad_masks = torch.cat(pad_masks, dim=1)
-#         att_masks = torch.tensor(att_masks, dtype=torch.bool, device=pad_masks.device)
-#         att_masks = att_masks[None, :]
-
-#         seq_len = pad_masks.shape[1]
-#         if seq_len < self.prefix_length:
-#             embs = pad_tensor(embs, self.prefix_length, pad_value=0)
-#             pad_masks = pad_tensor(pad_masks, self.prefix_length, pad_value=0)
-#             att_masks = pad_tensor(att_masks, self.prefix_length, pad_value=0)
-
-#         att_masks = att_masks.expand(bsize, -1)
-
-#         return embs, pad_masks, att_masks
-
-#     def embed_suffix(self, noisy_actions, timestep):
-#         """Embed state, noisy_actions, timestep to prepare for Expert Gemma processing."""
-#         embs = []
-#         pad_masks = []
-#         att_masks = []
-
-#         # Fuse timestep + action information using an MLP
-#         action_emb = self.action_in_proj(noisy_actions)
-#         device = action_emb.device
-#         bsize = action_emb.shape[0]
-#         dtype = action_emb.dtype
-#         # Embed timestep using sine-cosine positional encoding with sensitivity in the range [0, 1]
-#         time_emb = create_sinusoidal_pos_embedding(
-#             timestep,
-#             self.vlm_with_expert.expert_hidden_size,
-#             self.config.min_period,
-#             self.config.max_period,
-#             device=device,
-#         )
-#         time_emb = time_emb.type(dtype=dtype)
-
-#         time_emb = time_emb[:, None, :].expand_as(action_emb)
-#         action_time_emb = torch.cat([action_emb, time_emb], dim=2)
-
-#         action_time_emb = self.action_time_mlp_in(action_time_emb)
-#         action_time_emb = F.silu(action_time_emb)  # swish == silu
-#         action_time_emb = self.action_time_mlp_out(action_time_emb)
-
-#         # Add to input tokens
-#         embs.append(action_time_emb)
-
-#         bsize, action_time_dim = action_time_emb.shape[:2]
-#         action_time_mask = torch.ones(bsize, action_time_dim, dtype=torch.bool, device=device)
-#         pad_masks.append(action_time_mask)
-
-#         # Set attention masks so that image, language and state inputs do not attend to action tokens
-#         att_masks += [1] * self.config.chunk_size
-#         embs = torch.cat(embs, dim=1)
-#         pad_masks = torch.cat(pad_masks, dim=1)
-#         att_masks = torch.tensor(att_masks, dtype=embs.dtype, device=embs.device)
-#         att_masks = att_masks[None, :].expand(bsize, len(att_masks))
-#         # added by jade
-#         seq_len = pad_masks.shape[1]
-#         if seq_len < self.config.chunk_size:
-#             embs = pad_tensor(embs, self.config.chunk_size, pad_value=0)
-#             pad_masks = pad_tensor(pad_masks, self.config.chunk_size, pad_value=0)
-#             att_masks = pad_tensor(att_masks, self.config.chunk_size, pad_value=0)
-#         return embs, pad_masks, att_masks
-
-#     def forward(
-#         self, images, img_masks, lang_tokens, lang_masks, state, actions, noise=None, time=None
-#     ) -> Tensor:
-#         """Do a full training forward pass and compute the loss (batch_size x num_steps x num_motors)"""
-#         #added by jade
-#         if actions.ndim == 2:
-#             actions = actions[:, None, :].expand(-1, self.config.chunk_size, -1)
-#         if noise is None:
-#             noise = self.sample_noise(actions.shape, actions.device)
-
-#         if time is None:
-#             time = self.sample_time(actions.shape[0], actions.device)
-
-#         time_expanded = time[:, None, None]
-#         x_t = time_expanded * noise + (1 - time_expanded) * actions
-#         u_t = noise - actions
-#         prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
-#             images, img_masks, lang_tokens, lang_masks, state=state
-#         )
-#         suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, time)
-
-#         pad_masks = torch.cat([prefix_pad_masks, suffix_pad_masks], dim=1)
-#         att_masks = torch.cat([prefix_att_masks, suffix_att_masks], dim=1)
-
-#         att_2d_masks = make_att_2d_masks(pad_masks, att_masks)
-#         position_ids = torch.cumsum(pad_masks, dim=1) - 1
-#         (_, suffix_out), _ = self.vlm_with_expert.forward(
-#             attention_mask=att_2d_masks,
-#             position_ids=position_ids,
-#             past_key_values=None,
-#             inputs_embeds=[prefix_embs, suffix_embs],
-#             use_cache=False,
-#             fill_kv_cache=False,
-#         )
-#         # suffix_out = suffix_out[:, -self.config.chunk_size :]
-#         suffix_out = suffix_out[:, -self.config.chunk_size:, :]
-#         # Original openpi code, upcast attention output
-#         suffix_out = suffix_out.to(dtype=torch.float32)
-#         v_t = self.action_out_proj(suffix_out)
-#         losses = F.mse_loss(u_t, v_t, reduction="none")
-#         return losses
-
-#     def sample_actions(self, images, img_masks, lang_tokens, lang_masks, state, noise=None) -> Tensor:
-#         """Do a full inference forward and compute the action (batch_size x num_steps x num_motors)"""
-#         bsize = state.shape[0]
-#         device = state.device
-
-#         if noise is None:
-#             actions_shape = (bsize, self.config.chunk_size, self.config.max_action_dim)
-#             noise = self.sample_noise(actions_shape, device)
-
-#         prefix_embs, prefix_pad_masks, prefix_att_masks = self.embed_prefix(
-#             images, img_masks, lang_tokens, lang_masks, state=state
-#         )
-#         prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
-#         prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
-#         # Compute image and language key value cache
-#         _, past_key_values = self.vlm_with_expert.forward(
-#             attention_mask=prefix_att_2d_masks,
-#             position_ids=prefix_position_ids,
-#             past_key_values=None,
-#             inputs_embeds=[prefix_embs, None],
-#             use_cache=self.config.use_cache,
-#             fill_kv_cache=True,
-#         )
-#         dt = -1.0 / self.config.num_steps
-#         dt = torch.tensor(dt, dtype=torch.float32, device=device)
-
-#         x_t = noise
-#         time = torch.tensor(1.0, dtype=torch.float32, device=device)
-#         while time >= -dt / 2:
-#             expanded_time = time.expand(bsize)
-#             v_t = self.denoise_step(
-#                 prefix_pad_masks,
-#                 past_key_values,
-#                 x_t,
-#                 expanded_time,
-#             )
-#             # Euler step
-#             x_t += dt * v_t
-#             time += dt
-#         return x_t
-
-#     def denoise_step(
-#         self,
-#         prefix_pad_masks,
-#         past_key_values,
-#         x_t,
-#         timestep,
-#     ):
-#         """Apply one denoising step of the noise `x_t` at a given timestep."""
-#         suffix_embs, suffix_pad_masks, suffix_att_masks = self.embed_suffix(x_t, timestep)
-
-#         suffix_len = suffix_pad_masks.shape[1]
-#         batch_size = prefix_pad_masks.shape[0]
-#         prefix_len = prefix_pad_masks.shape[1]
-#         prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(batch_size, suffix_len, prefix_len)
-
-#         suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
-
-#         full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
-#         prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
-#         position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
-
-#         outputs_embeds, _ = self.vlm_with_expert.forward(
-#             attention_mask=full_att_2d_masks,
-#             position_ids=position_ids,
-#             past_key_values=past_key_values,
-#             inputs_embeds=[None, suffix_embs],
-#             use_cache=self.config.use_cache,
-#             fill_kv_cache=False,
-#         )
-#         suffix_out = outputs_embeds[1]
-#         suffix_out = suffix_out[:, -self.config.chunk_size :]
-#         suffix_out = suffix_out.to(dtype=torch.float32)
-#         v_t = self.action_out_proj(suffix_out)
-#         return v_t
 #!/usr/bin/env python
 
 # Copyright 2025 HuggingFace Inc. team. All rights reserved.
@@ -1028,7 +76,6 @@ from lerobot.policies.utils import (
 )
 from lerobot.utils.utils import get_safe_dtype
 
-# OBS_STATE = 'state'
 # Matches ".soNNN", optionally followed by "-something", up to the "_buffer_" marker
 _VARIANT_RE = re.compile(r"\.so\d+(?:-[\w]+)?_buffer_")
 
@@ -1348,7 +395,6 @@ class SmolVLAPolicy(PreTrainedPolicy):
 
         # Unpad actions
         original_action_dim = self.config.action_feature.shape[0]
-        original_action_dim = 7
         actions = actions[:, :, :original_action_dim]
 
         actions = self.unnormalize_outputs({ACTION: actions})[ACTION]
@@ -1892,4 +938,4 @@ class VLAFlowMatching(nn.Module):
         suffix_out = suffix_out[:, -self.config.chunk_size :]
         suffix_out = suffix_out.to(dtype=torch.float32)
         v_t = self.action_out_proj(suffix_out)
-        return v_t
+        return v_t
\ No newline at end of file
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla_v2.py b/src/lerobot/policies/smolvla/modeling_smolvla_v2.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lerobot/policies/smolvla/saver.txt b/src/lerobot/policies/smolvla/saver.txt
deleted file mode 100644
index f2ad6c76f..000000000
--- a/src/lerobot/policies/smolvla/saver.txt
+++ /dev/null
@@ -1 +0,0 @@
-c
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index b96fbb8a3..bd131f708 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -132,10 +132,6 @@ def rollout(
 
     # Reset the policy and environments.
     policy.reset()
-    # added by jade
-    # for k in list(policy.config.input_features.keys()):
-    #         if k.startswith("observation.image"):
-    #             policy.config.input_features["observation.images." + k.split("observation.", 1)[1]] = policy.config.input_features.pop(k)
     observation, info = env.reset(seed=seeds)
     if render_callback is not None:
         render_callback(env)
@@ -171,26 +167,6 @@ def rollout(
         # Infer "task" from attributes of environments.
         # TODO: works with SyncVectorEnv but not AsyncVectorEnv
         observation = add_envs_task(env, observation)
-        # breakpoint()
-        # observation = {
-        #     k.replace("observation.images.", "observation.") if k.startswith("observation.images.") else k: v
-        #     for k, v in observation.items()
-        # # }
-        # if "observation.image" in observation:
-        #     observation["image"] = observation.pop("observation.image").to(
-        #         device, non_blocking=device.type == "cuda"
-        #     )
-
-        # if "observation.image2" in observation:
-        #     observation["wrist_image"] = observation.pop("observation.image2").to(
-        #         device, non_blocking=device.type == "cuda"
-        #     )
-
-        # if "observation.state" in observation:
-        #     observation["state"] = observation.pop("observation.state").to(
-        #         device, non_blocking=device.type == "cuda"
-        #     )
-
         with torch.inference_mode():
             action = policy.select_action(observation)
         # Convert to CPU / numpy.
@@ -550,15 +526,12 @@ def eval_main(cfg: EvalPipelineConfig):
 
     logging.info("Making environment.")
     envs = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
-    breakpoint()
+
     logging.info("Making policy.")
     policy = make_policy(
         cfg=cfg.policy,
         env_cfg=cfg.env,
     )
-    breakpoint()
-    # policy, _ = load_smolvla(cfg.policy, "physical-intelligence/libero", policy)
-    # rename "image" -> "observation.image"
 
     policy.eval()
     with torch.no_grad(), torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext():
diff --git a/src/lerobot/scripts/train_2.py b/src/lerobot/scripts/train_2.py
deleted file mode 100644
index 5b82ef044..000000000
--- a/src/lerobot/scripts/train_2.py
+++ /dev/null
@@ -1,345 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import time
-from contextlib import nullcontext
-from pprint import pformat
-from typing import Any
-
-import torch
-from termcolor import colored
-from torch.amp import GradScaler
-from torch.optim import Optimizer
-
-from lerobot.configs import parser
-from lerobot.configs.train import TrainPipelineConfig
-from lerobot.datasets.factory import make_dataset
-from lerobot.datasets.sampler import EpisodeAwareSampler
-from lerobot.datasets.utils import cycle
-from lerobot.envs.factory import make_env
-from lerobot.optim.factory import make_optimizer_and_scheduler
-from lerobot.policies.factory import make_policy
-from lerobot.policies.pretrained import PreTrainedPolicy
-from lerobot.policies.utils import get_device_from_parameters
-from lerobot.scripts.eval import eval_policy, eval_policy_multitask
-from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
-from lerobot.utils.random_utils import set_seed
-from lerobot.utils.train_utils import (
-    get_step_checkpoint_dir,
-    get_step_identifier,
-    load_training_state,
-    save_checkpoint,
-    update_last_checkpoint,
-)
-from lerobot.utils.utils import (
-    format_big_number,
-    get_safe_torch_device,
-    has_method,
-    init_logging,
-)
-from lerobot.utils.wandb_utils import WandBLogger
-
-
-def update_policy(
-    train_metrics: MetricsTracker,
-    policy: PreTrainedPolicy,
-    batch: Any,
-    optimizer: Optimizer,
-    grad_clip_norm: float,
-    grad_scaler: GradScaler,
-    lr_scheduler=None,
-    use_amp: bool = False,
-    lock=None,
-) -> tuple[MetricsTracker, dict]:
-    start_time = time.perf_counter()
-    device = get_device_from_parameters(policy)
-    policy.train()
-    with torch.autocast(device_type=device.type) if use_amp else nullcontext():
-        loss, output_dict = policy.forward(batch)
-        # TODO(rcadene): policy.unnormalize_outputs(out_dict)
-    grad_scaler.scale(loss).backward()
-
-    # Unscale the gradient of the optimizer's assigned params in-place **prior to gradient clipping**.
-    grad_scaler.unscale_(optimizer)
-
-    grad_norm = torch.nn.utils.clip_grad_norm_(
-        policy.parameters(),
-        grad_clip_norm,
-        error_if_nonfinite=False,
-    )
-
-    # Optimizer's gradients are already unscaled, so scaler.step does not unscale them,
-    # although it still skips optimizer.step() if the gradients contain infs or NaNs.
-    with lock if lock is not None else nullcontext():
-        grad_scaler.step(optimizer)
-    # Updates the scale for next iteration.
-    grad_scaler.update()
-
-    optimizer.zero_grad()
-
-    # Step through pytorch scheduler at every batch instead of epoch
-    if lr_scheduler is not None:
-        lr_scheduler.step()
-
-    if has_method(policy, "update"):
-        # To possibly update an internal buffer (for instance an Exponential Moving Average like in TDMPC).
-        policy.update()
-
-    train_metrics.loss = loss.item()
-    train_metrics.grad_norm = grad_norm.item()
-    train_metrics.lr = optimizer.param_groups[0]["lr"]
-    train_metrics.update_s = time.perf_counter() - start_time
-    return train_metrics, output_dict
-
-
-def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
-    """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
-    from lerobot.policies.normalize import Normalize, Unnormalize
-
-    if not hasattr(dataset_meta, "stats") or not dataset_meta.stats:
-        print("⚠️ Dataset has no stats, skipping normalization injection.")
-        return
-
-    stats = {}
-    for key, stat_dict in dataset_meta.stats.items():
-        stats[key] = {
-            stat_type: torch.as_tensor(stat_array) if isinstance(stat_array, np.ndarray) else stat_array
-            for stat_type, stat_array in stat_dict.items()
-        }
-
-    normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
-    normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
-    unnormalize_outputs = Unnormalize(
-        policy.config.output_features, policy.config.normalization_mapping, stats
-    )
-
-    policy.normalize_inputs = normalize_inputs
-    policy.normalize_targets = normalize_targets
-    policy.unnormalize_outputs = unnormalize_outputs
-
-    print("✅ Normalization layers injected with dataset stats.")
-
-
-@parser.wrap()
-def train(cfg: TrainPipelineConfig):
-    cfg.validate()
-    logging.info(pformat(cfg.to_dict()))
-
-    if cfg.wandb.enable and cfg.wandb.project:
-        wandb_logger = WandBLogger(cfg)
-    else:
-        wandb_logger = None
-        logging.info(colored("Logs will be saved locally.", "yellow", attrs=["bold"]))
-
-    if cfg.seed is not None:
-        set_seed(cfg.seed)
-
-    # Check device is available
-    device = get_safe_torch_device(cfg.policy.device, log=True)
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cuda.matmul.allow_tf32 = True
-
-    logging.info("Creating dataset")
-    dataset = make_dataset(cfg)
-
-    # Create environment used for evaluating checkpoints during training on simulation data.
-    # On real-world data, no need to create an environment as evaluations are done outside train.py,
-    # using the eval.py instead, with gym_dora environment and dora-rs.
-    eval_env = None
-    if cfg.eval_freq > 0 and cfg.env is not None:
-        logging.info("Creating env")
-        eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
-
-    logging.info("Creating policy")
-    policy = make_policy(
-        cfg=cfg.policy,
-        ds_meta=dataset.meta,
-    )
-    logging.info("Creating optimizer and scheduler")
-    optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
-    grad_scaler = GradScaler(device.type, enabled=cfg.policy.use_amp)
-
-    step = 0  # number of policy updates (forward + backward + optim)
-
-    if cfg.resume:
-        step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler)
-
-    num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)
-    num_total_params = sum(p.numel() for p in policy.parameters())
-
-    logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
-    if cfg.env is not None:
-        logging.info(f"{cfg.env.task=}")
-    logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
-    logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
-    logging.info(f"{dataset.num_episodes=}")
-    logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
-    logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
-
-    # create dataloader for offline training
-    if hasattr(cfg.policy, "drop_n_last_frames"):
-        shuffle = False
-        sampler = EpisodeAwareSampler(
-            dataset.episode_data_index,
-            drop_n_last_frames=cfg.policy.drop_n_last_frames,
-            shuffle=True,
-        )
-    else:
-        shuffle = True
-        sampler = None
-
-    dataloader = torch.utils.data.DataLoader(
-        dataset,
-        num_workers=cfg.num_workers,
-        batch_size=cfg.batch_size,
-        shuffle=shuffle,
-        sampler=sampler,
-        pin_memory=device.type == "cuda",
-        drop_last=False,
-    )
-    dl_iter = cycle(dataloader)
-
-    policy.train()
-    train_metrics = {
-        "loss": AverageMeter("loss", ":.3f"),
-        "grad_norm": AverageMeter("grdn", ":.3f"),
-        "lr": AverageMeter("lr", ":0.1e"),
-        "update_s": AverageMeter("updt_s", ":.3f"),
-        "dataloading_s": AverageMeter("data_s", ":.3f"),
-    }
-
-    train_tracker = MetricsTracker(
-        cfg.batch_size, dataset.num_frames, dataset.num_episodes, train_metrics, initial_step=step
-    )
-
-    logging.info("Start offline training on a fixed dataset")
-    for _ in range(step, cfg.steps):
-        start_time = time.perf_counter()
-        batch = next(dl_iter)
-        train_tracker.dataloading_s = time.perf_counter() - start_time
-
-        for key in batch:
-            if isinstance(batch[key], torch.Tensor):
-                batch[key] = batch[key].to(device, non_blocking=device.type == "cuda")
-
-        train_tracker, output_dict = update_policy(
-            train_tracker,
-            policy,
-            batch,
-            optimizer,
-            cfg.optimizer.grad_clip_norm,
-            grad_scaler=grad_scaler,
-            lr_scheduler=lr_scheduler,
-            use_amp=cfg.policy.use_amp,
-        )
-
-        # Note: eval and checkpoint happens *after* the `step`th training update has completed, so we
-        # increment `step` here.
-        step += 1
-        train_tracker.step()
-        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0
-        is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
-        is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0
-
-        if is_log_step:
-            logging.info(train_tracker)
-            if wandb_logger:
-                wandb_log_dict = train_tracker.to_dict()
-                if output_dict:
-                    wandb_log_dict.update(output_dict)
-                wandb_logger.log_dict(wandb_log_dict, step)
-            train_tracker.reset_averages()
-
-        if cfg.save_checkpoint and is_saving_step:
-            logging.info(f"Checkpoint policy after step {step}")
-            checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step)
-            save_checkpoint(checkpoint_dir, step, cfg, policy, optimizer, lr_scheduler)
-            update_last_checkpoint(checkpoint_dir)
-            if wandb_logger:
-                wandb_logger.log_policy(checkpoint_dir)
-
-        if cfg.env and is_eval_step:
-            step_id = get_step_identifier(step, cfg.steps)
-            logging.info(f"Eval policy at step {step}")
-            with (
-                torch.no_grad(),
-                torch.autocast(device_type=device.type) if cfg.policy.use_amp else nullcontext(),
-            ):
-                if cfg.env.multitask_eval:
-                    eval_info = eval_policy_multitask(
-                        eval_env,
-                        policy,
-                        cfg.eval.n_episodes,
-                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
-                        max_episodes_rendered=4,
-                        start_seed=cfg.seed,
-                        max_parallel_tasks=cfg.env.max_parallel_tasks,
-                    )
-                    aggregated = eval_info["overall"]["aggregated"]
-                    # Print per-suite stats, log?
-                    for task_group, task_group_info in eval_info.items():
-                        if task_group == "overall":
-                            continue  # Skip the overall stats since we already printed it
-                        print(f"\nAggregated Metrics for {task_group}:")
-                        print(task_group_info["aggregated"])
-                else:
-                    eval_info = eval_policy(
-                        eval_env,
-                        policy,
-                        cfg.eval.n_episodes,
-                        videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
-                        max_episodes_rendered=4,
-                        start_seed=cfg.seed,
-                    )
-                    aggregated = eval_info["aggregated"]
-
-            eval_metrics = {
-                "avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),
-                "pc_success": AverageMeter("success", ":.1f"),
-                "eval_s": AverageMeter("eval_s", ":.3f"),
-            }
-            eval_tracker = MetricsTracker(
-                cfg.batch_size, dataset.num_frames, dataset.num_episodes, eval_metrics, initial_step=step
-            )
-            eval_tracker.eval_s = aggregated.pop("eval_s")
-            eval_tracker.avg_sum_reward = aggregated.pop("avg_sum_reward")
-            eval_tracker.pc_success = aggregated.pop("pc_success")
-            logging.info(eval_tracker)
-            if wandb_logger:
-                wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
-                wandb_logger.log_dict(wandb_log_dict, step, mode="eval")
-                wandb_logger.log_video(eval_info["video_paths"][0], step, mode="eval")
-
-    if eval_env:
-        if cfg.env.multitask_eval:
-            for _task_group, envs_dict in eval_env.items():
-                for _idx, env in envs_dict.items():
-                    env.close()
-        else:
-            eval_env.close()
-    logging.info("End of training")
-
-    if cfg.policy.push_to_hub:
-        policy.push_model_to_hub(cfg)
-
-
-def main():
-    init_logging()
-    train()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/lerobot/scripts/train_accelerate.py b/src/lerobot/scripts/train_accelerate.py
deleted file mode 100644
index 1e8a59a64..000000000
--- a/src/lerobot/scripts/train_accelerate.py
+++ /dev/null
@@ -1,366 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import time
-from pprint import pformat
-from typing import Any
-
-import torch
-from accelerate import Accelerator
-from accelerate.utils import set_seed as accelerate_set_seed
-from termcolor import colored
-from torch.optim import Optimizer
-
-from lerobot.configs import parser
-from lerobot.configs.train import TrainPipelineConfig
-from lerobot.datasets.factory import make_dataset
-from lerobot.datasets.sampler import EpisodeAwareSampler
-from lerobot.envs.factory import make_env
-from lerobot.optim.factory import make_optimizer_and_scheduler
-from lerobot.policies.factory import make_policy
-from lerobot.policies.pretrained import PreTrainedPolicy
-from lerobot.scripts.eval import eval_policy
-from lerobot.utils.logging_utils import AverageMeter, MetricsTracker
-from lerobot.utils.train_utils import (
-    get_step_checkpoint_dir,
-    get_step_identifier,
-    load_training_state,
-    save_checkpoint,
-    update_last_checkpoint,
-)
-from lerobot.utils.utils import (
-    format_big_number,
-    has_method,
-    init_logging,
-)
-
-
-def update_policy(
-    train_metrics: MetricsTracker,
-    policy: PreTrainedPolicy,
-    batch: Any,
-    optimizer: Optimizer,
-    grad_clip_norm: float,
-    accelerator: Accelerator,
-    lr_scheduler=None,
-) -> tuple[MetricsTracker, dict]:
-    start_time = time.perf_counter()
-    policy.train()
-
-    # Use accelerator's autocast context if mixed precision is enabled
-    with accelerator.autocast():
-        loss, output_dict = policy.forward(batch)
-        # TODO(rcadene): policy.unnormalize_outputs(out_dict)
-
-    # Use accelerator for backward pass
-    accelerator.backward(loss)
-
-    # Gradient clipping - accelerator handles unscaling automatically
-    if accelerator.sync_gradients and grad_clip_norm > 0:
-        grad_norm = accelerator.clip_grad_norm_(policy.parameters(), grad_clip_norm)
-    else:
-        grad_norm = torch.tensor(0.0)
-
-    optimizer.step()
-    lr_scheduler.step() if lr_scheduler is not None else None
-    optimizer.zero_grad()
-
-    # Update policy-specific buffers if needed
-    if has_method(policy, "update"):
-        policy.update()
-
-    # Gather metrics across all processes
-    loss_value = accelerator.gather(loss.detach()).mean().item()
-    grad_norm_value = accelerator.gather(grad_norm).mean().item()
-
-    train_metrics.loss = loss_value
-    train_metrics.grad_norm = grad_norm_value
-    train_metrics.lr = optimizer.param_groups[0]["lr"]
-    train_metrics.update_s = time.perf_counter() - start_time
-    return train_metrics, output_dict
-
-
-@parser.wrap()
-def train(cfg: TrainPipelineConfig):
-    cfg.validate()
-    logging.info(pformat(cfg.to_dict()))
-
-    # Initialize accelerator
-    from accelerate.utils import DistributedDataParallelKwargs
-
-    # added by jade 2 lines
-    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)
-    accelerator = Accelerator(..., kwargs_handlers=[ddp_kwargs])
-
-    from lerobot.utils.wandb_utils import cfg_to_group, get_wandb_run_id_from_filesystem
-
-    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
-    accelerator = Accelerator(
-        mixed_precision="bf16" if cfg.policy.use_amp else "no",
-        gradient_accumulation_steps=cfg.policy.gradient_accumulation_steps,
-        log_with="wandb" if cfg.wandb.enable else None,
-        kwargs_handlers=[ddp_kwargs],
-        project_dir=cfg.output_dir,
-    )
-
-    accelerator.init_trackers(
-        project_name=cfg.wandb.project,
-        init_kwargs={
-            "wandb": {
-                "entity": cfg.wandb.entity,
-                "name": cfg.job_name,
-                "notes": cfg.wandb.notes,
-                "tags": cfg_to_group(cfg, return_list=True),
-                "dir": cfg.output_dir,
-                "config": cfg.to_dict(),
-                "save_code": False,
-                "job_type": "train_eval",
-                "mode": cfg.wandb.mode if cfg.wandb.mode in ["online", "offline", "disabled"] else "online",
-                "resume": "must" if cfg.resume else None,
-                "id": cfg.wandb.run_id
-                if cfg.wandb.run_id
-                else (get_wandb_run_id_from_filesystem(cfg.output_dir) if cfg.resume else None),
-            }
-        },
-    )
-
-    # Set seed for reproducibility
-    if cfg.seed is not None:
-        accelerate_set_seed(cfg.seed)
-
-    # Setup device - accelerator handles device placement
-    torch.backends.cudnn.benchmark = True
-    torch.backends.cuda.matmul.allow_tf32 = True
-
-    # Create dataset
-    if accelerator.is_main_process:
-        logging.info("Creating dataset")
-    dataset = make_dataset(cfg)
-    print("c")
-    # Create evaluation environment (only on main process)
-    eval_env = None
-    if cfg.eval_freq > 0 and cfg.env is not None and accelerator.is_main_process:
-        logging.info("Creating env")
-        eval_env = make_env(cfg.env, n_envs=cfg.eval.batch_size, use_async_envs=cfg.eval.use_async_envs)
-
-    # Create policy
-    if accelerator.is_main_process:
-        logging.info("Creating policy")
-
-    # Use accelerator's device instead of cfg.policy.device
-    with accelerator.main_process_first():
-        policy = make_policy(
-            cfg=cfg.policy,
-            ds_meta=dataset.meta,
-        )
-
-    # Create optimizer and scheduler
-    if accelerator.is_main_process:
-        logging.info("Creating optimizer and scheduler")
-    optimizer, lr_scheduler = make_optimizer_and_scheduler(cfg, policy)
-
-    step = 0  # number of policy updates
-
-    if cfg.resume:
-        step, optimizer, lr_scheduler = load_training_state(cfg.checkpoint_path, optimizer, lr_scheduler)
-
-    # Prepare dataloader
-    if hasattr(cfg.policy, "drop_n_last_frames"):
-        shuffle = False
-        sampler = EpisodeAwareSampler(
-            dataset.episode_data_index,
-            drop_n_last_frames=cfg.policy.drop_n_last_frames,
-            shuffle=True,
-        )
-    else:
-        shuffle = True
-        sampler = None
-
-    dataloader = torch.utils.data.DataLoader(
-        dataset,
-        num_workers=cfg.num_workers,
-        batch_size=cfg.batch_size,
-        shuffle=shuffle,
-        sampler=sampler,
-        pin_memory=True,
-        drop_last=True,  # Important for distributed training
-    )
-
-    # Prepare for distributed training
-    policy, optimizer, dataloader, lr_scheduler = accelerator.prepare(
-        policy, optimizer, dataloader, lr_scheduler
-    )
-
-    # Log training info (only on main process)
-    if accelerator.is_main_process:
-        num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad)
-        num_total_params = sum(p.numel() for p in policy.parameters())
-
-        logging.info(colored("Output dir:", "yellow", attrs=["bold"]) + f" {cfg.output_dir}")
-        if cfg.env is not None:
-            logging.info(f"{cfg.env.task=}")
-        logging.info(f"{cfg.steps=} ({format_big_number(cfg.steps)})")
-        logging.info(f"{dataset.num_frames=} ({format_big_number(dataset.num_frames)})")
-        logging.info(f"{dataset.num_episodes=}")
-        logging.info(f"{num_learnable_params=} ({format_big_number(num_learnable_params)})")
-        logging.info(f"{num_total_params=} ({format_big_number(num_total_params)})")
-        logging.info(f"Number of processes: {accelerator.num_processes}")
-        logging.info(f"Device: {accelerator.device}")
-        logging.info(f"Mixed precision: {accelerator.mixed_precision}")
-
-    # Create metrics trackers
-    train_metrics = {
-        "loss": AverageMeter("loss", ":.3f"),
-        "grad_norm": AverageMeter("grdn", ":.3f"),
-        "lr": AverageMeter("lr", ":0.1e"),
-        "update_s": AverageMeter("updt_s", ":.3f"),
-        "dataloading_s": AverageMeter("data_s", ":.3f"),
-    }
-
-    train_tracker = MetricsTracker(
-        cfg.batch_size * accelerator.num_processes,  # Account for all processes
-        dataset.num_frames,
-        dataset.num_episodes,
-        train_metrics,
-        initial_step=step,
-    )
-
-    # Training loop
-    policy.train()
-    if accelerator.is_main_process:
-        logging.info("Start offline training on a fixed dataset")
-
-    # Create iterator from dataloader
-    dl_iter = iter(dataloader)
-
-    for current_step in range(step, cfg.steps):
-        start_time = time.perf_counter()
-        # Get next batch, cycling through dataloader if needed
-        try:
-            batch = next(dl_iter)
-            print("data laoder batch keys: ", batch.keys())
-            breakpoint()
-        except StopIteration:
-            dl_iter = iter(dataloader)
-            batch = next(dl_iter)
-        train_tracker.dataloading_s = time.perf_counter() - start_time
-        # Update policy
-        train_tracker, output_dict = update_policy(
-            train_tracker,
-            policy,
-            batch,
-            optimizer,
-            cfg.optimizer.grad_clip_norm,
-            accelerator,
-            lr_scheduler=lr_scheduler,
-        )
-
-        # Increment step counter
-        step += 1
-        train_tracker.step()
-
-        # Determine if we should log, save, or evaluate
-        is_log_step = cfg.log_freq > 0 and step % cfg.log_freq == 0
-        is_saving_step = step % cfg.save_freq == 0 or step == cfg.steps
-        is_eval_step = cfg.eval_freq > 0 and step % cfg.eval_freq == 0
-
-        # Logging (only on main process)
-        if is_log_step and accelerator.is_main_process:
-            logging.info(train_tracker)
-            wandb_log_dict = train_tracker.to_dict()
-            if output_dict:
-                wandb_log_dict.update(output_dict)
-            for k, v in wandb_log_dict.items():
-                accelerator.log({f"{'train'}/{k}": v}, step=step)
-            train_tracker.reset_averages()
-
-        # Checkpointing (only on main process)
-        if cfg.save_checkpoint and is_saving_step:
-            # ✅ all processes wait here
-            accelerator.wait_for_everyone()
-
-            if accelerator.is_main_process:
-                logging.info(f"Checkpoint policy after step {step}")
-                checkpoint_dir = get_step_checkpoint_dir(cfg.output_dir, cfg.steps, step)
-
-                unwrapped_policy = accelerator.unwrap_model(policy)
-                save_checkpoint(checkpoint_dir, step, cfg, unwrapped_policy, optimizer, lr_scheduler)
-                update_last_checkpoint(checkpoint_dir)
-
-            # ✅ all processes sync again after saving
-            accelerator.wait_for_everyone()
-
-            # if wandb_logger:
-            #     wandb_logger.log_policy(checkpoint_dir)
-
-        # Evaluation (only on main process)
-        if cfg.env and is_eval_step and accelerator.is_main_process:
-            step_id = get_step_identifier(step, cfg.steps)
-            logging.info(f"Eval policy at step {step}")
-
-            # Unwrap model for evaluation
-            unwrapped_policy = accelerator.unwrap_model(policy)
-            unwrapped_policy.eval()
-
-            with torch.no_grad():
-                eval_info = eval_policy(
-                    eval_env,
-                    unwrapped_policy,
-                    cfg.eval.n_episodes,
-                    videos_dir=cfg.output_dir / "eval" / f"videos_step_{step_id}",
-                    max_episodes_rendered=4,
-                    start_seed=cfg.seed,
-                )
-
-            eval_metrics = {
-                "avg_sum_reward": AverageMeter("∑rwrd", ":.3f"),
-                "pc_success": AverageMeter("success", ":.1f"),
-                "eval_s": AverageMeter("eval_s", ":.3f"),
-            }
-            eval_tracker = MetricsTracker(
-                cfg.batch_size * accelerator.num_processes,
-                dataset.num_frames,
-                dataset.num_episodes,
-                eval_metrics,
-                initial_step=step,
-            )
-            eval_tracker.eval_s = eval_info["aggregated"].pop("eval_s")
-            eval_tracker.avg_sum_reward = eval_info["aggregated"].pop("avg_sum_reward")
-            eval_tracker.pc_success = eval_info["aggregated"].pop("pc_success")
-            logging.info(eval_tracker)
-
-            wandb_log_dict = {**eval_tracker.to_dict(), **eval_info}
-            for k, v in wandb_log_dict.items():
-                accelerator.log({f"{'eval'}/{k}": v}, step=step)
-
-            # Set back to training mode
-            policy.train()
-
-    # Wait for all processes to finish
-    accelerator.wait_for_everyone()
-
-    # Cleanup
-    if eval_env and accelerator.is_main_process:
-        eval_env.close()
-
-    if accelerator.is_main_process:
-        logging.info("End of training")
-        accelerator.end_training()  # added by jade
-
-
-if __name__ == "__main__":
-    init_logging()
-    train()

From 978412346315244e56e2ca40097ff3dcd8c43fbe Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 12:18:35 +0000
Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lerobot/envs/libero.py                       | 5 ++---
 src/lerobot/policies/smolvla/modeling_smolvla.py | 2 +-
 src/lerobot/scripts/eval.py                      | 5 ++---
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/lerobot/envs/libero.py b/src/lerobot/envs/libero.py
index 0672f583f..5d0d37c83 100644
--- a/src/lerobot/envs/libero.py
+++ b/src/lerobot/envs/libero.py
@@ -4,10 +4,9 @@ import logging
 import math
 import os
 from collections import defaultdict
-from collections.abc import Callable
-from itertools import chain
-from typing import Any, Dict, List
 from collections.abc import Callable, Iterable, Mapping, Sequence
+from itertools import chain
+from typing import Any
 
 import gymnasium as gym
 import numpy as np
diff --git a/src/lerobot/policies/smolvla/modeling_smolvla.py b/src/lerobot/policies/smolvla/modeling_smolvla.py
index 6bf956aa3..18f2fc58a 100644
--- a/src/lerobot/policies/smolvla/modeling_smolvla.py
+++ b/src/lerobot/policies/smolvla/modeling_smolvla.py
@@ -938,4 +938,4 @@ class VLAFlowMatching(nn.Module):
         suffix_out = suffix_out[:, -self.config.chunk_size :]
         suffix_out = suffix_out.to(dtype=torch.float32)
         v_t = self.action_out_proj(suffix_out)
-        return v_t
\ No newline at end of file
+        return v_t
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index bd131f708..4777f57e3 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -52,14 +52,13 @@ import logging
 import threading
 import time
 from collections import defaultdict
-from collections.abc import Callable
+from collections.abc import Callable, Iterator
 from contextlib import nullcontext
 from copy import deepcopy
 from dataclasses import asdict
 from pathlib import Path
 from pprint import pformat
-from typing import Dict, List, Tuple, TypedDict
-from collections.abc import Iterator
+from typing import TypedDict
 
 import einops
 import gymnasium as gym

From 8567ab60d840e39fcec8c713a7bf4f3ad5388a4c Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Thu, 11 Sep 2025 14:24:06 +0200
Subject: [PATCH 09/12] remove unces

---
 src/lerobot/datasets/lerobot_dataset.py |  3 +-
 src/lerobot/envs/utils.py               | 53 -------------------------
 src/lerobot/scripts/eval.py             |  1 -
 3 files changed, 2 insertions(+), 55 deletions(-)

diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index 875608c59..10b4e731e 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -471,7 +471,8 @@ class LeRobotDataset(torch.utils.data.Dataset):
         if self.episodes is not None and self.meta._version >= packaging.version.parse("v2.1"):
             episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes]
             self.stats = aggregate_stats(episodes_stats)
-
+        
+        # Load actual data
         try:
             if force_cache_sync:
                 raise FileNotFoundError
diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py
index d65f7f29f..94109bf8b 100644
--- a/src/lerobot/envs/utils.py
+++ b/src/lerobot/envs/utils.py
@@ -80,59 +80,6 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten
 
     return return_observations
 
-
-def preprocess_observation1(
-    observations: dict[str, np.ndarray], cfg: dict[str, Any] = None
-) -> dict[str, Tensor]:
-    # TODO(aliberts, rcadene): refactor this to use features from the environment (no hardcoding)
-    """Convert environment observation to LeRobot format observation.
-    Args:
-        observation: Dictionary of observation batches from a Gym vector environment.
-    Returns:
-        Dictionary of observation batches with keys renamed to LeRobot format and values as tensors.
-    """
-    # map to expected inputs for the policy
-    return_observations = {}
-    image_key = list(cfg.image_features.keys())[0] if cfg else "observation.image"
-    state_key = cfg.robot_state_feature_key if cfg else "observation.state"
-    if "pixels" in observations:
-        if isinstance(observations["pixels"], dict):
-            # imgs = {f"{image_key}.{key}": img for key, img in observations["pixels"].items()}
-            imgs = observations["pixels"]  # keys should be OBS_IMAGE, OBS_IMAGE_2, OBS_IMAGE_3
-        else:
-            imgs = {f"{image_key}": observations["pixels"]}
-
-        for imgkey, img in imgs.items():
-            # TODO(aliberts, rcadene): use transforms.ToTensor()?
-            img = torch.from_numpy(img)
-
-            # sanity check that images are channel last
-            _, h, w, c = img.shape
-            assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}"
-
-            # sanity check that images are uint8
-            assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}"
-
-            # convert to channel first of type float32 in range [0,1]
-            img = einops.rearrange(img, "b h w c -> b c h w").contiguous()
-            img = img.type(torch.float32)
-            img /= 255
-
-            return_observations[imgkey] = img
-
-    if "environment_state" in observations:
-        return_observations["observation.environment_state"] = torch.from_numpy(
-            observations["environment_state"]
-        ).float()
-
-    # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing
-    # requirement for "agent_pos"
-    return_observations[state_key] = torch.from_numpy(observations["agent_pos"]).float()
-    if "task" in observations:
-        return_observations["task"] = observations["task"]
-    return return_observations
-
-
 def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
     # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is
     # (need to also refactor preprocess_observation and externalize normalization from policies)
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 4777f57e3..2bc928117 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -155,7 +155,6 @@ def rollout(
     while not np.all(done) and step < max_steps:
         # Numpy array to tensor and changing dictionary keys to LeRobot policy format.
         observation = preprocess_observation(observation)
-        # observation = preprocess_observation1(observation)
         if return_observations:
             all_observations.append(deepcopy(observation))
 

From f2530570e0352a1b57cea42a685daec04733bb98 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 12:25:13 +0000
Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lerobot/datasets/lerobot_dataset.py | 2 +-
 src/lerobot/envs/utils.py               | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lerobot/datasets/lerobot_dataset.py b/src/lerobot/datasets/lerobot_dataset.py
index 10b4e731e..a869cb920 100644
--- a/src/lerobot/datasets/lerobot_dataset.py
+++ b/src/lerobot/datasets/lerobot_dataset.py
@@ -471,7 +471,7 @@ class LeRobotDataset(torch.utils.data.Dataset):
         if self.episodes is not None and self.meta._version >= packaging.version.parse("v2.1"):
             episodes_stats = [self.meta.episodes_stats[ep_idx] for ep_idx in self.episodes]
             self.stats = aggregate_stats(episodes_stats)
-        
+
         # Load actual data
         try:
             if force_cache_sync:
diff --git a/src/lerobot/envs/utils.py b/src/lerobot/envs/utils.py
index 94109bf8b..4aabe7932 100644
--- a/src/lerobot/envs/utils.py
+++ b/src/lerobot/envs/utils.py
@@ -80,6 +80,7 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten
 
     return return_observations
 
+
 def env_to_policy_features(env_cfg: EnvConfig) -> dict[str, PolicyFeature]:
     # TODO(aliberts, rcadene): remove this hardcoding of keys and just use the nested keys as is
     # (need to also refactor preprocess_observation and externalize normalization from policies)

From d09b2a28af6879bd118325f300a31685f2482198 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Thu, 11 Sep 2025 14:28:46 +0200
Subject: [PATCH 11/12] remove

---
 .../policies/smolvla/smolvlm_with_expert.py   |  1 -
 src/lerobot/scripts/eval.py                   | 42 -------------------
 src/lerobot/scripts/train.py                  | 29 -------------
 3 files changed, 72 deletions(-)

diff --git a/src/lerobot/policies/smolvla/smolvlm_with_expert.py b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
index 3b78c99e6..f3d1a693a 100644
--- a/src/lerobot/policies/smolvla/smolvlm_with_expert.py
+++ b/src/lerobot/policies/smolvla/smolvlm_with_expert.py
@@ -78,7 +78,6 @@ class SmolVLMWithExpertModel(nn.Module):
                 model_id,
                 device_map="auto",
                 torch_dtype="bfloat16",
-                # torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
             )
             config = self.vlm.config
diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 2bc928117..5e8d63f09 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -465,48 +465,6 @@ def _compile_episode_data(
 
     return data_dict
 
-
-from lerobot.datasets.lerobot_dataset import LeRobotDatasetMetadata
-from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
-
-
-def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
-    """Recreate normalization layers with proper stats from the dataset."""
-    from lerobot.policies.normalize import Normalize, Unnormalize
-
-    # Convert numpy stats to the format expected by normalization layers
-    stats = {}
-    for key, stat_dict in dataset_meta.stats.items():
-        stats[key] = {
-            stat_type: torch.from_numpy(stat_array) if isinstance(stat_array, np.ndarray) else stat_array
-            for stat_type, stat_array in stat_dict.items()
-        }
-
-    # Recreate normalization layers with proper stats
-    normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
-
-    normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
-
-    unnormalize_outputs = Unnormalize(
-        policy.config.output_features, policy.config.normalization_mapping, stats
-    )
-
-    # Replace the normalization layers on the policy
-    policy.normalize_inputs = normalize_inputs
-    policy.normalize_targets = normalize_targets
-    policy.unnormalize_outputs = unnormalize_outputs
-
-    print("Normalization layers recreated with dataset stats.")
-
-
-def load_smolvla(cfg, dataset_repo: str, policy):
-    from lerobot.datasets.lerobot_dataset import LeRobotDataset
-
-    dataset = LeRobotDataset(dataset_repo, root="/raid/jade/.cache/huggingface/datasets/")
-    _inject_normalization_stats(policy=policy, dataset_meta=dataset.meta)  # only needed if stats are missing
-    return policy.to("cuda"), dataset
-
-
 @parser.wrap()
 def eval_main(cfg: EvalPipelineConfig):
     logging.info(pformat(asdict(cfg)))
diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py
index fc8be4ebc..3aea697d0 100644
--- a/src/lerobot/scripts/train.py
+++ b/src/lerobot/scripts/train.py
@@ -105,35 +105,6 @@ def update_policy(
     train_metrics.update_s = time.perf_counter() - start_time
     return train_metrics, output_dict
 
-
-# def _inject_normalization_stats(policy: SmolVLAPolicy, dataset_meta: LeRobotDatasetMetadata):
-#     """Recreate normalization layers with dataset stats if missing (Adil's workaround)."""
-#     from lerobot.policies.normalize import Normalize, Unnormalize
-
-#     if not hasattr(dataset_meta, "stats") or not dataset_meta.stats:
-#         print("⚠️ Dataset has no stats, skipping normalization injection.")
-#         return
-
-#     stats = {}
-#     for key, stat_dict in dataset_meta.stats.items():
-#         stats[key] = {
-#             stat_type: torch.as_tensor(stat_array)
-#             if isinstance(stat_array, np.ndarray)
-#             else stat_array
-#             for stat_type, stat_array in stat_dict.items()
-#         }
-
-#     normalize_inputs = Normalize(policy.config.input_features, policy.config.normalization_mapping, stats)
-#     normalize_targets = Normalize(policy.config.output_features, policy.config.normalization_mapping, stats)
-#     unnormalize_outputs = Unnormalize(policy.config.output_features, policy.config.normalization_mapping, stats)
-
-#     policy.normalize_inputs = normalize_inputs
-#     policy.normalize_targets = normalize_targets
-#     policy.unnormalize_outputs = unnormalize_outputs
-
-#     print("✅ Normalization layers injected with dataset stats.")
-
-
 @parser.wrap()
 def train(cfg: TrainPipelineConfig):
     cfg.validate()

From 8fe977118b1ee4d661bb8762b0f5c1e5137fa36e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 12:30:08 +0000
Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lerobot/scripts/eval.py  | 1 +
 src/lerobot/scripts/train.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/lerobot/scripts/eval.py b/src/lerobot/scripts/eval.py
index 5e8d63f09..38a825777 100644
--- a/src/lerobot/scripts/eval.py
+++ b/src/lerobot/scripts/eval.py
@@ -465,6 +465,7 @@ def _compile_episode_data(
 
     return data_dict
 
+
 @parser.wrap()
 def eval_main(cfg: EvalPipelineConfig):
     logging.info(pformat(asdict(cfg)))
diff --git a/src/lerobot/scripts/train.py b/src/lerobot/scripts/train.py
index 3aea697d0..2656ee6a2 100644
--- a/src/lerobot/scripts/train.py
+++ b/src/lerobot/scripts/train.py
@@ -105,6 +105,7 @@ def update_policy(
     train_metrics.update_s = time.perf_counter() - start_time
     return train_metrics, output_dict
 
+
 @parser.wrap()
 def train(cfg: TrainPipelineConfig):
     cfg.validate()